diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h
index e2c4830cb964..aa3688718ae7 100644
--- a/include/os/freebsd/spl/sys/sdt.h
+++ b/include/os/freebsd/spl/sys/sdt.h
@@ -1,46 +1,46 @@
 /*
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _OPENSOLARIS_SYS_SDT_H_
 #define	_OPENSOLARIS_SYS_SDT_H_
 
 #include_next <sys/sdt.h>
 #ifdef KDTRACE_HOOKS
-/* BEGIN CSTYLED */
 SDT_PROBE_DECLARE(sdt, , , set__error);
 
+/* BEGIN CSTYLED */
 /*
  * Fire the set__error SDT probe with the error value, then yield that
  * value as the result of the expression.  The argument is captured in a
  * local exactly once so that an argument with side effects (for example
  * a function call) is not evaluated a second time for the probe.
  */
 #define	SET_ERROR(err)	({						\
 	__typeof__(err) zfs_set_error_val_ = (err);			\
 	SDT_PROBE1(sdt, , , set__error, (uintptr_t)zfs_set_error_val_);	\
 	zfs_set_error_val_;						\
 })
 /* END CSTYLED */
 #else
 #define	SET_ERROR(err) (err)
 #endif
 
 #endif	/* _OPENSOLARIS_SYS_SDT_H_ */
diff --git a/lib/libspl/atomic.c b/lib/libspl/atomic.c
index 8cc350710ba0..f61f5fcc47f5 100644
--- a/lib/libspl/atomic.c
+++ b/lib/libspl/atomic.c
@@ -1,400 +1,372 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License, Version 1.0 only
  * (the "License").  You may not use this file except in compliance
  * with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2009 by Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #include <atomic.h>
 
 /*
  * These are the void returning variants
  */
 #define	ATOMIC_INC(name, type) \
 	void atomic_inc_##name(volatile type *target)			\
 	{								\
 		(void) __atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST);	\
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_INC(8, uint8_t)
 ATOMIC_INC(16, uint16_t)
 ATOMIC_INC(32, uint32_t)
 ATOMIC_INC(64, uint64_t)
 ATOMIC_INC(uchar, uchar_t)
 ATOMIC_INC(ushort, ushort_t)
 ATOMIC_INC(uint, uint_t)
 ATOMIC_INC(ulong, ulong_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_DEC(name, type) \
 	void atomic_dec_##name(volatile type *target)			\
 	{								\
 		(void) __atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST);	\
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_DEC(8, uint8_t)
 ATOMIC_DEC(16, uint16_t)
 ATOMIC_DEC(32, uint32_t)
 ATOMIC_DEC(64, uint64_t)
 ATOMIC_DEC(uchar, uchar_t)
 ATOMIC_DEC(ushort, ushort_t)
 ATOMIC_DEC(uint, uint_t)
 ATOMIC_DEC(ulong, ulong_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_ADD(name, type1, type2) \
 	void atomic_add_##name(volatile type1 *target, type2 bits)	\
 	{								\
 		(void) __atomic_add_fetch(target, bits, __ATOMIC_SEQ_CST); \
 	}
 
 /*
  * Atomically add `bits' to the pointer value stored at `target'.
  * The updated value is discarded.
  */
 void
 atomic_add_ptr(volatile void *target, ssize_t bits)
 {
 	void **slot = (void **)target;
 
 	(void) __atomic_add_fetch(slot, bits, __ATOMIC_SEQ_CST);
 }
 
-/* BEGIN CSTYLED */
 ATOMIC_ADD(8, uint8_t, int8_t)
 ATOMIC_ADD(16, uint16_t, int16_t)
 ATOMIC_ADD(32, uint32_t, int32_t)
 ATOMIC_ADD(64, uint64_t, int64_t)
 ATOMIC_ADD(char, uchar_t, signed char)
 ATOMIC_ADD(short, ushort_t, short)
 ATOMIC_ADD(int, uint_t, int)
 ATOMIC_ADD(long, ulong_t, long)
-/* END CSTYLED */
 
 
 #define	ATOMIC_SUB(name, type1, type2) \
 	void atomic_sub_##name(volatile type1 *target, type2 bits)	\
 	{								\
 		(void) __atomic_sub_fetch(target, bits, __ATOMIC_SEQ_CST); \
 	}
 
 /*
  * Atomically subtract `bits' from the pointer value stored at `target'.
  * The updated value is discarded.
  */
 void
 atomic_sub_ptr(volatile void *target, ssize_t bits)
 {
 	void **slot = (void **)target;
 
 	(void) __atomic_sub_fetch(slot, bits, __ATOMIC_SEQ_CST);
 }
 
-/* BEGIN CSTYLED */
 ATOMIC_SUB(8, uint8_t, int8_t)
 ATOMIC_SUB(16, uint16_t, int16_t)
 ATOMIC_SUB(32, uint32_t, int32_t)
 ATOMIC_SUB(64, uint64_t, int64_t)
 ATOMIC_SUB(char, uchar_t, signed char)
 ATOMIC_SUB(short, ushort_t, short)
 ATOMIC_SUB(int, uint_t, int)
 ATOMIC_SUB(long, ulong_t, long)
-/* END CSTYLED */
 
 
 #define	ATOMIC_OR(name, type) \
 	void atomic_or_##name(volatile type *target, type bits)		\
 	{								\
 		(void) __atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_OR(8, uint8_t)
 ATOMIC_OR(16, uint16_t)
 ATOMIC_OR(32, uint32_t)
 ATOMIC_OR(64, uint64_t)
 ATOMIC_OR(uchar, uchar_t)
 ATOMIC_OR(ushort, ushort_t)
 ATOMIC_OR(uint, uint_t)
 ATOMIC_OR(ulong, ulong_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_AND(name, type) \
 	void atomic_and_##name(volatile type *target, type bits)	\
 	{								\
 		(void) __atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_AND(8, uint8_t)
 ATOMIC_AND(16, uint16_t)
 ATOMIC_AND(32, uint32_t)
 ATOMIC_AND(64, uint64_t)
 ATOMIC_AND(uchar, uchar_t)
 ATOMIC_AND(ushort, ushort_t)
 ATOMIC_AND(uint, uint_t)
 ATOMIC_AND(ulong, ulong_t)
-/* END CSTYLED */
 
 
 /*
  * New value returning variants
  */
 
 #define	ATOMIC_INC_NV(name, type) \
 	type atomic_inc_##name##_nv(volatile type *target)		\
 	{								\
 		return (__atomic_add_fetch(target, 1, __ATOMIC_SEQ_CST)); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_INC_NV(8, uint8_t)
 ATOMIC_INC_NV(16, uint16_t)
 ATOMIC_INC_NV(32, uint32_t)
 ATOMIC_INC_NV(64, uint64_t)
 ATOMIC_INC_NV(uchar, uchar_t)
 ATOMIC_INC_NV(ushort, ushort_t)
 ATOMIC_INC_NV(uint, uint_t)
 ATOMIC_INC_NV(ulong, ulong_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_DEC_NV(name, type) \
 	type atomic_dec_##name##_nv(volatile type *target)		\
 	{								\
 		return (__atomic_sub_fetch(target, 1, __ATOMIC_SEQ_CST)); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_DEC_NV(8, uint8_t)
 ATOMIC_DEC_NV(16, uint16_t)
 ATOMIC_DEC_NV(32, uint32_t)
 ATOMIC_DEC_NV(64, uint64_t)
 ATOMIC_DEC_NV(uchar, uchar_t)
 ATOMIC_DEC_NV(ushort, ushort_t)
 ATOMIC_DEC_NV(uint, uint_t)
 ATOMIC_DEC_NV(ulong, ulong_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_ADD_NV(name, type1, type2) \
 	type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits) \
 	{								\
 		return (__atomic_add_fetch(target, bits, __ATOMIC_SEQ_CST)); \
 	}
 
 /*
  * Atomically add `bits' to the pointer value stored at `target' and
  * return the new pointer value.
  */
 void *
 atomic_add_ptr_nv(volatile void *target, ssize_t bits)
 {
 	void **slot = (void **)target;
 	void *updated = __atomic_add_fetch(slot, bits, __ATOMIC_SEQ_CST);
 
 	return (updated);
 }
 
-/* BEGIN CSTYLED */
 ATOMIC_ADD_NV(8, uint8_t, int8_t)
 ATOMIC_ADD_NV(16, uint16_t, int16_t)
 ATOMIC_ADD_NV(32, uint32_t, int32_t)
 ATOMIC_ADD_NV(64, uint64_t, int64_t)
 ATOMIC_ADD_NV(char, uchar_t, signed char)
 ATOMIC_ADD_NV(short, ushort_t, short)
 ATOMIC_ADD_NV(int, uint_t, int)
 ATOMIC_ADD_NV(long, ulong_t, long)
-/* END CSTYLED */
 
 
 #define	ATOMIC_SUB_NV(name, type1, type2) \
 	type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits) \
 	{								\
 		return (__atomic_sub_fetch(target, bits, __ATOMIC_SEQ_CST)); \
 	}
 
 /*
  * Atomically subtract `bits' from the pointer value stored at `target'
  * and return the new pointer value.
  */
 void *
 atomic_sub_ptr_nv(volatile void *target, ssize_t bits)
 {
 	void **slot = (void **)target;
 	void *updated = __atomic_sub_fetch(slot, bits, __ATOMIC_SEQ_CST);
 
 	return (updated);
 }
 
-/* BEGIN CSTYLED */
 ATOMIC_SUB_NV(8, uint8_t, int8_t)
 ATOMIC_SUB_NV(char, uchar_t, signed char)
 ATOMIC_SUB_NV(16, uint16_t, int16_t)
 ATOMIC_SUB_NV(short, ushort_t, short)
 ATOMIC_SUB_NV(32, uint32_t, int32_t)
 ATOMIC_SUB_NV(int, uint_t, int)
 ATOMIC_SUB_NV(long, ulong_t, long)
 ATOMIC_SUB_NV(64, uint64_t, int64_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_OR_NV(name, type) \
 	type atomic_or_##name##_nv(volatile type *target, type bits)	\
 	{								\
 		return (__atomic_or_fetch(target, bits, __ATOMIC_SEQ_CST)); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_OR_NV(8, uint8_t)
 ATOMIC_OR_NV(16, uint16_t)
 ATOMIC_OR_NV(32, uint32_t)
 ATOMIC_OR_NV(64, uint64_t)
 ATOMIC_OR_NV(uchar, uchar_t)
 ATOMIC_OR_NV(ushort, ushort_t)
 ATOMIC_OR_NV(uint, uint_t)
 ATOMIC_OR_NV(ulong, ulong_t)
-/* END CSTYLED */
 
 
 #define	ATOMIC_AND_NV(name, type) \
 	type atomic_and_##name##_nv(volatile type *target, type bits)	\
 	{								\
 		return (__atomic_and_fetch(target, bits, __ATOMIC_SEQ_CST)); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_AND_NV(8, uint8_t)
 ATOMIC_AND_NV(16, uint16_t)
 ATOMIC_AND_NV(32, uint32_t)
 ATOMIC_AND_NV(64, uint64_t)
 ATOMIC_AND_NV(uchar, uchar_t)
 ATOMIC_AND_NV(ushort, ushort_t)
 ATOMIC_AND_NV(uint, uint_t)
 ATOMIC_AND_NV(ulong, ulong_t)
-/* END CSTYLED */
 
 
 /*
  * If *tgt == exp, set *tgt = des; return old value
  *
  * This may not look right on the first pass (or the sixteenth), but,
  * from https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html:
  * > If they are not equal, the operation is a read
  * > and the current contents of *ptr are written into *expected.
  * And, in the converse case, exp is already *target by definition.
  */
 
 #define	ATOMIC_CAS(name, type) \
 	type atomic_cas_##name(volatile type *target, type exp, type des) \
 	{								\
 		__atomic_compare_exchange_n(target, &exp, des, B_FALSE,	\
 		    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);		\
 		return (exp);						\
 	}
 
 /*
  * Pointer compare-and-swap: if *target equals `exp', store `des'.
  * Returns the value that was in *target before the operation.  On a
  * failed comparison the builtin writes the current contents into the
  * local copy of the expected value, which is what gets returned.
  */
 void *
 atomic_cas_ptr(volatile void *target, void *exp, void *des)
 {
 	void **slot = (void **)target;
 	void *seen = exp;
 
 	(void) __atomic_compare_exchange_n(slot, &seen, des, B_FALSE,
 	    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
 
 	return (seen);
 }
 
-/* BEGIN CSTYLED */
 ATOMIC_CAS(8, uint8_t)
 ATOMIC_CAS(16, uint16_t)
 ATOMIC_CAS(32, uint32_t)
 ATOMIC_CAS(64, uint64_t)
 ATOMIC_CAS(uchar, uchar_t)
 ATOMIC_CAS(ushort, ushort_t)
 ATOMIC_CAS(uint, uint_t)
 ATOMIC_CAS(ulong, ulong_t)
-/* END CSTYLED */
 
 
 /*
  * Swap target and return old value
  */
 
 #define	ATOMIC_SWAP(name, type) \
 	type atomic_swap_##name(volatile type *target, type bits)	\
 	{								\
 		return (__atomic_exchange_n(target, bits, __ATOMIC_SEQ_CST)); \
 	}
 
-/* BEGIN CSTYLED */
 ATOMIC_SWAP(8, uint8_t)
 ATOMIC_SWAP(16, uint16_t)
 ATOMIC_SWAP(32, uint32_t)
 ATOMIC_SWAP(64, uint64_t)
 ATOMIC_SWAP(uchar, uchar_t)
 ATOMIC_SWAP(ushort, ushort_t)
 ATOMIC_SWAP(uint, uint_t)
 ATOMIC_SWAP(ulong, ulong_t)
-/* END CSTYLED */
 
 /*
  * Atomically store `bits' into the pointer at `target' and return the
  * pointer value that was there before.
  */
 void *
 atomic_swap_ptr(volatile void *target, void *bits)
 {
 	void **slot = (void **)target;
 	void *previous = __atomic_exchange_n(slot, bits, __ATOMIC_SEQ_CST);
 
 	return (previous);
 }
 
 #ifndef _LP64
 /*
  * Read the 64-bit value at `target' as a single atomic load with
  * relaxed memory ordering.
  */
 uint64_t
 atomic_load_64(volatile uint64_t *target)
 {
 	uint64_t value = __atomic_load_n(target, __ATOMIC_RELAXED);
 
 	return (value);
 }
 
 /*
  * Write `bits' to the 64-bit location at `target' as a single atomic
  * store with relaxed memory ordering.
  *
  * The store builtin yields no value, so it is issued as a plain
  * statement: ISO C does not permit `return' with an expression in a
  * function whose return type is void.
  */
 void
 atomic_store_64(volatile uint64_t *target, uint64_t bits)
 {
 	__atomic_store_n(target, bits, __ATOMIC_RELAXED);
 }
 #endif
 
 /*
  * Atomically set bit number `value' in *target.
  * Returns 0 if the bit was previously clear, -1 if it was already set.
  */
 int
 atomic_set_long_excl(volatile ulong_t *target, uint_t value)
 {
 	const ulong_t mask = 1UL << value;
 
 	if (__atomic_fetch_or(target, mask, __ATOMIC_SEQ_CST) & mask)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * Atomically clear bit number `value' in *target.
  * Returns 0 if the bit was previously set, -1 if it was already clear.
  */
 int
 atomic_clear_long_excl(volatile ulong_t *target, uint_t value)
 {
 	const ulong_t mask = 1UL << value;
 
 	if (__atomic_fetch_and(target, ~mask, __ATOMIC_SEQ_CST) & mask)
 		return (0);
 
 	return (-1);
 }
 
 /*
  * Memory barrier entry points.  Each one issues a single thread fence
  * through the compiler's __atomic_thread_fence() builtin; they differ
  * only in the memory order requested.
  */
 
 /* Sequentially consistent fence. */
 void
 membar_enter(void)
 {
 	__atomic_thread_fence(__ATOMIC_SEQ_CST);
 }
 
 /* Sequentially consistent fence. */
 void
 membar_exit(void)
 {
 	__atomic_thread_fence(__ATOMIC_SEQ_CST);
 }
 
 /* Sequentially consistent fence. */
 void
 membar_sync(void)
 {
 	__atomic_thread_fence(__ATOMIC_SEQ_CST);
 }
 
 /* Release fence. */
 void
 membar_producer(void)
 {
 	__atomic_thread_fence(__ATOMIC_RELEASE);
 }
 
 /* Acquire fence. */
 void
 membar_consumer(void)
 {
 	__atomic_thread_fence(__ATOMIC_ACQUIRE);
 }
diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c
index 887f7d32df4a..9034873474fe 100644
--- a/module/nvpair/nvpair.c
+++ b/module/nvpair/nvpair.c
@@ -1,3797 +1,3795 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
  * Copyright 2018 RackTop Systems.
  */
 
 /*
  * Links to Illumos.org for more information on Interface Libraries:
  * [1] https://illumos.org/man/3lib/libnvpair
  * [2] https://illumos.org/man/3nvpair/nvlist_alloc
  * [3] https://illumos.org/man/9f/nvlist_alloc
  * [4] https://illumos.org/man/9f/nvlist_next_nvpair
  * [5] https://illumos.org/man/9f/nvpair_value_byte
  */
 
 #include <sys/debug.h>
 #include <sys/isa_defs.h>
 #include <sys/nvpair.h>
 #include <sys/nvpair_impl.h>
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/string.h>
 #include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <sys/mod.h>
 
 #if defined(_KERNEL)
 #include <sys/sunddi.h>
 #include <sys/sysmacros.h>
 #else
 #include <stdarg.h>
 #include <stdlib.h>
 #include <stddef.h>
 #endif
 
 #define	skip_whitespace(p)	while ((*(p) == ' ') || (*(p) == '\t')) (p)++
 
 /*
  * nvpair.c - Provides kernel & userland interfaces for manipulating
  *	name-value pairs.
  *
  * Overview Diagram
  *
  *  +--------------+
  *  |  nvlist_t    |
  *  |--------------|
  *  | nvl_version  |
  *  | nvl_nvflag   |
  *  | nvl_priv    -+-+
  *  | nvl_flag     | |
  *  | nvl_pad      | |
  *  +--------------+ |
  *                   V
  *      +--------------+      last i_nvp in list
  *      | nvpriv_t     |  +--------------------->
  *      |--------------|  |
  *   +--+- nvp_list    |  |   +------------+
  *   |  |  nvp_last   -+--+   + nv_alloc_t |
  *   |  |  nvp_curr    |      |------------|
  *   |  |  nvp_nva    -+----> | nva_ops    |
  *   |  |  nvp_stat    |      | nva_arg    |
  *   |  +--------------+      +------------+
  *   |
  *   +-------+
  *           V
  *   +---------------------+      +-------------------+
  *   |  i_nvp_t            |  +-->|  i_nvp_t          |  +-->
  *   |---------------------|  |   |-------------------|  |
  *   | nvi_next           -+--+   | nvi_next         -+--+
  *   | nvi_prev (NULL)     | <----+ nvi_prev          |
  *   | . . . . . . . . . . |      | . . . . . . . . . |
  *   | nvp (nvpair_t)      |      | nvp (nvpair_t)    |
  *   |  - nvp_size         |      |  - nvp_size       |
  *   |  - nvp_name_sz      |      |  - nvp_name_sz    |
  *   |  - nvp_value_elem   |      |  - nvp_value_elem |
  *   |  - nvp_type         |      |  - nvp_type       |
  *   |  - data ...         |      |  - data ...       |
  *   +---------------------+      +-------------------+
  *
  *
  *
  *   +---------------------+              +---------------------+
  *   |  i_nvp_t            |  +-->    +-->|  i_nvp_t (last)     |
  *   |---------------------|  |       |   |---------------------|
  *   |  nvi_next          -+--+ ... --+   | nvi_next (NULL)     |
  * <-+- nvi_prev           |<-- ...  <----+ nvi_prev            |
  *   | . . . . . . . . .   |              | . . . . . . . . .   |
  *   | nvp (nvpair_t)      |              | nvp (nvpair_t)      |
  *   |  - nvp_size         |              |  - nvp_size         |
  *   |  - nvp_name_sz      |              |  - nvp_name_sz      |
  *   |  - nvp_value_elem   |              |  - nvp_value_elem   |
  *   |  - DATA_TYPE_NVLIST |              |  - nvp_type         |
  *   |  - data (embedded)  |              |  - data ...         |
  *   |    nvlist name      |              +---------------------+
  *   |  +--------------+   |
  *   |  |  nvlist_t    |   |
  *   |  |--------------|   |
  *   |  | nvl_version  |   |
  *   |  | nvl_nvflag   |   |
  *   |  | nvl_priv   --+---+---->
  *   |  | nvl_flag     |   |
  *   |  | nvl_pad      |   |
  *   |  +--------------+   |
  *   +---------------------+
  *
  *
  * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
  * allow value to be aligned on 8 byte boundary
  *
  * name_len is the length of the name string including the null terminator
  * so it must be >= 1
  */
 #define	NVP_SIZE_CALC(name_len, data_len) \
 	(NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
 
 static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
 static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
     uint_t nelem, const void *data);
 
 #define	NV_STAT_EMBEDDED	0x1
 #define	EMBEDDED_NVL(nvp)	((nvlist_t *)(void *)NVP_VALUE(nvp))
 #define	EMBEDDED_NVL_ARRAY(nvp)	((nvlist_t **)(void *)NVP_VALUE(nvp))
 
 #define	NVP_VALOFF(nvp)	(NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
 #define	NVPAIR2I_NVP(nvp) \
 	((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
 
 #ifdef _KERNEL
 static const int nvpair_max_recursion = 20;
 #else
 static const int nvpair_max_recursion = 100;
 #endif
 
 static const uint64_t nvlist_hashtable_init_size = (1 << 4);
 
 /*
  * Bind the operations vector `nvo' to the allocator handle `nva', clear
  * its private argument, and run the optional nv_ao_init hook with the
  * caller's variable arguments.  Returns the hook's result, or 0 when
  * the vector has no init hook.
  */
 int
 nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
 {
 	va_list ap;
 	int rc;
 
 	nva->nva_arg = NULL;
 	nva->nva_ops = nvo;
 
 	va_start(ap, nvo);
 	rc = (nvo->nv_ao_init == NULL) ? 0 : nvo->nv_ao_init(nva, ap);
 	va_end(ap);
 
 	return (rc);
 }
 
 /*
  * Invoke the allocator's nv_ao_reset hook, if it provides one.
  */
 void
 nv_alloc_reset(nv_alloc_t *nva)
 {
 	const nv_alloc_ops_t *ops = nva->nva_ops;
 
 	if (ops->nv_ao_reset == NULL)
 		return;
 
 	ops->nv_ao_reset(nva);
 }
 
 /*
  * Invoke the allocator's nv_ao_fini hook, if it provides one.
  */
 void
 nv_alloc_fini(nv_alloc_t *nva)
 {
 	const nv_alloc_ops_t *ops = nva->nva_ops;
 
 	if (ops->nv_ao_fini == NULL)
 		return;
 
 	ops->nv_ao_fini(nva);
 }
 
 /*
  * Return the allocator recorded in the list's private data, or NULL
  * when `nvl' is NULL or has no private data attached.
  */
 nv_alloc_t *
 nvlist_lookup_nv_alloc(nvlist_t *nvl)
 {
 	nvpriv_t *priv;
 
 	if (nvl == NULL)
 		return (NULL);
 
 	priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 
 	return (priv == NULL ? NULL : priv->nvp_nva);
 }
 
 /*
  * Allocate `size' bytes through the list's allocator and zero them.
  * Returns NULL if the allocator fails.
  */
 static void *
 nv_mem_zalloc(nvpriv_t *nvp, size_t size)
 {
 	nv_alloc_t *nva = nvp->nvp_nva;
 	void *buf = nva->nva_ops->nv_ao_alloc(nva, size);
 
 	if (buf == NULL)
 		return (NULL);
 
 	memset(buf, 0, size);
 	return (buf);
 }
 
 /*
  * Hand `buf' (of `size' bytes) back to the list's allocator.
  */
 static void
 nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
 {
 	nvp->nvp_nva->nva_ops->nv_ao_free(nvp->nvp_nva, buf, size);
 }
 
 /*
  * Reset every byte of `priv' to zero, then record the allocator and the
  * status flags.
  */
 static void
 nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
 {
 	memset(priv, 0, sizeof (*priv));
 
 	priv->nvp_stat = stat;
 	priv->nvp_nva = nva;
 }
 
 /*
  * Allocate and initialize the private data for a top-level list.
  * Returns NULL if the allocator fails.
  */
 static nvpriv_t *
 nv_priv_alloc(nv_alloc_t *nva)
 {
 	/*
 	 * nv_mem_zalloc() cannot be called here because it needs an
 	 * existing priv argument; go to the allocator ops directly.
 	 */
 	nvpriv_t *priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t));
 
 	if (priv != NULL)
 		nv_priv_init(priv, nva, 0);
 
 	return (priv);
 }
 
 /*
  * An embedded list cannot share its parent's nvpriv_t.  Build a fresh
  * one from the parent's allocator and mark it NV_STAT_EMBEDDED.
  * Returns NULL if the allocation fails.
  */
 static nvpriv_t *
 nv_priv_alloc_embedded(nvpriv_t *priv)
 {
 	nvpriv_t *child = nv_mem_zalloc(priv, sizeof (nvpriv_t));
 
 	if (child != NULL)
 		nv_priv_init(child, priv->nvp_nva, NV_STAT_EMBEDDED);
 
 	return (child);
 }
 
 /*
  * Give `priv' a zeroed hash table with `buckets' slots.  The list must
  * not already have a table.  Returns 0 on success or ENOMEM.
  */
 static int
 nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
 {
 	i_nvp_t **tab;
 
 	ASSERT3P(priv->nvp_hashtable, ==, NULL);
 	ASSERT0(priv->nvp_nbuckets);
 	ASSERT0(priv->nvp_nentries);
 
 	tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
 	if (tab != NULL) {
 		priv->nvp_hashtable = tab;
 		priv->nvp_nbuckets = buckets;
 		return (0);
 	}
 
 	return (ENOMEM);
 }
 
 /*
  * Release the list's hash table, if it has one, and reset the bucket
  * and entry counters.  The entries themselves are not freed here.
  */
 static void
 nvt_tab_free(nvpriv_t *priv)
 {
 	i_nvp_t **tab = priv->nvp_hashtable;
 
 	if (tab != NULL) {
 		nv_mem_free(priv, tab,
 		    priv->nvp_nbuckets * sizeof (i_nvp_t *));
 
 		priv->nvp_hashtable = NULL;
 		priv->nvp_nbuckets = 0;
 		priv->nvp_nentries = 0;
 		return;
 	}
 
 	/* No table: the counters must already be zero. */
 	ASSERT0(priv->nvp_nbuckets);
 	ASSERT0(priv->nvp_nentries);
 }
 
 /*
  * Hash a NUL-terminated name.  Each character is folded in by shifting
  * the running value left four bits and adding the character; whenever
  * the top nibble becomes non-zero it is XORed back in (shifted right
  * by 24) and then cleared.
  */
 static uint32_t
 nvt_hash(const char *p)
 {
 	uint32_t hval = 0;
 
 	for (; *p != '\0'; p++) {
 		uint32_t top;
 
 		hval = (hval << 4) + *p;
 		top = hval & 0xf0000000;
 		if (top != 0)
 			hval ^= top >> 24;
 		hval &= ~top;
 	}
 
 	return (hval);
 }
 
 /*
  * Decide whether two pairs count as "the same" under the list's
  * uniqueness flags: with NV_UNIQUE_NAME_TYPE both name and type must
  * agree, otherwise only the name is compared.
  */
 static boolean_t
 nvt_nvpair_match(const nvpair_t *nvp1, const nvpair_t *nvp2, uint32_t nvflag)
 {
 	if (!(nvflag & NV_UNIQUE_NAME_TYPE)) {
 		ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
 		if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
 			return (B_TRUE);
 		return (B_FALSE);
 	}
 
 	if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) != 0)
 		return (B_FALSE);
 	if (NVP_TYPE(nvp1) != NVP_TYPE(nvp2))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Look up a pair by name in the list's hash table.  When `type' is
  * DATA_TYPE_DONTCARE any type matches; otherwise the pair's type must
  * equal `type'.  Returns the first match in the bucket chain, or NULL.
  */
 static nvpair_t *
 nvt_lookup_name_type(const nvlist_t *nvl, const char *name, data_type_t type)
 {
 	const nvpriv_t *priv = (const nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	ASSERT(priv != NULL);
 
 	i_nvp_t **tab = priv->nvp_hashtable;
 
 	/* No table means nothing has been added yet. */
 	if (tab == NULL) {
 		ASSERT3P(priv->nvp_list, ==, NULL);
 		ASSERT0(priv->nvp_nbuckets);
 		ASSERT0(priv->nvp_nentries);
 		return (NULL);
 	}
 	ASSERT(priv->nvp_nbuckets != 0);
 
 	uint64_t hash = nvt_hash(name);
 	uint64_t index = hash & (priv->nvp_nbuckets - 1);
 	ASSERT3U(index, <, priv->nvp_nbuckets);
 
 	i_nvp_t *e = tab[index];
 	while (e != NULL) {
 		nvpair_t *cand = &e->nvi_nvp;
 
 		if (strcmp(NVP_NAME(cand), name) == 0 &&
 		    (type == DATA_TYPE_DONTCARE || NVP_TYPE(cand) == type))
 			return (cand);
 
 		e = e->nvi_hashtable_next;
 	}
 
 	return (NULL);
 }
 
 /*
  * Look up a pair by name alone, accepting any data type.
  */
 static nvpair_t *
 nvt_lookup_name(const nvlist_t *nvl, const char *name)
 {
 	nvpair_t *found = nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE);
 
 	return (found);
 }
 
 /*
  * Rebuild the hash table with `new_size' buckets (asserted to be a
  * power of two).  Returns ENOMEM, leaving the existing table in place,
  * if the new table cannot be allocated; returns 0 otherwise.
  */
 static int
 nvt_resize(nvpriv_t *priv, uint32_t new_size)
 {
 	i_nvp_t **tab = priv->nvp_hashtable;
 
 	/*
 	 * Migrate all the entries from the current table
 	 * to a newly-allocated table with the new size by
 	 * re-adjusting the pointers of their entries.
 	 */
 	uint32_t size = priv->nvp_nbuckets;
 	uint32_t new_mask = new_size - 1;
 	ASSERT(ISP2(new_size));
 
 	i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
 	if (new_tab == NULL)
 		return (ENOMEM);
 
 	uint32_t nentries = 0;
 	for (uint32_t i = 0; i < size; i++) {
 		i_nvp_t *next, *e = tab[i];
 
 		while (e != NULL) {
 			/* save the successor before relinking e */
 			next = e->nvi_hashtable_next;
 
 			uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
 			uint32_t index = hash & new_mask;
 
 			/* push e onto the head of its new bucket */
 			e->nvi_hashtable_next = new_tab[index];
 			new_tab[index] = e;
 			nentries++;
 
 			e = next;
 		}
 		tab[i] = NULL;
 	}
 	ASSERT3U(nentries, ==, priv->nvp_nentries);
 
 	/* frees the old table and zeroes the counters ... */
 	nvt_tab_free(priv);
 
 	/* ... so they are re-established for the new table here */
 	priv->nvp_hashtable = new_tab;
 	priv->nvp_nbuckets = new_size;
 	priv->nvp_nentries = nentries;
 
 	return (0);
 }
 
 /*
  * The table should grow once it holds more entries than buckets,
  * provided that doubling the bucket count still fits in 32 bits.
  */
 static boolean_t
 nvt_needs_togrow(nvpriv_t *priv)
 {
 	if (priv->nvp_nentries <= priv->nvp_nbuckets)
 		return (B_FALSE);
 
 	if (priv->nvp_nbuckets > (UINT32_MAX >> 1))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Double the number of buckets: a table twice as large is allocated
  * and every entry is relinked into it.  Returns the nvt_resize() result.
  */
 static int
 nvt_grow(nvpriv_t *priv)
 {
 	uint32_t nbuckets = priv->nvp_nbuckets;
 
 	/* doubling must not overflow 32 bits */
 	ASSERT3U(UINT32_MAX >> 1, >=, nbuckets);
 
 	return (nvt_resize(priv, nbuckets << 1));
 }
 
 /*
  * The table should shrink when it holds no more than a quarter as many
  * entries as buckets, unless it is already at the minimum size
  * (nvlist_hashtable_init_size).
  */
 static boolean_t
 nvt_needs_toshrink(nvpriv_t *priv)
 {
 	ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
 
 	if (priv->nvp_nbuckets != nvlist_hashtable_init_size &&
 	    priv->nvp_nentries <= (priv->nvp_nbuckets >> 2))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Halve the number of buckets: a table half as large is allocated and
  * every entry is relinked into it.  Returns the nvt_resize() result.
  */
 static int
 nvt_shrink(nvpriv_t *priv)
 {
 	uint32_t nbuckets = priv->nvp_nbuckets;
 
 	/* the table must currently be at least the initial size */
 	ASSERT3U(nbuckets, >=, nvlist_hashtable_init_size);
 
 	return (nvt_resize(priv, nbuckets >> 1));
 }
 
 /*
  * Unlink from the hash table the first entry that matches `nvp' under
  * the list's uniqueness flags (see nvt_nvpair_match()).  The table is
  * shrunk first when it has become sparse; a failure to shrink is
  * returned to the caller.  Returns 0 otherwise, including when no
  * matching entry was found.  Only the hash chain is touched here.
  */
 static int
 nvt_remove_nvpair(nvlist_t *nvl, const nvpair_t *nvp)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 
 	if (nvt_needs_toshrink(priv)) {
 		int err = nvt_shrink(priv);
 		if (err != 0)
 			return (err);
 	}
 	/* read the table pointer only after a possible resize */
 	i_nvp_t **tab = priv->nvp_hashtable;
 
 	const char *name = NVP_NAME(nvp);
 	uint64_t hash = nvt_hash(name);
 	uint64_t index = hash & (priv->nvp_nbuckets - 1);
 
 	ASSERT3U(index, <, priv->nvp_nbuckets);
 	i_nvp_t *bucket = tab[index];
 
 	for (i_nvp_t *prev = NULL, *e = bucket;
 	    e != NULL; prev = e, e = e->nvi_hashtable_next) {
 		if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_nvflag)) {
 			if (prev != NULL) {
 				/* e is in the middle or at the tail */
 				prev->nvi_hashtable_next =
 				    e->nvi_hashtable_next;
 			} else {
 				/* e is the bucket head */
 				ASSERT3P(e, ==, bucket);
 				tab[index] = e->nvi_hashtable_next;
 			}
 			e->nvi_hashtable_next = NULL;
 			priv->nvp_nentries--;
 			break;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Insert `nvp' into the list's hash table, allocating the table on
  * first use.  When the list carries a uniqueness flag, any entry that
  * matches `nvp' is first unlinked from the table.  The table is grown
  * when it already holds more entries than buckets.  Returns 0 on
  * success or the error from the allocation, removal or grow step.
  */
 static int
 nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 
 	/* initialize nvpair table now if it doesn't exist. */
 	if (priv->nvp_hashtable == NULL) {
 		int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
 		if (err != 0)
 			return (err);
 	}
 
 	/*
 	 * if we don't allow duplicate entries, make sure to
 	 * unlink any existing entries from the table.
 	 */
 	if (nvl->nvl_nvflag != 0) {
 		int err = nvt_remove_nvpair(nvl, nvp);
 		if (err != 0)
 			return (err);
 	}
 
 	if (nvt_needs_togrow(priv)) {
 		int err = nvt_grow(priv);
 		if (err != 0)
 			return (err);
 	}
 	/* read the table pointer only after any resize above */
 	i_nvp_t **tab = priv->nvp_hashtable;
 
 	const char *name = NVP_NAME(nvp);
 	uint64_t hash = nvt_hash(name);
 	uint64_t index = hash & (priv->nvp_nbuckets - 1);
 
 	ASSERT3U(index, <, priv->nvp_nbuckets);
 	// cppcheck-suppress nullPointerRedundantCheck
 	i_nvp_t *bucket = tab[index];
 
 	/* insert link at the beginning of the bucket */
 	i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
 	ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
 	new_entry->nvi_hashtable_next = bucket;
 	// cppcheck-suppress nullPointerRedundantCheck
 	tab[index] = new_entry;
 
 	priv->nvp_nentries++;
 	return (0);
 }
 
 /*
  * Fill in the header of a freshly allocated list: current version, the
  * uniqueness bits of `nvflag' (all other bits are dropped), the private
  * data pointer, and zeroed flag/pad fields.
  */
 static void
 nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
 {
 	const uint32_t uniq_mask = NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE;
 
 	nvl->nvl_version = NV_VERSION;
 	nvl->nvl_nvflag = nvflag & uniq_mask;
 	nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
 	nvl->nvl_pad = 0;
 	nvl->nvl_flag = 0;
 }
 
 /*
  * Return the uniqueness flags stored in the list header.
  */
 uint_t
 nvlist_nvflag(nvlist_t *nvl)
 {
 	uint_t flags = nvl->nvl_nvflag;
 
 	return (flags);
 }
 
 /*
  * Map a kmem flag to one of the predefined allocators.  In the kernel
  * KM_SLEEP and KM_NOSLEEP select their namesakes and anything else
  * selects the pushpage allocator; outside the kernel the flag is
  * ignored and the nosleep allocator is always returned.
  */
 static nv_alloc_t *
 nvlist_nv_alloc(int kmflag)
 {
 #if defined(_KERNEL)
 	if (kmflag == KM_SLEEP)
 		return (nv_alloc_sleep);
 	if (kmflag == KM_NOSLEEP)
 		return (nv_alloc_nosleep);
 
 	return (nv_alloc_pushpage);
 #else
 	(void) kmflag;
 	return (nv_alloc_nosleep);
 #endif /* _KERNEL */
 }
 
 /*
  * nvlist_alloc - Allocate an nvlist using the predefined allocator
  * selected by `kmflag'.  Returns the nvlist_xalloc() result.
  */
 int
 nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
 {
 	nv_alloc_t *nva = nvlist_nv_alloc(kmflag);
 
 	return (nvlist_xalloc(nvlp, nvflag, nva));
 }
 
 /*
  * Allocate an empty nvlist with the caller-supplied allocator.
  *
  * Returns EINVAL if either pointer argument is NULL, ENOMEM if the
  * private data or the list header cannot be allocated, and 0 on
  * success with the new list stored in *nvlp.  When the header
  * allocation fails, *nvlp is left NULL and the private data is freed.
  */
 int
 nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
 {
 	nvpriv_t *priv;
 	nvlist_t *nvl;
 
 	if (nvlp == NULL || nva == NULL)
 		return (EINVAL);
 
 	priv = nv_priv_alloc(nva);
 	if (priv == NULL)
 		return (ENOMEM);
 
 	nvl = nv_mem_zalloc(priv, NV_ALIGN(sizeof (nvlist_t)));
 	*nvlp = nvl;
 	if (nvl == NULL) {
 		nv_mem_free(priv, priv, sizeof (nvpriv_t));
 		return (ENOMEM);
 	}
 
 	nvlist_init(nvl, nvflag, priv);
 
 	return (0);
 }
 
 /*
  * nvp_buf_alloc - Allocate a zeroed i_nvp_t large enough to hold an
  * nvpair of `len' bytes, record `len' as the pair's size, and return a
  * pointer to the embedded nvpair.  Returns NULL on allocation failure.
  */
 static nvpair_t *
 nvp_buf_alloc(nvlist_t *nvl, size_t len)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	size_t total = len + offsetof(i_nvp_t, nvi_nvp);
 	i_nvp_t *wrapper = nv_mem_zalloc(priv, total);
 
 	if (wrapper == NULL)
 		return (NULL);
 
 	wrapper->nvi_nvp.nvp_size = len;
 
 	return (&wrapper->nvi_nvp);
 }
 
 /*
  * nvp_buf_free - Release the i_nvp_t that wraps `nvp'.  The size
  * handed to the allocator is the pair's recorded size plus the wrapper
  * header that precedes it.
  */
 static void
 nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	i_nvp_t *wrapper = NVPAIR2I_NVP(nvp);
 	size_t total = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
 
 	nv_mem_free(priv, wrapper, total);
 }
 
 /*
  * nvp_buf_link - Append a new nv pair to the tail of the list's
  * doubly linked chain.
  */
 static void
 nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	i_nvp_t *curr = NVPAIR2I_NVP(nvp);
 
 	if (priv->nvp_list != NULL) {
 		i_nvp_t *tail = priv->nvp_last;
 
 		curr->nvi_prev = tail;
 		tail->nvi_next = curr;
 		priv->nvp_last = curr;
 		return;
 	}
 
 	/* Empty list: the new element is both head and tail. */
 	priv->nvp_last = curr;
 	priv->nvp_list = curr;
 }
 
 /*
  * nvp_buf_unlink - Unlink a removed nvpair from the list's doubly
  * linked chain.  The pair's buffer is not freed here.
  */
 static void
 nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	i_nvp_t *curr = NVPAIR2I_NVP(nvp);
 	i_nvp_t *before = curr->nvi_prev;
 	i_nvp_t *after = curr->nvi_next;
 
 	/*
 	 * If the iteration cursor sits on this element, advance it so
 	 * nvlist_next_nvpair() never walks onto freed memory.
 	 */
 	if (priv->nvp_curr == curr)
 		priv->nvp_curr = after;
 
 	/* Fix the forward link (or the head pointer). */
 	if (priv->nvp_list == curr)
 		priv->nvp_list = after;
 	else
 		before->nvi_next = after;
 
 	/* Fix the backward link (or the tail pointer). */
 	if (priv->nvp_last == curr)
 		priv->nvp_last = before;
 	else
 		after->nvi_prev = before;
 }
 
 /*
  * Take an nvpair type and a number of elements and make sure they are
  * valid together: DATA_TYPE_BOOLEAN carries no elements, scalar types
  * carry exactly one, and array types may carry any count including
  * zero.  Returns 0 when the combination is valid and EINVAL otherwise,
  * including for a type not listed below.
  */
 static int
 i_validate_type_nelem(data_type_t type, uint_t nelem)
 {
 	switch (type) {
 	case DATA_TYPE_BOOLEAN:
 		if (nelem != 0)
 			return (EINVAL);
 		break;
 	case DATA_TYPE_BOOLEAN_VALUE:
 	case DATA_TYPE_BYTE:
 	case DATA_TYPE_INT8:
 	case DATA_TYPE_UINT8:
 	case DATA_TYPE_INT16:
 	case DATA_TYPE_UINT16:
 	case DATA_TYPE_INT32:
 	case DATA_TYPE_UINT32:
 	case DATA_TYPE_INT64:
 	case DATA_TYPE_UINT64:
 	case DATA_TYPE_STRING:
 	case DATA_TYPE_HRTIME:
 	case DATA_TYPE_NVLIST:
 #if !defined(_KERNEL)
 	/* DATA_TYPE_DOUBLE is only accepted outside the kernel */
 	case DATA_TYPE_DOUBLE:
 #endif
 		if (nelem != 1)
 			return (EINVAL);
 		break;
 	case DATA_TYPE_BOOLEAN_ARRAY:
 	case DATA_TYPE_BYTE_ARRAY:
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 	case DATA_TYPE_INT16_ARRAY:
 	case DATA_TYPE_UINT16_ARRAY:
 	case DATA_TYPE_INT32_ARRAY:
 	case DATA_TYPE_UINT32_ARRAY:
 	case DATA_TYPE_INT64_ARRAY:
 	case DATA_TYPE_UINT64_ARRAY:
 	case DATA_TYPE_STRING_ARRAY:
 	case DATA_TYPE_NVLIST_ARRAY:
 		/* we allow arrays with 0 elements */
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Verify nvp_name_sz and the name string: the size must be positive
  * and fit inside the pair, the byte at the end of the name field must
  * be NUL, and the string length must be exactly nvp_name_sz - 1.
  * Returns 0 if all checks pass, EFAULT otherwise.
  */
 static int
 i_validate_nvpair_name(nvpair_t *nvp)
 {
 	const char *name;
 
 	if (nvp->nvp_name_sz <= 0)
 		return (EFAULT);
 	if (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0))
 		return (EFAULT);
 
 	/* the name field must end in a terminator ... */
 	name = NVP_NAME(nvp);
 	if (name[nvp->nvp_name_sz - 1] != '\0')
 		return (EFAULT);
 
 	/* ... and must not contain an earlier one */
 	if (strlen(name) != nvp->nvp_name_sz - 1)
 		return (EFAULT);
 
 	return (0);
 }
 
 /*
  * Validate the payload of boolean-valued pairs: every boolean_t in a
  * DATA_TYPE_BOOLEAN_VALUE or DATA_TYPE_BOOLEAN_ARRAY must be exactly
  * B_TRUE or B_FALSE.  All other types are accepted unchecked.
  * Returns 0 if the value is acceptable, EINVAL otherwise.
  *
  * The array index is unsigned to match `nelem', so the loop cannot
  * overflow a signed counter when nelem exceeds INT_MAX.
  */
 static int
 i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
 {
 	switch (type) {
 	case DATA_TYPE_BOOLEAN_VALUE:
 		if (*(const boolean_t *)data != B_TRUE &&
 		    *(const boolean_t *)data != B_FALSE)
 			return (EINVAL);
 		break;
 	case DATA_TYPE_BOOLEAN_ARRAY: {
 		const boolean_t *vals = data;
 		uint_t i;
 
 		for (i = 0; i < nelem; i++)
 			if (vals[i] != B_TRUE && vals[i] != B_FALSE)
 				return (EINVAL);
 		break;
 	}
 	default:
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * This function takes a pointer to what should be an nvpair and verifies
  * that its fields make sense and can be trusted.  It is used when decoding
  * packed nvpairs.
  *
  * The checks run in this order:
  *  1. the name (see i_validate_nvpair_name());
  *  2. the type, the element count and the value size, which must match the
  *     space the pair reserves after the value offset once aligned;
  *  3. the value contents (see i_validate_nvpair_value()).
  *
  * The contents are examined last on purpose: the boolean checks walk
  * NVP_NELEM(nvp) elements, so the element count must already have been
  * shown to be consistent with nvp_size before that walk happens.
  * NOTE(review): this assumes the caller has bounded nvp_size by the buffer
  * being decoded -- confirm at the call sites.
  *
  * Returns 0 if the pair is valid and EFAULT otherwise.
  */
 static int
 i_validate_nvpair(nvpair_t *nvp)
 {
 	data_type_t type = NVP_TYPE(nvp);
 	int size1, size2;
 
 	/* verify nvp_name_sz, check the name string length */
 	if (i_validate_nvpair_name(nvp) != 0)
 		return (EFAULT);
 
 	/*
 	 * verify nvp_type, nvp_value_elem, and also possibly
 	 * verify string values and get the value size.
 	 */
 	size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
 	size1 = nvp->nvp_size - NVP_VALOFF(nvp);
 	if (size2 < 0 || size1 != NV_ALIGN(size2))
 		return (EFAULT);
 
 	/* only now is it reasonable to inspect the value itself */
 	if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
 		return (EFAULT);
 
 	return (0);
 }
 
 /*
  * Add a copy of every pair in 'snvl' to 'dnvl', in list order, by calling
  * nvlist_add_common() for each one.  Returns EINVAL if 'snvl' has no private
  * data, the first non-zero result from nvlist_add_common() if an add fails,
  * and 0 otherwise.  Pairs added before a failure are left in 'dnvl'.
  */
 static int
 nvlist_copy_pairs(const nvlist_t *snvl, nvlist_t *dnvl)
 {
 	const nvpriv_t *spriv = (const nvpriv_t *)(uintptr_t)snvl->nvl_priv;
 	const i_nvp_t *node;
 
 	if (spriv == NULL)
 		return (EINVAL);
 
 	node = spriv->nvp_list;
 	while (node != NULL) {
 		const nvpair_t *pair = &node->nvi_nvp;
 		int rc;
 
 		rc = nvlist_add_common(dnvl, NVP_NAME(pair), NVP_TYPE(pair),
 		    NVP_NELEM(pair), NVP_VALUE(pair));
 		if (rc != 0)
 			return (rc);
 
 		node = node->nvi_next;
 	}
 
 	return (0);
 }
 
 /*
  * Release whatever an nvpair owns beyond its own buffer: the embedded list
  * of a DATA_TYPE_NVLIST pair, or every non-NULL list of a
  * DATA_TYPE_NVLIST_ARRAY pair.  Pairs of other types own nothing extra.
  * The nvpair buffer itself is not freed here.
  */
 static void
 nvpair_free(nvpair_t *nvp)
 {
 	data_type_t kind = NVP_TYPE(nvp);
 
 	if (kind == DATA_TYPE_NVLIST) {
 		nvlist_free(EMBEDDED_NVL(nvp));
 	} else if (kind == DATA_TYPE_NVLIST_ARRAY) {
 		nvlist_t **lists = EMBEDDED_NVL_ARRAY(nvp);
 		int idx = 0;
 
 		while (idx < NVP_NELEM(nvp)) {
 			if (lists[idx] != NULL)
 				nvlist_free(lists[idx]);
 			idx++;
 		}
 	}
 }
 
 /*
  * nvlist_free - free an unpacked nvlist
  *
  * Does nothing when 'nvl' is NULL or has no private data.  Otherwise every
  * pair on the list is released, then the nvlist_t itself (unless it is
  * flagged NV_STAT_EMBEDDED, in which case only its nvl_priv is cleared),
  * and finally the lookup table and the private structure.
  */
 void
 nvlist_free(nvlist_t *nvl)
 {
 	nvpriv_t *priv;
 	i_nvp_t *node, *next;
 
 	if (nvl == NULL)
 		return;
 
 	priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	if (priv == NULL)
 		return;
 
 	/* unpacked pairs are chained together through their i_nvp_t */
 	for (node = priv->nvp_list; node != NULL; node = next) {
 		nvpair_t *pair = &node->nvi_nvp;
 
 		/* pick up the successor before this pair is handed back */
 		next = node->nvi_next;
 
 		nvpair_free(pair);
 		nvp_buf_free(nvl, pair);
 	}
 
 	if (priv->nvp_stat & NV_STAT_EMBEDDED)
 		nvl->nvl_priv = 0;
 	else
 		nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
 
 	nvt_tab_free(priv);
 	nv_mem_free(priv, priv, sizeof (nvpriv_t));
 }
 
 /*
  * Report whether 'nvp' is one of the pairs currently linked on 'nvl'.
  * The list is walked and compared by address.  Returns 1 when found and 0
  * otherwise (a NULL 'nvp' is never found).
  */
 static int
 nvlist_contains_nvp(const nvlist_t *nvl, const nvpair_t *nvp)
 {
 	const nvpriv_t *priv = (const nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	const i_nvp_t *node;
 
 	if (nvp != NULL) {
 		node = priv->nvp_list;
 		while (node != NULL) {
 			if (nvp == &node->nvi_nvp)
 				return (1);
 			node = node->nvi_next;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Make a copy of an nvlist.
  *
  * Convenience form of nvlist_xdup() that uses the allocator returned by
  * nvlist_nv_alloc(kmflag).  The new list is stored in '*nvlp' on success;
  * the return value is whatever nvlist_xdup() returns.
  */
 int
 nvlist_dup(const nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
 {
 	return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag)));
 }
 
 /*
  * Duplicate 'nvl' into a newly allocated list created with allocator 'nva'
  * and the same nvl_nvflag as the source.
  *
  * Returns EINVAL if 'nvl' or 'nvlp' is NULL, otherwise the error from
  * nvlist_xalloc() or nvlist_copy_pairs() if either fails.  On success 0 is
  * returned and '*nvlp' receives the copy; on failure '*nvlp' is not written
  * and any partially built copy is freed.
  */
 int
 nvlist_xdup(const nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
 {
 	nvlist_t *copy;
 	int rc;
 
 	if (nvl == NULL || nvlp == NULL)
 		return (EINVAL);
 
 	rc = nvlist_xalloc(&copy, nvl->nvl_nvflag, nva);
 	if (rc != 0)
 		return (rc);
 
 	rc = nvlist_copy_pairs(nvl, copy);
 	if (rc != 0) {
 		nvlist_free(copy);
 		return (rc);
 	}
 
 	*nvlp = copy;
 	return (0);
 }
 
 /*
  * Remove every pair whose name matches 'name'.
  *
  * Returns EINVAL for a NULL list, a NULL name or a list without private
  * data; ENOENT if no pair had that name; 0 if at least one was removed.
  */
 int
 nvlist_remove_all(nvlist_t *nvl, const char *name)
 {
 	nvpair_t *match;
 	int removed = 0;
 
 	if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
 		return (EINVAL);
 
 	for (;;) {
 		match = nvt_lookup_name(nvl, name);
 		if (match == NULL)
 			break;
 
 		/* the pair was just looked up, so removal must succeed */
 		VERIFY0(nvlist_remove_nvpair(nvl, match));
 		removed = 1;
 	}
 
 	return (removed ? 0 : ENOENT);
 }
 
 /*
  * Remove the pair that nvt_lookup_name_type() finds for 'name' and 'type'.
  *
  * Returns EINVAL for a NULL list, a NULL name or a list without private
  * data; ENOENT if there is no such pair; otherwise the result of
  * nvlist_remove_nvpair().
  */
 int
 nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
 {
 	nvpair_t *match;
 
 	if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
 		return (EINVAL);
 
 	match = nvt_lookup_name_type(nvl, name, type);
 
 	return (match != NULL ? nvlist_remove_nvpair(nvl, match) : ENOENT);
 }
 
 /*
  * Remove the pair 'nvp' from 'nvl' and free it.
  *
  * The pair is first taken out of the lookup table; if that fails its error
  * is returned and nothing else is touched.  Otherwise the pair is unlinked
  * from the list, anything it owns is freed, and its buffer is released.
  * Returns EINVAL if either argument is NULL and 0 on success.
  */
 int
 nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
 {
 	int rc;
 
 	if (nvl == NULL || nvp == NULL)
 		return (EINVAL);
 
 	rc = nvt_remove_nvpair(nvl, nvp);
 	if (rc == 0) {
 		nvp_buf_unlink(nvl, nvp);
 		nvpair_free(nvp);
 		nvp_buf_free(nvl, nvp);
 	}
 
 	return (rc);
 }
 
 /*
  * This function calculates the size of an nvpair value.
  *
  * The data argument controls the behavior in case of the data types
  *	DATA_TYPE_STRING	and
  *	DATA_TYPE_STRING_ARRAY
  * If data == NULL then the size of the string(s) is excluded.
  *
  * Returns the size in bytes, or -1 if the type/nelem combination is invalid,
  * a string array contains a NULL entry, or the size exceeds INT32_MAX.
  */
 static int
 i_get_value_size(data_type_t type, const void *data, uint_t nelem)
 {
 	const uint64_t count = nelem;
 	uint64_t total;
 
 	if (i_validate_type_nelem(type, nelem) != 0)
 		return (-1);
 
 	/* work out how many bytes are needed to hold the value */
 	switch (type) {
 	case DATA_TYPE_BOOLEAN:
 		total = 0;
 		break;
 	case DATA_TYPE_BOOLEAN_VALUE:
 		total = sizeof (boolean_t);
 		break;
 	case DATA_TYPE_BYTE:
 		total = sizeof (uchar_t);
 		break;
 	case DATA_TYPE_INT8:
 	case DATA_TYPE_UINT8:
 		total = sizeof (uint8_t);
 		break;
 	case DATA_TYPE_INT16:
 	case DATA_TYPE_UINT16:
 		total = sizeof (uint16_t);
 		break;
 	case DATA_TYPE_INT32:
 	case DATA_TYPE_UINT32:
 		total = sizeof (uint32_t);
 		break;
 	case DATA_TYPE_INT64:
 	case DATA_TYPE_UINT64:
 		total = sizeof (uint64_t);
 		break;
 #if !defined(_KERNEL)
 	case DATA_TYPE_DOUBLE:
 		total = sizeof (double);
 		break;
 #endif
 	case DATA_TYPE_HRTIME:
 		total = sizeof (hrtime_t);
 		break;
 	case DATA_TYPE_STRING:
 		/* the terminating NUL is part of the stored value */
 		total = (data == NULL) ? 0 : strlen(data) + 1;
 		break;
 	case DATA_TYPE_NVLIST:
 		total = NV_ALIGN(sizeof (nvlist_t));
 		break;
 
 	case DATA_TYPE_BOOLEAN_ARRAY:
 		total = count * sizeof (boolean_t);
 		break;
 	case DATA_TYPE_BYTE_ARRAY:
 		total = count * sizeof (uchar_t);
 		break;
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 		total = count * sizeof (uint8_t);
 		break;
 	case DATA_TYPE_INT16_ARRAY:
 	case DATA_TYPE_UINT16_ARRAY:
 		total = count * sizeof (uint16_t);
 		break;
 	case DATA_TYPE_INT32_ARRAY:
 	case DATA_TYPE_UINT32_ARRAY:
 		total = count * sizeof (uint32_t);
 		break;
 	case DATA_TYPE_INT64_ARRAY:
 	case DATA_TYPE_UINT64_ARRAY:
 		total = count * sizeof (uint64_t);
 		break;
 	case DATA_TYPE_STRING_ARRAY:
 		/* one 64-bit slot per string for the pointer array ... */
 		total = count * sizeof (uint64_t);
 
 		/* ... followed by the strings, which need no alignment */
 		if (data != NULL) {
 			char *const *strs = data;
 			uint_t k = 0;
 
 			while (k < nelem) {
 				if (strs[k] == NULL)
 					return (-1);
 				total += strlen(strs[k]) + 1;
 				k++;
 			}
 		}
 		break;
 	case DATA_TYPE_NVLIST_ARRAY:
 		/* a 64-bit slot plus an aligned nvlist_t per element */
 		total = count * sizeof (uint64_t) +
 		    count * NV_ALIGN(sizeof (nvlist_t));
 		break;
 	default:
 		return (-1);
 	}
 
 	if (total > INT32_MAX)
 		return (-1);
 
 	return ((int)total);
 }
 
 /*
  * Initialize 'emb_nvl' as an embedded list of 'nvl' and fill it with copies
  * of the pairs in 'onvl'.
  *
  * The private data comes from nv_priv_alloc_embedded() on the private data
  * of 'nvl'; the flags are taken from 'onvl'.  Returns ENOMEM if the private
  * data cannot be allocated, or the error from nvlist_copy_pairs(), in which
  * case 'emb_nvl' is freed and its nvl_priv cleared.  Returns 0 on success.
  */
 static int
 nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
 {
 	nvpriv_t *emb_priv;
 	int rc;
 
 	emb_priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
 	    nvl->nvl_priv);
 	if (emb_priv == NULL)
 		return (ENOMEM);
 
 	nvlist_init(emb_nvl, onvl->nvl_nvflag, emb_priv);
 
 	rc = nvlist_copy_pairs(onvl, emb_nvl);
 	if (rc == 0)
 		return (0);
 
 	nvlist_free(emb_nvl);
 	emb_nvl->nvl_priv = 0;
 
 	return (rc);
 }
 
 /*
  * nvlist_add_common - Add new <name,value> pair to nvlist
  *
  * nvl:   list to add to; must be non-NULL and have private data
  * name:  name of the new pair; must be non-NULL
  * type:  data type of the value
  * nelem: number of elements in the value (see i_validate_type_nelem())
  * data:  the value to copy in; may be NULL only when nelem is 0
  *
  * The value is copied into a newly allocated pair, so 'data' is not
  * referenced after the call.  For nvlist types a copy of each source list
  * is built as an embedded list.
  *
  * Returns 0 on success, EINVAL for invalid arguments, ENOMEM when the pair
  * buffer or an embedded list cannot be allocated, or the error reported
  * while copying an embedded list or inserting into the lookup table.
  */
 static int
 nvlist_add_common(nvlist_t *nvl, const char *name,
     data_type_t type, uint_t nelem, const void *data)
 {
 	nvpair_t *nvp;
 	uint_t i;
 
 	int nvp_sz, name_sz, value_sz;
 	int err = 0;
 
 	if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
 		return (EINVAL);
 
 	if (nelem != 0 && data == NULL)
 		return (EINVAL);
 
 	/*
 	 * Verify type and nelem and get the value size.
 	 * For the data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
 	 * the size of the string(s) is included because data is passed.
 	 */
 	if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
 		return (EINVAL);
 
 	/* boolean values must be exactly B_TRUE or B_FALSE */
 	if (i_validate_nvpair_value(type, nelem, data) != 0)
 		return (EINVAL);
 
 	/*
 	 * If we're adding an nvlist or nvlist array, ensure that we are not
 	 * adding the input nvlist to itself, which would cause recursion,
 	 * and ensure that no NULL nvlist pointers are present.
 	 */
 	switch (type) {
 	case DATA_TYPE_NVLIST:
 		if (data == nvl || data == NULL)
 			return (EINVAL);
 		break;
 	case DATA_TYPE_NVLIST_ARRAY: {
 		nvlist_t **onvlp = (nvlist_t **)data;
 		for (i = 0; i < nelem; i++) {
 			if (onvlp[i] == nvl || onvlp[i] == NULL)
 				return (EINVAL);
 		}
 		break;
 	}
 	default:
 		break;
 	}
 
 	/* calculate sizes of the nvpair elements and the nvpair itself */
 	name_sz = strlen(name) + 1;
 	/* the name size must fit in the positive range of nvp_name_sz */
 	if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * NBBY - 1))
 		return (EINVAL);
 
 	nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
 
 	if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
 		return (ENOMEM);
 
 	/* fill in the pair header and copy the name, NUL included */
 	ASSERT(nvp->nvp_size == nvp_sz);
 	nvp->nvp_name_sz = name_sz;
 	nvp->nvp_value_elem = nelem;
 	nvp->nvp_type = type;
 	memcpy(NVP_NAME(nvp), name, name_sz);
 
 	/* copy the value in; the layout depends on the type */
 	switch (type) {
 	case DATA_TYPE_BOOLEAN:
 		/* no value to store */
 		break;
 	case DATA_TYPE_STRING_ARRAY: {
 		char *const *strs = data;
 		char *buf = NVP_VALUE(nvp);
 		char **cstrs = (void *)buf;
 
 		/* skip pre-allocated space for pointer array */
 		buf += nelem * sizeof (uint64_t);
 		/* pack the strings back to back, pointing each slot at its copy */
 		for (i = 0; i < nelem; i++) {
 			int slen = strlen(strs[i]) + 1;
 			memcpy(buf, strs[i], slen);
 			cstrs[i] = buf;
 			buf += slen;
 		}
 		break;
 	}
 	case DATA_TYPE_NVLIST: {
 		nvlist_t *nnvl = EMBEDDED_NVL(nvp);
 		nvlist_t *onvl = (nvlist_t *)data;
 
 		/* build the copy in place inside the pair's value area */
 		if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
 			nvp_buf_free(nvl, nvp);
 			return (err);
 		}
 		break;
 	}
 	case DATA_TYPE_NVLIST_ARRAY: {
 		nvlist_t **onvlp = (nvlist_t **)data;
 		nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
 		/* the nvlist_t structures follow the array of 64-bit slots */
 		nvlist_t *embedded = (nvlist_t *)
 		    ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
 
 		for (i = 0; i < nelem; i++) {
 			if ((err = nvlist_copy_embedded(nvl,
 			    onvlp[i], embedded)) != 0) {
 				/*
 				 * Free any successfully created lists
 				 */
 				nvpair_free(nvp);
 				nvp_buf_free(nvl, nvp);
 				return (err);
 			}
 
 			/* publish the slot only once its list is complete */
 			nvlp[i] = embedded++;
 		}
 		break;
 	}
 	default:
 		/* every remaining type is a flat copy of value_sz bytes */
 		memcpy(NVP_VALUE(nvp), data, value_sz);
 	}
 
 	/* if unique name, remove before add */
 	if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
 		(void) nvlist_remove_all(nvl, name);
 	else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
 		(void) nvlist_remove(nvl, name, type);
 
 	/* enter the pair in the lookup table, then link it on the list */
 	err = nvt_add_nvpair(nvl, nvp);
 	if (err != 0) {
 		nvpair_free(nvp);
 		nvp_buf_free(nvl, nvp);
 		return (err);
 	}
 	nvp_buf_link(nvl, nvp);
 
 	return (0);
 }
 
 /*
  * Add a valueless DATA_TYPE_BOOLEAN pair called 'name' to 'nvl' (zero
  * elements, no data).  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_boolean(nvlist_t *nvl, const char *name)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_BOOLEAN_VALUE
  * element.  Returns the result of nvlist_add_common(), which rejects values
  * other than B_TRUE and B_FALSE with EINVAL.
  */
 int
 nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_BYTE element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_INT8 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_UINT8 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_INT16 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_UINT16 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_INT32 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_UINT32 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_INT64 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_UINT64 element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
 }
 
 #if !defined(_KERNEL)
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_DOUBLE element.
  * Only compiled when _KERNEL is not defined.  Returns the result of
  * nvlist_add_common().
  */
 int
 nvlist_add_double(nvlist_t *nvl, const char *name, double val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val));
 }
 #endif
 
 /*
  * Add the string 'val' to 'nvl' under 'name' as a single DATA_TYPE_STRING
  * element.  nvlist_add_common() copies the string; because one element is
  * passed, a NULL 'val' makes it return EINVAL.  Returns its result.
  */
 int
 nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
 }
 
 /*
  * Add the 'n' booleans at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_BOOLEAN_ARRAY.  Returns the result of nvlist_add_common(),
  * which rejects elements other than B_TRUE and B_FALSE with EINVAL.
  */
 int
 nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
     const boolean_t *a, uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' bytes at 'a' to 'nvl' under 'name' as a DATA_TYPE_BYTE_ARRAY.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_byte_array(nvlist_t *nvl, const char *name, const uchar_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_INT8_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int8_array(nvlist_t *nvl, const char *name, const int8_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_UINT8_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint8_array(nvlist_t *nvl, const char *name, const uint8_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_INT16_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int16_array(nvlist_t *nvl, const char *name, const int16_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_UINT16_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint16_array(nvlist_t *nvl, const char *name, const uint16_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_INT32_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int32_array(nvlist_t *nvl, const char *name, const int32_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_UINT32_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint32_array(nvlist_t *nvl, const char *name, const uint32_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_INT64_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_int64_array(nvlist_t *nvl, const char *name, const int64_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' elements at 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_UINT64_ARRAY.  Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_uint64_array(nvlist_t *nvl, const char *name, const uint64_t *a,
     uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
 }
 
 /*
  * Add the 'n' strings pointed to by 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_STRING_ARRAY.  nvlist_add_common() copies each string and
  * returns EINVAL if any entry is NULL.  Returns its result.
  */
 int
 nvlist_add_string_array(nvlist_t *nvl, const char *name,
     const char *const *a, uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
 }
 
 /*
  * Add 'val' to 'nvl' under 'name' as a single DATA_TYPE_HRTIME element.
  * Returns the result of nvlist_add_common().
  */
 int
 nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
 }
 
 /*
  * Add a copy of the list 'val' to 'nvl' under 'name' as a DATA_TYPE_NVLIST
  * pair.  nvlist_add_common() returns EINVAL if 'val' is NULL or is 'nvl'
  * itself.  Returns its result.
  */
 int
 nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *val)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
 }
 
 /*
  * Add copies of the 'n' lists pointed to by 'a' to 'nvl' under 'name' as a
  * DATA_TYPE_NVLIST_ARRAY.  nvlist_add_common() returns EINVAL if any entry
  * is NULL or is 'nvl' itself.  Returns its result.
  */
 int
 nvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
     const nvlist_t * const *a, uint_t n)
 {
 	return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
 }
 
 /* reading name-value pairs */
 /*
  * Return the pair that follows 'nvp' on 'nvl', or the first pair when 'nvp'
  * is NULL.  Returns NULL at the end of the list, when 'nvl' is NULL or has
  * no private data, or when 'nvp' is not on this list.  The list's nvp_curr
  * hint is updated to the pair returned.
  */
 nvpair_t *
 nvlist_next_nvpair(nvlist_t *nvl, const nvpair_t *nvp)
 {
 	nvpriv_t *priv;
 	i_nvp_t *given, *succ;
 
 	if (nvl == NULL)
 		return (NULL);
 
 	priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	if (priv == NULL)
 		return (NULL);
 
 	given = NVPAIR2I_NVP(nvp);
 
 	/*
 	 * Make sure nvp really belongs to this nvlist before following it.
 	 * nvp_curr is only a shortcut: when it matches, the full walk of
 	 * the list can be skipped.
 	 */
 	if (nvp == NULL) {
 		succ = priv->nvp_list;
 	} else if (priv->nvp_curr == given ||
 	    nvlist_contains_nvp(nvl, nvp)) {
 		succ = given->nvi_next;
 	} else {
 		succ = NULL;
 	}
 
 	priv->nvp_curr = succ;
 
 	if (succ == NULL)
 		return (NULL);
 
 	return (&succ->nvi_nvp);
 }
 
 /*
  * Return the pair that precedes 'nvp' on 'nvl', or the last pair when 'nvp'
  * is NULL.  Returns NULL at the start of the list, when 'nvl' is NULL or
  * has no private data, or when 'nvp' is not on this list.  The list's
  * nvp_curr hint is updated to the pair returned.
  */
 nvpair_t *
 nvlist_prev_nvpair(nvlist_t *nvl, const nvpair_t *nvp)
 {
 	nvpriv_t *priv;
 	i_nvp_t *given, *pred;
 
 	if (nvl == NULL)
 		return (NULL);
 
 	priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	if (priv == NULL)
 		return (NULL);
 
 	given = NVPAIR2I_NVP(nvp);
 
 	/* nvp_curr lets a matching pair skip the membership walk */
 	if (nvp == NULL) {
 		pred = priv->nvp_last;
 	} else if (priv->nvp_curr == given ||
 	    nvlist_contains_nvp(nvl, nvp)) {
 		pred = given->nvi_prev;
 	} else {
 		pred = NULL;
 	}
 
 	priv->nvp_curr = pred;
 
 	if (pred == NULL)
 		return (NULL);
 
 	return (&pred->nvi_nvp);
 }
 
 /*
  * Report whether 'nvl' holds no pairs.  A NULL list and a list without
  * private data both count as empty.
  */
 boolean_t
 nvlist_empty(const nvlist_t *nvl)
 {
 	const nvpriv_t *priv;
 
 	if (nvl == NULL)
 		return (B_TRUE);
 
 	priv = (const nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	if (priv == NULL)
 		return (B_TRUE);
 
 	return (priv->nvp_list == NULL);
 }
 
 /*
  * Return the name of 'nvp' as given by NVP_NAME().  'nvp' is not checked
  * for NULL.
  */
 const char *
 nvpair_name(const nvpair_t *nvp)
 {
 	return (NVP_NAME(nvp));
 }
 
 /*
  * Return the data type of 'nvp' as given by NVP_TYPE().  'nvp' is not
  * checked for NULL.
  */
 data_type_t
 nvpair_type(const nvpair_t *nvp)
 {
 	return (NVP_TYPE(nvp));
 }
 
 /*
  * Return 1 if the data type of 'nvp' is one of the array types and 0
  * otherwise.
  */
 int
 nvpair_type_is_array(const nvpair_t *nvp)
 {
 	switch (NVP_TYPE(nvp)) {
 	case DATA_TYPE_BYTE_ARRAY:
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 	case DATA_TYPE_INT16_ARRAY:
 	case DATA_TYPE_UINT16_ARRAY:
 	case DATA_TYPE_INT32_ARRAY:
 	case DATA_TYPE_UINT32_ARRAY:
 	case DATA_TYPE_INT64_ARRAY:
 	case DATA_TYPE_UINT64_ARRAY:
 	case DATA_TYPE_BOOLEAN_ARRAY:
 	case DATA_TYPE_STRING_ARRAY:
 	case DATA_TYPE_NVLIST_ARRAY:
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Fetch the value of 'nvp', which must be of data type 'type'.
  *
  * Single-valued fixed-size types are copied into the storage at 'data'.
  * For strings, nvlists and all array types, 'data' is treated as a pointer
  * to a pointer and is set to the value area inside the pair (or to NULL for
  * an array with no elements).  '*nelem' receives the element count; it is
  * optional except for array types.
  *
  * Returns 0 on success, EINVAL for a NULL pair, a type mismatch or a
  * missing required output pointer, and ENOTSUP for an unknown type.
  */
 static int
 nvpair_value_common(const nvpair_t *nvp, data_type_t type, uint_t *nelem,
     void *data)
 {
 	int nbytes;
 
 	if (nvp == NULL)
 		return (EINVAL);
 	if (nvpair_type(nvp) != type)
 		return (EINVAL);
 
 	switch (type) {
 	case DATA_TYPE_BOOLEAN:
 		/* nothing to hand back except the (zero) element count */
 		if (nelem != NULL)
 			*nelem = 0;
 		return (0);
 
 	case DATA_TYPE_BOOLEAN_VALUE:
 	case DATA_TYPE_BYTE:
 	case DATA_TYPE_INT8:
 	case DATA_TYPE_UINT8:
 	case DATA_TYPE_INT16:
 	case DATA_TYPE_UINT16:
 	case DATA_TYPE_INT32:
 	case DATA_TYPE_UINT32:
 	case DATA_TYPE_INT64:
 	case DATA_TYPE_UINT64:
 	case DATA_TYPE_HRTIME:
 #if !defined(_KERNEL)
 	case DATA_TYPE_DOUBLE:
 #endif
 		/* fixed-size scalars are copied out by value */
 		if (data == NULL)
 			return (EINVAL);
 		nbytes = i_get_value_size(type, NULL, 1);
 		if (nbytes < 0)
 			return (EINVAL);
 		memcpy(data, NVP_VALUE(nvp), (size_t)nbytes);
 		if (nelem != NULL)
 			*nelem = 1;
 		return (0);
 
 	case DATA_TYPE_NVLIST:
 	case DATA_TYPE_STRING:
 		if (data == NULL)
 			return (EINVAL);
 		/*
 		 * The const on nvp is dropped here, so callers asking for
 		 * these types must not themselves accept const nvpairs.
 		 */
 		*(void **)data = (void *)NVP_VALUE(nvp);
 		if (nelem != NULL)
 			*nelem = 1;
 		return (0);
 
 	case DATA_TYPE_BOOLEAN_ARRAY:
 	case DATA_TYPE_BYTE_ARRAY:
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 	case DATA_TYPE_INT16_ARRAY:
 	case DATA_TYPE_UINT16_ARRAY:
 	case DATA_TYPE_INT32_ARRAY:
 	case DATA_TYPE_UINT32_ARRAY:
 	case DATA_TYPE_INT64_ARRAY:
 	case DATA_TYPE_UINT64_ARRAY:
 	case DATA_TYPE_STRING_ARRAY:
 	case DATA_TYPE_NVLIST_ARRAY:
 		if (nelem == NULL || data == NULL)
 			return (EINVAL);
 		/*
 		 * The const on nvp is dropped here, so callers asking for
 		 * these types must not themselves accept const nvpairs.
 		 */
 		*nelem = NVP_NELEM(nvp);
 		*(void **)data = (*nelem != 0) ? (void *)NVP_VALUE(nvp) : NULL;
 		return (0);
 
 	default:
 		return (ENOTSUP);
 	}
 }
 
 /*
  * Find the pair called 'name' of data type 'type' in 'nvl' and fetch its
  * value through nvpair_value_common().
  *
  * Returns EINVAL for a NULL name, a NULL list or a list without private
  * data; ENOTSUP if the list was created with neither NV_UNIQUE_NAME nor
  * NV_UNIQUE_NAME_TYPE; ENOENT if no such pair exists; otherwise the result
  * of nvpair_value_common().
  */
 static int
 nvlist_lookup_common(const nvlist_t *nvl, const char *name, data_type_t type,
     uint_t *nelem, void *data)
 {
 	nvpair_t *match;
 
 	if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
 		return (EINVAL);
 
 	if ((nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)) == 0)
 		return (ENOTSUP);
 
 	match = nvt_lookup_name_type(nvl, name, type);
 
 	return (match == NULL ? ENOENT :
 	    nvpair_value_common(match, type, nelem, data));
 }
 
 /*
  * Look up the DATA_TYPE_BOOLEAN pair called 'name' in 'nvl'.  There is no
  * value to return, so the result of nvlist_lookup_common() only says
  * whether the lookup succeeded.
  */
 int
 nvlist_lookup_boolean(const nvlist_t *nvl, const char *name)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
 }
 
 /*
  * Look up the DATA_TYPE_BOOLEAN_VALUE pair called 'name' in 'nvl' and copy
  * its value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_boolean_value(const nvlist_t *nvl, const char *name,
     boolean_t *val)
 {
 	return (nvlist_lookup_common(nvl, name,
 	    DATA_TYPE_BOOLEAN_VALUE, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_BYTE pair called 'name' in 'nvl' and copy its value
  * into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_byte(const nvlist_t *nvl, const char *name, uchar_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_INT8 pair called 'name' in 'nvl' and copy its value
  * into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int8(const nvlist_t *nvl, const char *name, int8_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_UINT8 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint8(const nvlist_t *nvl, const char *name, uint8_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_INT16 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int16(const nvlist_t *nvl, const char *name, int16_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_UINT16 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint16(const nvlist_t *nvl, const char *name, uint16_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_INT32 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int32(const nvlist_t *nvl, const char *name, int32_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_UINT32 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint32(const nvlist_t *nvl, const char *name, uint32_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_INT64 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int64(const nvlist_t *nvl, const char *name, int64_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_UINT64 pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint64(const nvlist_t *nvl, const char *name, uint64_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
 }
 
 #if !defined(_KERNEL)
 /*
  * Look up the DATA_TYPE_DOUBLE pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Only compiled when _KERNEL is not defined.  Returns
  * the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_double(const nvlist_t *nvl, const char *name, double *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val));
 }
 #endif
 
 /*
  * Look up the DATA_TYPE_STRING pair called 'name' in 'nvl'.  On success
  * '*val' points at the string stored inside the pair; the string is not
  * copied.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_string(const nvlist_t *nvl, const char *name, const char **val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_NVLIST pair called 'name' in 'nvl'.  On success
  * '*val' points at the list stored inside the pair; the list is not
  * copied.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
 }
 
 /*
  * Look up the DATA_TYPE_BOOLEAN_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
     boolean_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name,
 	    DATA_TYPE_BOOLEAN_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_BYTE_ARRAY pair called 'name' in 'nvl'.  On success
  * '*n' is the element count and '*a' points at the array inside the pair
  * (NULL when the count is zero).  Both outputs are required.  Returns the
  * result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
     uchar_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_INT8_ARRAY pair called 'name' in 'nvl'.  On success
  * '*n' is the element count and '*a' points at the array inside the pair
  * (NULL when the count is zero).  Both outputs are required.  Returns the
  * result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_UINT8_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
     uint8_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_INT16_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
     int16_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_UINT16_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
     uint16_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_INT32_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
     int32_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_UINT32_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
     uint32_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_INT64_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
     int64_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_UINT64_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array inside
  * the pair (NULL when the count is zero).  Both outputs are required.
  * Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
     uint64_t **a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_STRING_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array of string
  * pointers inside the pair (NULL when the count is zero).  Both outputs are
  * required.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
     char ***a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_NVLIST_ARRAY pair called 'name' in 'nvl'.  On
  * success '*n' is the element count and '*a' points at the array of list
  * pointers inside the pair (NULL when the count is zero).  Both outputs are
  * required.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
     nvlist_t ***a, uint_t *n)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
 }
 
 /*
  * Look up the DATA_TYPE_HRTIME pair called 'name' in 'nvl' and copy its
  * value into '*val'.  Returns the result of nvlist_lookup_common().
  */
 int
 nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
 {
 	return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
 }
 
 /*
  * Look up several pairs in 'nvl' with one call.
  *
  * The variable arguments are a sequence of groups terminated by a NULL
  * name.  Each group is:
  *	char *name, data_type_t type			for DATA_TYPE_BOOLEAN
  *	char *name, data_type_t type, void *val		for single values
  *	char *name, data_type_t type, void *val, uint_t *nelem	for arrays
  * where 'val' and 'nelem' are passed on to nvlist_lookup_common().
  *
  * Processing stops at the first group whose lookup fails and that error is
  * returned; an unrecognized type yields EINVAL.  If 'flag' contains
  * NV_FLAG_NOENTOK, a lookup that fails with ENOENT is treated as success
  * and processing continues.  Returns 0 if every group succeeded.
  */
 int
 nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
 {
 	va_list ap;
 	char *name;
 	int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
 	int ret = 0;
 
 	va_start(ap, flag);
 	/* ret is tested first so no further arguments are read after an error */
 	while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
 		data_type_t type;
 		void *val;
 		uint_t *nelem;
 
 		switch (type = va_arg(ap, data_type_t)) {
 		case DATA_TYPE_BOOLEAN:
 			/* no value pointer follows for a bare boolean */
 			ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
 			break;
 
 		case DATA_TYPE_BOOLEAN_VALUE:
 		case DATA_TYPE_BYTE:
 		case DATA_TYPE_INT8:
 		case DATA_TYPE_UINT8:
 		case DATA_TYPE_INT16:
 		case DATA_TYPE_UINT16:
 		case DATA_TYPE_INT32:
 		case DATA_TYPE_UINT32:
 		case DATA_TYPE_INT64:
 		case DATA_TYPE_UINT64:
 		case DATA_TYPE_HRTIME:
 		case DATA_TYPE_STRING:
 		case DATA_TYPE_NVLIST:
 #if !defined(_KERNEL)
 		case DATA_TYPE_DOUBLE:
 #endif
 			/* one output pointer follows */
 			val = va_arg(ap, void *);
 			ret = nvlist_lookup_common(nvl, name, type, NULL, val);
 			break;
 
 		case DATA_TYPE_BYTE_ARRAY:
 		case DATA_TYPE_BOOLEAN_ARRAY:
 		case DATA_TYPE_INT8_ARRAY:
 		case DATA_TYPE_UINT8_ARRAY:
 		case DATA_TYPE_INT16_ARRAY:
 		case DATA_TYPE_UINT16_ARRAY:
 		case DATA_TYPE_INT32_ARRAY:
 		case DATA_TYPE_UINT32_ARRAY:
 		case DATA_TYPE_INT64_ARRAY:
 		case DATA_TYPE_UINT64_ARRAY:
 		case DATA_TYPE_STRING_ARRAY:
 		case DATA_TYPE_NVLIST_ARRAY:
 			/* an output pointer and an element-count pointer follow */
 			val = va_arg(ap, void *);
 			nelem = va_arg(ap, uint_t *);
 			ret = nvlist_lookup_common(nvl, name, type, nelem, val);
 			break;
 
 		default:
 			ret = EINVAL;
 		}
 
 		/* a missing pair is not an error when the caller said so */
 		if (ret == ENOENT && noentok)
 			ret = 0;
 	}
 	va_end(ap);
 
 	return (ret);
 }
 
 /*
  * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function
  * returns zero and a pointer to the matching nvpair is returned in '*ret'
  * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penetrate
  * multiple levels of embedded nvlists, with 'sep' as the separator. As an
  * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
  * "a.d[3].e[1]".  This matches the C syntax for array embed (for convenience,
  * code also supports "a.d[3]e[1]" syntax).
  *
  * If 'ip' is non-NULL and the last name component is an array, return the
  * value of the "...[index]" array index in *ip. For an array reference that
  * is not indexed, *ip will be returned as -1. If there is a syntax error in
  * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
  * inside the 'name' string where the syntax error was detected.
  *
  * When a component that is not the last one names an nvlist array, the
  * lookup continues in the element selected by that component's own index.
  * A component without an index selects element 0, and an index that is
  * negative or not below the element count is an error.
  *
  * Returns 0 when the pair is found, ENOTSUP if a list being searched was
  * not created with NV_UNIQUE_NAME, and EINVAL for NULL arguments, syntax
  * errors, or a name that cannot be resolved.
  */
 static int
 nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
     nvpair_t **ret, int *ip, const char **ep)
 {
 	nvpair_t	*nvp;
 	const char	*np;
 	char		*sepp = NULL;
 	char		*idxp, *idxep;
 	nvlist_t	**nva;
 	long		idx = 0;
 	int		n;
 
 	if (ip)
 		*ip = -1;			/* not indexed */
 	if (ep)
 		*ep = NULL;
 
 	if ((nvl == NULL) || (name == NULL))
 		return (EINVAL);
 
 	/* step through components of name */
 	for (np = name; np && *np; np = sepp) {
 		/* ensure unique names */
 		if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
 			return (ENOTSUP);
 
 		/*
 		 * Each component starts without an index so that a value
 		 * parsed for an earlier component is never applied to a
 		 * later, unindexed array component.
 		 */
 		idx = 0;
 
 		/* skip white space */
 		skip_whitespace(np);
 		if (*np == 0)
 			break;
 
 		/* set 'sepp' to end of current component 'np' */
 		if (sep)
 			sepp = strchr(np, sep);
 		else
 			sepp = NULL;
 
 		/* find start of next "[ index ]..." */
 		idxp = strchr(np, '[');
 
 		/* if sepp comes first, set idxp to NULL */
 		if (sepp && idxp && (sepp < idxp))
 			idxp = NULL;
 
 		/*
 		 * At this point 'idxp' is set if there is an index
 		 * expected for the current component.
 		 */
 		if (idxp) {
 			/* set 'n' to length of current 'np' name component */
 			n = idxp++ - np;
 
 			/* keep sepp up to date for *ep use as we advance */
 			skip_whitespace(idxp);
 			sepp = idxp;
 
 			/* determine the index value */
 #if defined(_KERNEL)
 			if (ddi_strtol(idxp, &idxep, 0, &idx))
 				goto fail;
 #else
 			idx = strtol(idxp, &idxep, 0);
 #endif
 			if (idxep == idxp)
 				goto fail;
 
 			/* keep sepp up to date for *ep use as we advance */
 			sepp = idxep;
 
 			/* skip white space index value and check for ']' */
 			skip_whitespace(sepp);
 			if (*sepp++ != ']')
 				goto fail;
 
 			/* for embedded arrays, support C syntax: "a[1].b" */
 			skip_whitespace(sepp);
 			if (sep && (*sepp == sep))
 				sepp++;
 		} else if (sepp) {
 			n = sepp++ - np;
 		} else {
 			n = strlen(np);
 		}
 
 		/* trim trailing whitespace by reducing length of 'np' */
 		if (n == 0)
 			goto fail;
 		for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
 			;
 		n++;
 
 		/* skip whitespace, and set sepp to NULL if complete */
 		if (sepp) {
 			skip_whitespace(sepp);
 			if (*sepp == 0)
 				sepp = NULL;
 		}
 
 		/*
 		 * At this point:
 		 * o  'n' is the length of current 'np' component.
 		 * o  'idxp' is set if there was an index, and value 'idx'.
 		 * o  'sepp' is set to the beginning of the next component,
 		 *    and set to NULL if we have no more components.
 		 *
 		 * Search for nvpair with matching component name.
 		 */
 		for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(nvl, nvp)) {
 
 			/* continue if no match on name */
 			if (strncmp(np, nvpair_name(nvp), n) ||
 			    (strlen(nvpair_name(nvp)) != n))
 				continue;
 
 			/* if indexed, verify type is array oriented */
 			if (idxp && !nvpair_type_is_array(nvp))
 				goto fail;
 
 			/*
 			 * Full match found, return nvp and idx if this
 			 * was the last component.
 			 */
 			if (sepp == NULL) {
 				if (ret)
 					*ret = nvp;
 				if (ip && idxp)
 					*ip = (int)idx;	/* return index */
 				return (0);		/* found */
 			}
 
 			/*
 			 * More components: current match must be
 			 * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
 			 * to support going deeper.
 			 */
 			if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
 				nvl = EMBEDDED_NVL(nvp);
 				break;
 			} else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
 				/* 'n' is reused to receive the element count */
 				if (nvpair_value_nvlist_array(nvp,
 				    &nva, (uint_t *)&n) != 0)
 					goto fail;
 				if (nva == NULL)
 					goto fail;
 				/*
 				 * The index comes straight from the name
 				 * string and may carry a minus sign, so
 				 * both ends of the range are checked
 				 * before it is used as a subscript.
 				 */
 				if ((n < 0) || (idx < 0) || (idx >= n))
 					goto fail;
 				nvl = nva[idx];
 				break;
 			}
 
 			/* type does not support more levels */
 			goto fail;
 		}
 		if (nvp == NULL)
 			goto fail;		/* 'name' not found */
 
 		/* search for match of next component in embedded 'nvl' list */
 	}
 
 	/* also reached when 'name' is empty or holds only white space */
 fail:	if (ep && sepp)
 		*ep = sepp;
 	return (EINVAL);
 }
 
 /*
  * Return pointer to nvpair with specified 'name'.
  *
  * Calls nvlist_lookup_nvpair_ei_sep() with a separator of 0 and with NULL
  * for the index and error-position outputs; on success the pair is stored
  * in '*ret'.  Returns that function's result.
  */
 int
 nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
 {
 	return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
 }
 
 /*
  * Determine if named nvpair exists in nvlist (use embedded separator of '.'
  * and return array index).  See nvlist_lookup_nvpair_ei_sep for more detailed
  * description.
  *
  * 'ret', 'ip' and 'ep' are passed through unchanged, as is the return
  * value.
  */
 int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
     const char *name, nvpair_t **ret, int *ip, const char **ep)
 {
 	return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
 }
 
 boolean_t
 nvlist_exists(const nvlist_t *nvl, const char *name)
 {
 	nvpriv_t *priv;
 	nvpair_t *nvp;
 	i_nvp_t *curr;
 
 	if (name == NULL || nvl == NULL ||
 	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
 		return (B_FALSE);
 
 	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
 		nvp = &curr->nvi_nvp;
 
 		if (strcmp(name, NVP_NAME(nvp)) == 0)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_BOOLEAN_VALUE
  * with 'val' as the data argument and no element-count output.  Returns
  * the result of that call.
  */
 int
 nvpair_value_boolean_value(const nvpair_t *nvp, boolean_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_BYTE with 'val'
  * as the data argument and no element-count output.
  */
 int
 nvpair_value_byte(const nvpair_t *nvp, uchar_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT8 with 'val'
  * as the data argument and no element-count output.
  */
 int
 nvpair_value_int8(const nvpair_t *nvp, int8_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT8 with 'val'
  * as the data argument and no element-count output.
  */
 int
 nvpair_value_uint8(const nvpair_t *nvp, uint8_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT16 with 'val'
  * as the data argument and no element-count output.
  */
 int
 nvpair_value_int16(const nvpair_t *nvp, int16_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT16 with
  * 'val' as the data argument and no element-count output.
  */
 int
 nvpair_value_uint16(const nvpair_t *nvp, uint16_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT32 with 'val'
  * as the data argument and no element-count output.
  */
 int
 nvpair_value_int32(const nvpair_t *nvp, int32_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT32 with
  * 'val' as the data argument and no element-count output.
  */
 int
 nvpair_value_uint32(const nvpair_t *nvp, uint32_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT64 with 'val'
  * as the data argument and no element-count output.
  */
 int
 nvpair_value_int64(const nvpair_t *nvp, int64_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT64 with
  * 'val' as the data argument and no element-count output.
  */
 int
 nvpair_value_uint64(const nvpair_t *nvp, uint64_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
 }
 
 #if !defined(_KERNEL)
 /*
  * Only compiled when _KERNEL is not defined.  Delegates to
  * nvpair_value_common(), requesting DATA_TYPE_DOUBLE with 'val' as the
  * data argument and no element-count output.
  */
 int
 nvpair_value_double(const nvpair_t *nvp, double *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
 }
 #endif
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_STRING with
  * 'val' (a pointer to a const char pointer) as the data argument and no
  * element-count output.
  */
 int
 nvpair_value_string(const nvpair_t *nvp, const char **val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_NVLIST with
  * 'val' as the data argument and no element-count output.
  */
 int
 nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_BOOLEAN_ARRAY
  * with 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_BYTE_ARRAY with
  * 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT8_ARRAY with
  * 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT8_ARRAY with
  * 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT16_ARRAY with
  * 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT16_ARRAY
  * with 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT32_ARRAY with
  * 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT32_ARRAY
  * with 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_INT64_ARRAY with
  * 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_UINT64_ARRAY
  * with 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_STRING_ARRAY
  * with 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_string_array(nvpair_t *nvp, const char ***val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_NVLIST_ARRAY
  * with 'val' as the data argument and 'nelem' as the element-count output.
  */
 int
 nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
 }
 
 /*
  * Delegates to nvpair_value_common(), requesting DATA_TYPE_HRTIME with
  * 'val' as the data argument and no element-count output.
  */
 int
 nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
 {
 	return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
 }
 
 /*
  * Add specified pair to the list.
  *
  * The name, type, element count and value of 'nvp' are handed to
  * nvlist_add_common().  Returns EINVAL when either argument is NULL,
  * otherwise the result of nvlist_add_common().
  */
 int
 nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
 {
 	int err = EINVAL;

 	if (nvl != NULL && nvp != NULL) {
 		err = nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
 		    NVP_NELEM(nvp), NVP_VALUE(nvp));
 	}

 	return (err);
 }
 
 /*
  * Merge the pairs of 'nvl' into 'dst' by way of nvlist_copy_pairs().
  * The merged list will contain all names specified in both lists,
  * the values are taken from nvl in the case of duplicates.
  * 'flag' is accepted for interface compatibility and ignored.
  * Return 0 on success, EINVAL if either list is NULL.
  */
 int
 nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
 {
 	(void) flag;

 	if (dst == NULL || nvl == NULL)
 		return (EINVAL);

 	/* merging a list into itself leaves it unchanged */
 	if (dst == nvl)
 		return (0);

 	return (nvlist_copy_pairs(nvl, dst));
 }
 
 /*
  * Encoding related routines
  */
 /* Operation selectors stored in nvstream_t.nvs_op. */
 #define	NVS_OP_ENCODE	0
 #define	NVS_OP_DECODE	1
 #define	NVS_OP_GETSIZE	2

 /* forward declaration; struct nvs_ops is defined below */
 typedef struct nvs_ops nvs_ops_t;
 
 /*
  * State carried through one encode / decode / getsize pass over an nvlist.
  */
 typedef struct {
 	int		nvs_op;		/* one of the NVS_OP_* selectors */
 	const nvs_ops_t	*nvs_ops;	/* encoding-specific callbacks */
 	void		*nvs_private;	/* encoding-specific stream state */
 	nvpriv_t	*nvs_priv;	/* nvl_priv of the outermost nvlist */
 	int		nvs_recursion;	/* current embedded nvlist depth */
 } nvstream_t;
 
 /*
  * nvs operations are:
  *   - nvs_nvlist
  *     encoding / decoding of an nvlist header (nvlist_t)
  *     calculates the size used for header and end detection
  *
  *   - nvs_nvpair
  *     responsible for the first part of encoding / decoding of an nvpair
  *     calculates the decoded size of an nvpair
  *     (called with the pair and a NULL size when encoding, and with a
  *     NULL pair and a size output when decoding)
  *
  *   - nvs_nvp_op
  *     second part of encoding / decoding of an nvpair
  *     (when decoding, fills the freshly allocated nvpair buffer)
  *
  *   - nvs_nvp_size
  *     calculates the encoding size of an nvpair
  *
  *   - nvs_nvl_fini
  *     encodes the end detection mark (zeros).
  *     (called once after all pairs of a list have been encoded)
  *
  * Every callback returns 0 on success and an errno value on failure.
  */
 struct nvs_ops {
 	int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
 	int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
 	int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
 	int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
 	int (*nvs_nvl_fini)(nvstream_t *);
 };
 
 /*
  * Header occupying the first sizeof (nvs_header_t) bytes of a packed
  * buffer.  nvlist_common() writes it when encoding and reads the encoding
  * method and endianness back from it when decoding.
  */
 typedef struct {
 	char	nvh_encoding;	/* nvs encoding method */
 	char	nvh_endian;	/* nvs endian */
 	char	nvh_reserved1;	/* reserved for future use */
 	char	nvh_reserved2;	/* reserved for future use */
 } nvs_header_t;
 
 /*
  * Encode every nvpair of 'nvl' in list order, then emit the end-of-list
  * mark.  Any failure reported by the per-pair callback is turned into
  * EFAULT; otherwise the result of the nvs_nvl_fini callback is returned.
  */
 static int
 nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	i_nvp_t *node = priv->nvp_list;

 	while (node != NULL) {
 		int err = nvs->nvs_ops->nvs_nvpair(nvs, &node->nvi_nvp, NULL);

 		if (err != 0)
 			return (EFAULT);

 		node = node->nvi_next;
 	}

 	return (nvs->nvs_ops->nvs_nvl_fini(nvs));
 }
 
 /*
  * Decode nvpairs from the stream into 'nvl' until the end-of-list mark
  * (a decoded size of 0) is reached.
  *
  * For each pair: ask the stream for the decoded size, allocate a buffer of
  * that size, decode into it, validate it and link it into the list.
  * Returns 0 at the end of the list, or the first error encountered.
  */
 static int
 nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
 {
 	for (;;) {
 		nvpair_t *nvp;
 		size_t nvsize;
 		int err;

 		err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize);
 		if (err != 0)
 			return (err);

 		/* end of list */
 		if (nvsize == 0)
 			return (0);

 		/* make sure len makes sense */
 		if (nvsize < NVP_SIZE_CALC(1, 0))
 			return (EFAULT);

 		nvp = nvp_buf_alloc(nvl, nvsize);
 		if (nvp == NULL)
 			return (ENOMEM);

 		err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp);
 		if (err != 0) {
 			nvp_buf_free(nvl, nvp);
 			return (err);
 		}

 		/* validate first, then add the pair to the list */
 		if (i_validate_nvpair(nvp) != 0)
 			err = EFAULT;
 		else
 			err = nvt_add_nvpair(nvl, nvp);

 		if (err != 0) {
 			nvpair_free(nvp);
 			nvp_buf_free(nvl, nvp);
 			return (err);
 		}

 		nvp_buf_link(nvl, nvp);
 	}
 }
 
 /*
  * Add the encoded size of every nvpair in 'nvl' to '*buflen'.
  *
  * The running total is kept in 64 bits and must never exceed INT32_MAX.
  * Returns 0 on success; EINVAL if a pair's size cannot be computed or the
  * limit is exceeded, in which case '*buflen' is left untouched.
  */
 static int
 nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
 {
 	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
 	uint64_t total = *buflen;
 	i_nvp_t *node;

 	for (node = priv->nvp_list; node != NULL; node = node->nvi_next) {
 		size_t pair_sz;

 		if (nvs->nvs_ops->nvs_nvp_size(nvs, &node->nvi_nvp,
 		    &pair_sz) != 0)
 			return (EINVAL);

 		total += pair_sz;
 		if (total > INT32_MAX)
 			return (EINVAL);
 	}

 	*buflen = total;

 	return (0);
 }
 
 /*
  * Run the stream's current operation on one nvlist: first the header via
  * the nvs_nvlist callback, then all of its pairs.
  *
  * Returns EFAULT if the list has no private data, EINVAL for an unknown
  * operation, otherwise the result of the header or pair processing.
  */
 static int
 nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
 {
 	int err;

 	if (nvl->nvl_priv == 0)
 		return (EFAULT);

 	/* the header always comes first */
 	err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen);
 	if (err != 0)
 		return (err);

 	/* then each nvpair, according to the requested operation */
 	if (nvs->nvs_op == NVS_OP_ENCODE)
 		err = nvs_encode_pairs(nvs, nvl);
 	else if (nvs->nvs_op == NVS_OP_DECODE)
 		err = nvs_decode_pairs(nvs, nvl);
 	else if (nvs->nvs_op == NVS_OP_GETSIZE)
 		err = nvs_getsize_pairs(nvs, nvl, buflen);
 	else
 		err = EINVAL;

 	return (err);
 }
 
 /*
  * Encode or decode one embedded nvlist, keeping track of the nesting depth
  * in nvs->nvs_recursion (limited by nvpair_max_recursion).
  *
  * Returns 0 on success; EINVAL when the depth limit is reached or the
  * operation is neither encode nor decode; ENOTSUP / ENOMEM for a version
  * mismatch or allocation failure while decoding; otherwise the error
  * reported by nvs_operation().
  */
 static int
 nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
 {
 	switch (nvs->nvs_op) {
 	case NVS_OP_ENCODE: {
 		int err;

 		if (nvs->nvs_recursion >= nvpair_max_recursion)
 			return (EINVAL);
 		nvs->nvs_recursion++;
 		err = nvs_operation(nvs, embedded, NULL);
 		nvs->nvs_recursion--;
 		return (err);
 	}
 	case NVS_OP_DECODE: {
 		nvpriv_t *priv;
 		int err;

 		/* the packed nvlist_t must carry the expected version */
 		if (embedded->nvl_version != NV_VERSION)
 			return (ENOTSUP);

 		/* private data is derived from the outermost list's */
 		if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
 			return (ENOMEM);

 		nvlist_init(embedded, embedded->nvl_nvflag, priv);

 		/*
 		 * The depth check comes after nvlist_init() so that the
 		 * failure path can release the list with nvlist_free().
 		 */
 		if (nvs->nvs_recursion >= nvpair_max_recursion) {
 			nvlist_free(embedded);
 			return (EINVAL);
 		}
 		nvs->nvs_recursion++;
 		if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
 			nvlist_free(embedded);
 		nvs->nvs_recursion--;
 		return (err);
 	}
 	default:
 		break;
 	}

 	return (EINVAL);
 }
 
 /*
  * Encode, decode or size the nvlists held by a DATA_TYPE_NVLIST_ARRAY pair.
  *
  * 'size' is only written for NVS_OP_GETSIZE, where it receives the summed
  * size of all elements; callers performing encode / decode pass NULL.
  * Returns 0 on success, EFAULT when an element fails to encode or decode,
  * and EINVAL for sizing failures or an unknown operation.
  */
 static int
 nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
 {
 	size_t nelem = NVP_NELEM(nvp);
 	nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
 	int i;

 	switch (nvs->nvs_op) {
 	case NVS_OP_ENCODE:
 		for (i = 0; i < nelem; i++)
 			if (nvs_embedded(nvs, nvlp[i]) != 0)
 				return (EFAULT);
 		break;

 	case NVS_OP_DECODE: {
 		/*
 		 * The value area starts with nelem 64-bit pointer slots;
 		 * the nvlist_t structures follow directly after them.
 		 */
 		size_t len = nelem * sizeof (uint64_t);
 		nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);

 		memset(nvlp, 0, len);	/* don't trust packed data */
 		for (i = 0; i < nelem; i++) {
 			if (nvs_embedded(nvs, embedded) != 0) {
 				/* release what was decoded so far */
 				nvpair_free(nvp);
 				return (EFAULT);
 			}

 			/* slot i is only set once its list decoded fine */
 			nvlp[i] = embedded++;
 		}
 		break;
 	}
 	case NVS_OP_GETSIZE: {
 		/* 64-bit accumulator so the INT32_MAX check cannot wrap */
 		uint64_t nvsize = 0;

 		for (i = 0; i < nelem; i++) {
 			size_t nvp_sz = 0;

 			if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
 				return (EINVAL);

 			if ((nvsize += nvp_sz) > INT32_MAX)
 				return (EINVAL);
 		}

 		*size = nvsize;
 		break;
 	}
 	default:
 		return (EINVAL);
 	}

 	return (0);
 }
 
 /*
  * Per-encoding drivers, defined further down; nvlist_common() dispatches
  * to one of them based on the encoding method.
  */
 static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
 static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
 
 /*
  * Common routine for nvlist operations:
  * encode, decode, getsize (encoded size).
  *
  * nvl      - list to operate on; must have private data
  * buf      - packed buffer; required for encode and decode
  * buflen   - in: size of 'buf' (encode / decode); out: encoded size
  *            (getsize)
  * encoding - NV_ENCODE_NATIVE or NV_ENCODE_XDR; when decoding, the value
  *            is replaced by the one stored in the buffer header
  * nvs_op   - one of the NVS_OP_* selectors
  *
  * Returns 0 on success, EINVAL for bad arguments, ENOTSUP for an unknown
  * operation or encoding or for a native buffer of foreign endianness,
  * otherwise the error reported by the encoding driver.
  */
 static int
 nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
     int nvs_op)
 {
 	int err = 0;
 	nvstream_t nvs;
 	int nvl_endian;
 	/* host_endian: 1 on little-endian builds, 0 on big-endian builds */
 #if defined(_ZFS_LITTLE_ENDIAN)
 	int host_endian = 1;
 #elif defined(_ZFS_BIG_ENDIAN)
 	int host_endian = 0;
 #else
 #error "No endian defined!"
 #endif	/* _ZFS_LITTLE_ENDIAN */
 	nvs_header_t *nvh;

 	if (buflen == NULL || nvl == NULL ||
 	    (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
 		return (EINVAL);

 	nvs.nvs_op = nvs_op;
 	nvs.nvs_recursion = 0;

 	/*
 	 * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
 	 * a buffer is allocated.  The first 4 bytes in the buffer are
 	 * used for encoding method and host endian.
 	 */
 	switch (nvs_op) {
 	case NVS_OP_ENCODE:
 		if (buf == NULL || *buflen < sizeof (nvs_header_t))
 			return (EINVAL);

 		/* write the header for the requested encoding */
 		nvh = (void *)buf;
 		nvh->nvh_encoding = encoding;
 		nvh->nvh_endian = nvl_endian = host_endian;
 		nvh->nvh_reserved1 = 0;
 		nvh->nvh_reserved2 = 0;
 		break;

 	case NVS_OP_DECODE:
 		if (buf == NULL || *buflen < sizeof (nvs_header_t))
 			return (EINVAL);

 		/* get method of encoding from first byte */
 		nvh = (void *)buf;
 		encoding = nvh->nvh_encoding;
 		nvl_endian = nvh->nvh_endian;
 		break;

 	case NVS_OP_GETSIZE:
 		nvl_endian = host_endian;

 		/*
 		 * add the size for encoding
 		 */
 		*buflen = sizeof (nvs_header_t);
 		break;

 	default:
 		return (ENOTSUP);
 	}

 	/*
 	 * Create an nvstream with proper encoding method
 	 */
 	switch (encoding) {
 	case NV_ENCODE_NATIVE:
 		/*
 		 * check endianness, in case we are unpacking
 		 * from a file
 		 */
 		if (nvl_endian != host_endian)
 			return (ENOTSUP);
 		err = nvs_native(&nvs, nvl, buf, buflen);
 		break;
 	case NV_ENCODE_XDR:
 		err = nvs_xdr(&nvs, nvl, buf, buflen);
 		break;
 	default:
 		err = ENOTSUP;
 		break;
 	}

 	return (err);
 }
 
 /*
  * Compute the packed size of 'nvl' for the given encoding by running
  * nvlist_common() with NVS_OP_GETSIZE and no buffer.  The result is stored
  * in '*size'; the return value is that of nvlist_common().
  */
 int
 nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
 {
 	return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
 }
 
 /*
  * Pack nvlist into contiguous memory
  *
  * Convenience form of nvlist_xpack() that obtains the allocator from
  * 'kmflag' via nvlist_nv_alloc().  All other arguments are passed through
  * unchanged, and the return value is that of nvlist_xpack().
  */
 int
 nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
     int kmflag)
 {
 	return (nvlist_xpack(nvl, bufp, buflen, encoding,
 	    nvlist_nv_alloc(kmflag)));
 }
 
 /*
  * Pack 'nvl' into contiguous memory using the allocator 'nva'.
  *
  * If '*bufp' is non-NULL the caller supplied the buffer and '*buflen' is
  * its size; the list is encoded in place.  Otherwise a buffer of exactly
  * the required size is allocated, and on success '*bufp' and '*buflen'
  * describe it.
  *
  * Returns 0 on success, EINVAL for NULL arguments, ENOMEM if the buffer
  * cannot be allocated, or the error from sizing / encoding.
  */
 int
 nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
     nv_alloc_t *nva)
 {
 	nvpriv_t nvpriv;
 	size_t alloc_size;
 	char *buf;
 	int err;

 	if (nvl == NULL || nva == NULL || bufp == NULL || buflen == NULL)
 		return (EINVAL);

 	/* caller-provided buffer: encode straight into it */
 	if (*bufp != NULL) {
 		return (nvlist_common(nvl, *bufp, buflen, encoding,
 		    NVS_OP_ENCODE));
 	}

 	/*
 	 * The nvlist carries its own fixed allocator properties, which all
 	 * other nvlist routines (nvlist_add_*, ...) use, yet nvlist_pack()
 	 * lets the user specify different ones (e.g. KM_NOSLEEP).  The
 	 * user specified properties win here.  Dropping the kmflag from
 	 * nvlist_pack() would be clearer, but the interface is kept as is.
 	 */
 	nv_priv_init(&nvpriv, nva, 0);

 	err = nvlist_size(nvl, &alloc_size, encoding);
 	if (err != 0)
 		return (err);

 	buf = nv_mem_zalloc(&nvpriv, alloc_size);
 	if (buf == NULL)
 		return (ENOMEM);

 	err = nvlist_common(nvl, buf, &alloc_size, encoding, NVS_OP_ENCODE);
 	if (err != 0) {
 		nv_mem_free(&nvpriv, buf, alloc_size);
 		return (err);
 	}

 	*buflen = alloc_size;
 	*bufp = buf;

 	return (0);
 }
 
 /*
  * Unpack buf into an nvlist_t
  *
  * Convenience form of nvlist_xunpack() that obtains the allocator from
  * 'kmflag' via nvlist_nv_alloc().  The return value is that of
  * nvlist_xunpack().
  */
 int
 nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
 {
 	return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag)));
 }
 
 /*
  * Unpack 'buf' (of 'buflen' bytes) into a newly allocated nvlist that uses
  * the allocator 'nva'.  On success the list is returned through '*nvlp';
  * on failure the partially built list is freed and '*nvlp' is untouched.
  *
  * NV_ENCODE_NATIVE is only a placeholder here: when decoding,
  * nvlist_common() takes the encoding from the buffer header.
  */
 int
 nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
 {
 	nvlist_t *nvl;
 	int err;

 	if (nvlp == NULL)
 		return (EINVAL);

 	err = nvlist_xalloc(&nvl, 0, nva);
 	if (err != 0)
 		return (err);

 	err = nvlist_common(nvl, buf, &buflen, NV_ENCODE_NATIVE,
 	    NVS_OP_DECODE);
 	if (err != 0) {
 		nvlist_free(nvl);
 		return (err);
 	}

 	*nvlp = nvl;

 	return (0);
 }
 
 /*
  * Native encoding functions
  */
 typedef struct {
 	/*
 	 * This structure is used when encoding or decoding a packed nvpair
 	 * in the native format.  n_base points to a buffer containing the
 	 * packed nvpair.  n_end is a pointer to the end of the buffer.
 	 * (n_end actually points to the first byte past the end of the
 	 * buffer.)  n_curr is a pointer that lies between n_base and n_end.
 	 * It points to the current data that we are decoding.
 	 * The amount of data left in the buffer is equal to n_end - n_curr.
 	 * n_flag is used to recognize a packed embedded list: it is 0 until
 	 * the header of the outermost list has been processed and 1 after.
 	 * For NVS_OP_GETSIZE all three pointers are NULL.
 	 */
 	caddr_t n_base;
 	caddr_t n_end;
 	caddr_t n_curr;
 	uint_t  n_flag;
 } nvs_native_t;
 
 /*
  * Attach 'native' to the stream as its private state and initialize it.
  *
  * For encode / decode the cursor starts at 'buf' and the end is set
  * 'buflen' bytes later; for getsize no buffer is involved and all
  * pointers are NULL.  Returns EINVAL for any other operation, in which
  * case neither 'nvs' nor 'native' is modified.
  */
 static int
 nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
     size_t buflen)
 {
 	const int op = nvs->nvs_op;

 	if (op == NVS_OP_ENCODE || op == NVS_OP_DECODE) {
 		nvs->nvs_private = native;
 		native->n_base = buf;
 		native->n_curr = buf;
 		native->n_end = buf + buflen;
 		native->n_flag = 0;
 		return (0);
 	}

 	if (op == NVS_OP_GETSIZE) {
 		nvs->nvs_private = native;
 		native->n_base = NULL;
 		native->n_curr = NULL;
 		native->n_end = NULL;
 		native->n_flag = 0;
 		return (0);
 	}

 	return (EINVAL);
 }
 
 /*
  * Detach the native state from the stream.  Nothing is freed here; only
  * the nvs_private pointer is cleared.
  */
 static void
 nvs_native_destroy(nvstream_t *nvs)
 {
 	nvs->nvs_private = NULL;
 }
 
 /*
  * Copy 'size' bytes between 'buf' and the native stream at the cursor:
  * from 'buf' into the stream when encoding, from the stream into 'buf'
  * when decoding.  The cursor advances by 'size' on success.
  *
  * Returns 0 on success, EFAULT if fewer than 'size' bytes are left in the
  * stream, or EINVAL if the operation is neither encode nor decode.
  */
 static int
 native_cp(nvstream_t *nvs, void *buf, size_t size)
 {
 	nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;

 	/*
 	 * Compare against the space that is left instead of forming
 	 * n_curr + size: for a huge 'size' that pointer sum can wrap
 	 * around (and is undefined as soon as it passes n_end), which
 	 * would let the check succeed.  n_curr never exceeds n_end, so
 	 * the difference is not negative.
 	 */
 	if (size > (size_t)(native->n_end - native->n_curr))
 		return (EFAULT);

 	/*
 	 * The memcpy() below eliminates alignment requirement
 	 * on the buffer (stream) and is preferred over direct access.
 	 */
 	switch (nvs->nvs_op) {
 	case NVS_OP_ENCODE:
 		memcpy(native->n_curr, buf, size);
 		break;
 	case NVS_OP_DECODE:
 		memcpy(buf, native->n_curr, size);
 		break;
 	default:
 		return (EINVAL);
 	}

 	native->n_curr += size;
 	return (0);
 }
 
 /*
  * Operate on the nvlist_t header in native format.
  *
  * Encode / decode: the version and nvflag words are transferred only for
  * the first list seen on the stream; for a packed embedded list nothing is
  * done.  Getsize: '*size' grows by 4 bytes for the end mark of the list,
  * plus 2 * sizeof (int32_t) for version and nvflag if this is the first
  * list.  Returns 0, EFAULT if the stream is too short, or EINVAL for an
  * unknown operation.
  */
 static int
 nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
 {
 	nvs_native_t *native = nvs->nvs_private;
 	const int op = nvs->nvs_op;

 	if (op == NVS_OP_GETSIZE) {
 		/* every list, embedded or not, ends with a 4 byte mark */
 		size_t need = 4;

 		if (!native->n_flag) {
 			/* first list: nvl_version and nvl_nvflag as well */
 			native->n_flag = 1;
 			need += 2 * sizeof (int32_t);
 		}

 		*size += need;
 		return (0);
 	}

 	if (op != NVS_OP_ENCODE && op != NVS_OP_DECODE)
 		return (EINVAL);

 	/* packed embedded list */
 	if (native->n_flag)
 		return (0);

 	native->n_flag = 1;

 	/* copy version and nvflag of the nvlist_t */
 	if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0)
 		return (EFAULT);
 	if (native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
 		return (EFAULT);

 	return (0);
 }
 
 /*
  * Terminate a natively encoded nvlist.  When encoding, sizeof (int) zero
  * bytes are appended; the decode routine uses them for end detection.
  * For every other operation this is a no-op.  Returns EFAULT if the
  * stream has no room for the mark.
  */
 static int
 nvs_native_nvl_fini(nvstream_t *nvs)
 {
 	nvs_native_t *native;

 	if (nvs->nvs_op != NVS_OP_ENCODE)
 		return (0);

 	native = (nvs_native_t *)nvs->nvs_private;

 	if (native->n_curr + sizeof (int) > native->n_end)
 		return (EFAULT);

 	memset(native->n_curr, 0, sizeof (int));
 	native->n_curr += sizeof (int);

 	return (0);
 }
 
 /*
  * Handle the value of a DATA_TYPE_NVLIST pair in native format, then
  * process the embedded list itself through nvs_embedded().
  */
 static int
 nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
 {
 	if (nvs->nvs_op == NVS_OP_ENCODE) {
 		nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
 		/*
 		 * The pair was just copied to the stream, so the cursor sits
 		 * nvp_size bytes past its start; step back to the packed
 		 * copy of the value.
 		 */
 		nvlist_t *packed = (void *)
 		    (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
 		/*
 		 * Null out the pointer that is meaningless in the packed
 		 * structure. The address may not be aligned, so we have
 		 * to use memset.
 		 */
 		memset((char *)packed + offsetof(nvlist_t, nvl_priv),
 		    0, sizeof (uint64_t));
 	}

 	return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
 }
 
 /*
  * Handle the value of a DATA_TYPE_NVLIST_ARRAY pair in native format, then
  * process the embedded lists through nvs_embedded_nvl_array().
  */
 static int
 nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
 {
 	if (nvs->nvs_op == NVS_OP_ENCODE) {
 		nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
 		/* packed copy of the value: pointer slots, then nvlist_t's */
 		char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
 		size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
 		nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len);
 		int i;
 		/*
 		 * Null out pointers that are meaningless in the packed
 		 * structure. The addresses may not be aligned, so we have
 		 * to use memset.
 		 */
 		memset(value, 0, len);

 		for (i = 0; i < NVP_NELEM(nvp); i++, packed++)
 			/*
 			 * Null out the pointer that is meaningless in the
 			 * packed structure. The address may not be aligned,
 			 * so we have to use memset.
 			 */
 			memset((char *)packed + offsetof(nvlist_t, nvl_priv),
 			    0, sizeof (uint64_t));
 	}

 	return (nvs_embedded_nvl_array(nvs, nvp, NULL));
 }
 
 /*
  * Fix up the pointer table of a DATA_TYPE_STRING_ARRAY pair in native
  * format.  Encoding clears the table in the packed copy; decoding rebuilds
  * it so that each slot points at its string, the strings being stored back
  * to back after the table.  Other operations are ignored.
  */
 static void
 nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
 {
 	switch (nvs->nvs_op) {
 	case NVS_OP_ENCODE: {
 		nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
 		/* start of the value inside the packed copy of the pair */
 		uint64_t *strp = (void *)
 		    (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
 		/*
 		 * Null out pointers that are meaningless in the packed
 		 * structure. The addresses may not be aligned, so we have
 		 * to use memset.
 		 */
 		memset(strp, 0, NVP_NELEM(nvp) * sizeof (uint64_t));
 		break;
 	}
 	case NVS_OP_DECODE: {
 		char **strp = (void *)NVP_VALUE(nvp);
 		/* the first string follows the table of 64-bit slots */
 		char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
 		int i;

 		/*
 		 * NOTE(review): strlen() is not bounded by nvp_size here;
 		 * presumably the i_validate_nvpair() call made by
 		 * nvs_decode_pairs() afterwards rejects malformed pairs,
 		 * but that happens after this walk -- confirm.
 		 */
 		for (i = 0; i < NVP_NELEM(nvp); i++) {
 			strp[i] = buf;
 			buf += strlen(buf) + 1;
 		}
 		break;
 	}
 	}
 }
 
 /*
  * Encode or decode one nvpair in native format.
  *
  * The whole pair (nvp_size bytes) is copied to or from the stream first;
  * the name and the value size are then sanity-checked, and finally the
  * types that contain pointers get their type-specific treatment.
  * Returns 0 on success, EFAULT for a short stream or an inconsistent
  * pair, EINVAL for an operation other than encode / decode, or the error
  * from handling an embedded list.
  */
 static int
 nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
 {
 	data_type_t type;
 	int value_sz;
 	int ret = 0;

 	/*
 	 * We do the initial memcpy of the data before we look at
 	 * the nvpair type, because when we're decoding, we won't
 	 * have the correct values for the pair until we do the memcpy.
 	 */
 	switch (nvs->nvs_op) {
 	case NVS_OP_ENCODE:
 	case NVS_OP_DECODE:
 		if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
 			return (EFAULT);
 		break;
 	default:
 		return (EINVAL);
 	}

 	/* verify nvp_name_sz, check the name string length */
 	if (i_validate_nvpair_name(nvp) != 0)
 		return (EFAULT);

 	type = NVP_TYPE(nvp);

 	/*
 	 * Verify type and nelem and get the value size.
 	 * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
 	 * is the size of the string(s) excluded.
 	 */
 	if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
 		return (EFAULT);

 	/* name plus value must fit into the size the pair claims to have */
 	if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
 		return (EFAULT);

 	/* only types holding pointers need more than the plain copy */
 	switch (type) {
 	case DATA_TYPE_NVLIST:
 		ret = nvpair_native_embedded(nvs, nvp);
 		break;
 	case DATA_TYPE_NVLIST_ARRAY:
 		ret = nvpair_native_embedded_array(nvs, nvp);
 		break;
 	case DATA_TYPE_STRING_ARRAY:
 		nvpair_native_string_array(nvs, nvp);
 		break;
 	default:
 		break;
 	}

 	return (ret);
 }
 
 /*
  * Compute the natively encoded size of one nvpair and store it in '*size'.
  *
  * The size is nvp_size itself; pairs holding an embedded nvlist or an
  * array of nvlists additionally account for the encoded size of those
  * lists.  Returns 0 on success, or EINVAL if an embedded list cannot be
  * sized or the result exceeds INT32_MAX ('*size' is then left untouched).
  */
 static int
 nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
 {
 	uint64_t nvp_sz = nvp->nvp_size;

 	switch (NVP_TYPE(nvp)) {
 	case DATA_TYPE_NVLIST: {
 		size_t nvsize = 0;

 		if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
 			return (EINVAL);

 		nvp_sz += nvsize;
 		break;
 	}
 	case DATA_TYPE_NVLIST_ARRAY: {
 		/*
 		 * Start from 0 like the DATA_TYPE_NVLIST case does, so that
 		 * no indeterminate value is ever added should the callee
 		 * succeed without storing a size.
 		 */
 		size_t nvsize = 0;

 		if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
 			return (EINVAL);

 		nvp_sz += nvsize;
 		break;
 	}
 	default:
 		break;
 	}

 	/* the 64-bit total must stay below the 32-bit limit */
 	if (nvp_sz > INT32_MAX)
 		return (EINVAL);

 	*size = nvp_sz;

 	return (0);
 }
 
 /*
  * First stage of native nvpair processing.
  *
  * Encode: the pair is written in one go by nvs_native_nvp_op().
  * Decode: the size word at the cursor is peeked and returned in '*size';
  * a size of 0 marks the end of the list.  Returns EFAULT for a short
  * stream or an implausible size, EINVAL for any other operation.
  */
 static int
 nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
 {
 	nvs_native_t *native;
 	int32_t decode_len;

 	if (nvs->nvs_op == NVS_OP_ENCODE)
 		return (nvs_native_nvp_op(nvs, nvp));

 	if (nvs->nvs_op != NVS_OP_DECODE)
 		return (EINVAL);

 	native = (nvs_native_t *)nvs->nvs_private;

 	/* try to read the size value from the stream */
 	if (native->n_curr + sizeof (int32_t) > native->n_end)
 		return (EFAULT);
 	memcpy(&decode_len, native->n_curr, sizeof (int32_t));

 	/* sanity check the size value */
 	if (decode_len < 0)
 		return (EFAULT);
 	if (decode_len > native->n_end - native->n_curr)
 		return (EFAULT);

 	*size = decode_len;

 	/*
 	 * Only the end-of-list mark is consumed here.  For a real pair
 	 * the cursor stays put, because nvs_native_nvp_op() reads the
 	 * entire nvpair, size word included, from this same position.
 	 */
 	if (decode_len == 0)
 		native->n_curr += sizeof (int32_t);

 	return (0);
 }
 
 /*
  * Callback table for the native encoding; nvs_native() installs it in
  * nvs->nvs_ops.
  */
 static const nvs_ops_t nvs_native_ops = {
 	.nvs_nvlist = nvs_native_nvlist,
 	.nvs_nvpair = nvs_native_nvpair,
 	.nvs_nvp_op = nvs_native_nvp_op,
 	.nvs_nvp_size = nvs_native_nvp_size,
 	.nvs_nvl_fini = nvs_native_nvl_fini
 };
 
 /*
  * Driver for the native encoding: installs the native callback table,
  * sets up the stream state over the part of 'buf' that follows the
  * nvs_header_t, runs the operation on 'nvl' and tears the state down.
  * Returns the error from stream creation or from nvs_operation().
  */
 static int
 nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
 {
 	nvs_native_t native;
 	int err;

 	nvs->nvs_ops = &nvs_native_ops;

 	/* the payload starts right after the header */
 	err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
 	    *buflen - sizeof (nvs_header_t));
 	if (err != 0)
 		return (err);

 	err = nvs_operation(nvs, nvl, buflen);
 	nvs_native_destroy(nvs);

 	return (err);
 }
 
 /*
  * XDR encoding functions
  *
  * An xdr packed nvlist is encoded as:
  *
  *  - encoding method and host endian (4 bytes)
  *  - nvl_version (4 bytes)
  *  - nvl_nvflag (4 bytes)
  *
  *  - encoded nvpairs, the format of one xdr encoded nvpair is:
  *	- encoded size of the nvpair (4 bytes)
  *	- decoded size of the nvpair (4 bytes)
  *	- name string (4 + NV_ALIGN4(strlen(string)) bytes);
  *	  a string is coded as size (4 bytes) and data
  *	- data type (4 bytes)
  *	- number of elements in the nvpair (4 bytes)
  *	- data
  *
  *  - 2 zero's for end of the entire list (8 bytes)
  */
 /*
  * Prepare the stream for XDR processing.
  *
  * For encode / decode an in-memory XDR stream over 'buf' is created in
  * the matching direction and stored as the stream's private state; for
  * getsize no XDR stream is needed and the private state is NULL.
  * Returns EFAULT if 'buf' is not 4 byte aligned and EINVAL for an unknown
  * operation.
  */
 static int
 nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
 {
 	const int op = nvs->nvs_op;

 	/* xdr data must be 4 byte aligned */
 	if ((ulong_t)buf % 4 != 0)
 		return (EFAULT);

 	if (op == NVS_OP_GETSIZE) {
 		nvs->nvs_private = NULL;
 		return (0);
 	}

 	if (op == NVS_OP_ENCODE)
 		xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
 	else if (op == NVS_OP_DECODE)
 		xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
 	else
 		return (EINVAL);

 	nvs->nvs_private = xdr;

 	return (0);
 }
 
 /*
  * Detach the XDR state from the stream after an encode or decode pass.
  * Other operations leave the stream as it is.
  */
 static void
 nvs_xdr_destroy(nvstream_t *nvs)
 {
 	const int op = nvs->nvs_op;

 	if (op == NVS_OP_ENCODE || op == NVS_OP_DECODE)
 		nvs->nvs_private = NULL;
 }
 
 /*
  * Operate on the nvlist_t header in XDR format.
  *
  * Encode / decode: transfer nvl_version and nvl_nvflag through the XDR
  * stream.  Getsize: add 2 * 4 bytes for those two words plus 8 bytes for
  * the end-of-list mark to '*size'.  Returns EFAULT if an XDR primitive
  * fails and EINVAL for an unknown operation.
  */
 static int
 nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
 {
 	const int op = nvs->nvs_op;
 	XDR *xdr;

 	if (op == NVS_OP_GETSIZE) {
 		/*
 		 * 2 * 4 for nvl_version + nvl_nvflag
 		 * and 8 for end of the entire list
 		 */
 		*size += 2 * 4 + 8;
 		return (0);
 	}

 	if (op != NVS_OP_ENCODE && op != NVS_OP_DECODE)
 		return (EINVAL);

 	xdr = nvs->nvs_private;

 	if (!xdr_int(xdr, &nvl->nvl_version))
 		return (EFAULT);
 	if (!xdr_u_int(xdr, &nvl->nvl_nvflag))
 		return (EFAULT);

 	return (0);
 }
 
 /*
  * Terminate an XDR encoded nvlist: when encoding, two zero ints are
  * written as the end-of-list mark.  Every other operation is a no-op.
  * Returns EFAULT if the XDR stream rejects either write.
  */
 static int
 nvs_xdr_nvl_fini(nvstream_t *nvs)
 {
 	XDR *xdr;
 	int zero = 0;

 	if (nvs->nvs_op != NVS_OP_ENCODE)
 		return (0);

 	xdr = nvs->nvs_private;

 	if (!xdr_int(xdr, &zero))
 		return (EFAULT);
 	if (!xdr_int(xdr, &zero))
 		return (EFAULT);

 	return (0);
 }
 
 /*
  * xdrproc_t-compatible callbacks for xdr_array()
  *
  * NVS_BUILD_XDRPROC_T(type) defines a static function nvs_xdr_nvp_<type>()
  * that forwards to xdr_<type>().  Three variants exist, selected at
  * compile time, differing only in the wrapper's parameter list.
  */

 #if defined(_KERNEL) && defined(__linux__) /* Linux kernel */

 /* wrapper takes exactly (XDR *, void *) */
 #define	NVS_BUILD_XDRPROC_T(type)		\
 static bool_t					\
 nvs_xdr_nvp_##type(XDR *xdrs, void *ptr)	\
 {						\
 	return (xdr_##type(xdrs, ptr));		\
 }

 #elif !defined(_KERNEL) && defined(XDR_CONTROL) /* tirpc */

 /* wrapper is variadic after 'xdrs'; the data pointer comes via va_arg */
 #define	NVS_BUILD_XDRPROC_T(type)		\
 static bool_t					\
 nvs_xdr_nvp_##type(XDR *xdrs, ...)		\
 {						\
 	va_list args;				\
 	void *ptr;				\
 						\
 	va_start(args, xdrs);			\
 	ptr = va_arg(args, void *);		\
 	va_end(args);				\
 						\
 	return (xdr_##type(xdrs, ptr));		\
 }

 #else /* FreeBSD, sunrpc */

 /* wrapper takes (XDR *, void *, ...); extra arguments are ignored */
 #define	NVS_BUILD_XDRPROC_T(type)		\
 static bool_t					\
 nvs_xdr_nvp_##type(XDR *xdrs, void *ptr, ...)	\
 {						\
 	return (xdr_##type(xdrs, ptr));		\
 }

 #endif
 
-/* BEGIN CSTYLED */
 /* Instantiate the xdr_array() element callbacks used by nvs_xdr_nvp_op(). */
 NVS_BUILD_XDRPROC_T(char);
 NVS_BUILD_XDRPROC_T(short);
 NVS_BUILD_XDRPROC_T(u_short);
 NVS_BUILD_XDRPROC_T(int);
 NVS_BUILD_XDRPROC_T(u_int);
 NVS_BUILD_XDRPROC_T(longlong_t);
 NVS_BUILD_XDRPROC_T(u_longlong_t);
-/* END CSTYLED */
 
 /*
  * The format of xdr encoded nvpair is:
  * encode_size, decode_size, name string, data type, nelem, data
  *
  * This routine handles everything from the name string onwards, for both
  * encoding and decoding: the XDR primitives transfer data in the direction
  * the stream was created with.  'nvp' is the in-memory pair; every access
  * to its name and value is bounded by nvp + nvp_size.
  *
  * Returns 0 on success and EFAULT on any failure.
  */
 static int
 nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
 {
 	ASSERT(nvs != NULL && nvp != NULL);

 	data_type_t type;
 	char	*buf;
 	char	*buf_end = (char *)nvp + nvp->nvp_size;
 	int	value_sz;
 	uint_t	nelem, buflen;
 	bool_t	ret = FALSE;
 	XDR	*xdr = nvs->nvs_private;

 	ASSERT(xdr != NULL);

 	/* name string */
 	if ((buf = NVP_NAME(nvp)) >= buf_end)
 		return (EFAULT);
 	buflen = buf_end - buf;

 	/* buflen - 1 leaves room for the terminating NUL */
 	if (!xdr_string(xdr, &buf, buflen - 1))
 		return (EFAULT);
 	nvp->nvp_name_sz = strlen(buf) + 1;

 	/* type and nelem */
 	if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
 	    !xdr_int(xdr, &nvp->nvp_value_elem))
 		return (EFAULT);

 	type = NVP_TYPE(nvp);
 	nelem = nvp->nvp_value_elem;

 	/*
 	 * Verify type and nelem and get the value size.
 	 * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY
 	 * is the size of the string(s) excluded.
 	 */
 	if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
 		return (EFAULT);

 	/* if there is no data to extract then return */
 	if (nelem == 0)
 		return (0);

 	/* value */
 	if ((buf = NVP_VALUE(nvp)) >= buf_end)
 		return (EFAULT);
 	buflen = buf_end - buf;

 	if (buflen < value_sz)
 		return (EFAULT);

 	/*
 	 * Each case sets 'ret' to TRUE on success.  The two nvlist cases
 	 * return directly on success and otherwise fall out of the switch
 	 * with 'ret' still FALSE, which yields EFAULT below.
 	 */
 	switch (type) {
 	case DATA_TYPE_NVLIST:
 		if (nvs_embedded(nvs, (void *)buf) == 0)
 			return (0);
 		break;

 	case DATA_TYPE_NVLIST_ARRAY:
 		if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
 			return (0);
 		break;

 	case DATA_TYPE_BOOLEAN:
 		/* no value to transfer */
 		ret = TRUE;
 		break;

 	case DATA_TYPE_BYTE:
 	case DATA_TYPE_INT8:
 	case DATA_TYPE_UINT8:
 		ret = xdr_char(xdr, buf);
 		break;

 	case DATA_TYPE_INT16:
 		ret = xdr_short(xdr, (void *)buf);
 		break;

 	case DATA_TYPE_UINT16:
 		ret = xdr_u_short(xdr, (void *)buf);
 		break;

 	case DATA_TYPE_BOOLEAN_VALUE:
 	case DATA_TYPE_INT32:
 		ret = xdr_int(xdr, (void *)buf);
 		break;

 	case DATA_TYPE_UINT32:
 		ret = xdr_u_int(xdr, (void *)buf);
 		break;

 	case DATA_TYPE_INT64:
 		ret = xdr_longlong_t(xdr, (void *)buf);
 		break;

 	case DATA_TYPE_UINT64:
 		ret = xdr_u_longlong_t(xdr, (void *)buf);
 		break;

 	case DATA_TYPE_HRTIME:
 		/*
 		 * NOTE: must expose the definition of hrtime_t here
 		 */
 		ret = xdr_longlong_t(xdr, (void *)buf);
 		break;
 #if !defined(_KERNEL)
 	case DATA_TYPE_DOUBLE:
 		ret = xdr_double(xdr, (void *)buf);
 		break;
 #endif
 	case DATA_TYPE_STRING:
 		ret = xdr_string(xdr, &buf, buflen - 1);
 		break;

 	case DATA_TYPE_BYTE_ARRAY:
 		ret = xdr_opaque(xdr, buf, nelem);
 		break;

 	/*
 	 * For the array cases the maximum element count handed to
 	 * xdr_array() is what fits into the remaining 'buflen' bytes.
 	 */
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
 		    nvs_xdr_nvp_char);
 		break;

 	case DATA_TYPE_INT16_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
 		    sizeof (int16_t), nvs_xdr_nvp_short);
 		break;

 	case DATA_TYPE_UINT16_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
 		    sizeof (uint16_t), nvs_xdr_nvp_u_short);
 		break;

 	case DATA_TYPE_BOOLEAN_ARRAY:
 	case DATA_TYPE_INT32_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
 		    sizeof (int32_t), nvs_xdr_nvp_int);
 		break;

 	case DATA_TYPE_UINT32_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
 		    sizeof (uint32_t), nvs_xdr_nvp_u_int);
 		break;

 	case DATA_TYPE_INT64_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
 		    sizeof (int64_t), nvs_xdr_nvp_longlong_t);
 		break;

 	case DATA_TYPE_UINT64_ARRAY:
 		ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
 		    sizeof (uint64_t), nvs_xdr_nvp_u_longlong_t);
 		break;

 	case DATA_TYPE_STRING_ARRAY: {
 		/*
 		 * 'len' is the distance to skip before the next string:
 		 * initially the table of nelem 64-bit slots, afterwards
 		 * the previous string including its terminating NUL.
 		 */
 		size_t len = nelem * sizeof (uint64_t);
 		char **strp = (void *)buf;
 		int i;

 		if (nvs->nvs_op == NVS_OP_DECODE)
 			memset(buf, 0, len);	/* don't trust packed data */

 		for (i = 0; i < nelem; i++) {
 			/* at least one byte must remain after skipping */
 			if (buflen <= len)
 				return (EFAULT);

 			buf += len;
 			buflen -= len;

 			if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
 				return (EFAULT);

 			/* the pointer table is only rebuilt when decoding */
 			if (nvs->nvs_op == NVS_OP_DECODE)
 				strp[i] = buf;
 			len = strlen(buf) + 1;
 		}
 		ret = TRUE;
 		break;
 	}
 	default:
 		break;
 	}

 	return (ret == TRUE ? 0 : EFAULT);
 }
 
 /*
  * Compute the XDR encoded size of one nvpair and store it in '*size'.
  *
  * The total is accumulated in 64 bits and must not exceed INT32_MAX.
  * Returns 0 on success; EINVAL for an unknown data type, a failure while
  * sizing an embedded list, or a total above the limit ('*size' is then
  * left untouched).
  */
 static int
 nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
 {
 	data_type_t type = NVP_TYPE(nvp);
 	/*
 	 * encode_size + decode_size + name string size + data type + nelem
 	 * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
 	 */
 	uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;

 	switch (type) {
 	case DATA_TYPE_BOOLEAN:
 		/* carries no value */
 		break;

 	case DATA_TYPE_BOOLEAN_VALUE:
 	case DATA_TYPE_BYTE:
 	case DATA_TYPE_INT8:
 	case DATA_TYPE_UINT8:
 	case DATA_TYPE_INT16:
 	case DATA_TYPE_UINT16:
 	case DATA_TYPE_INT32:
 	case DATA_TYPE_UINT32:
 		nvp_sz += 4;	/* 4 is the minimum xdr unit */
 		break;

 	case DATA_TYPE_INT64:
 	case DATA_TYPE_UINT64:
 	case DATA_TYPE_HRTIME:
 #if !defined(_KERNEL)
 	case DATA_TYPE_DOUBLE:
 #endif
 		nvp_sz += 8;
 		break;

 	case DATA_TYPE_STRING:
 		/* 4 byte length word plus the padded string data */
 		nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
 		break;

 	case DATA_TYPE_BYTE_ARRAY:
 		nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
 		break;

 	case DATA_TYPE_BOOLEAN_ARRAY:
 	case DATA_TYPE_INT8_ARRAY:
 	case DATA_TYPE_UINT8_ARRAY:
 	case DATA_TYPE_INT16_ARRAY:
 	case DATA_TYPE_UINT16_ARRAY:
 	case DATA_TYPE_INT32_ARRAY:
 	case DATA_TYPE_UINT32_ARRAY:
 		/* 4 bytes up front, then 4 bytes per element */
 		nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
 		break;

 	case DATA_TYPE_INT64_ARRAY:
 	case DATA_TYPE_UINT64_ARRAY:
 		/* 4 bytes up front, then 8 bytes per element */
 		nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
 		break;

 	case DATA_TYPE_STRING_ARRAY: {
 		int i;
 		char **strs = (void *)NVP_VALUE(nvp);

 		/* every string is sized like a DATA_TYPE_STRING value */
 		for (i = 0; i < NVP_NELEM(nvp); i++)
 			nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));

 		break;
 	}

 	case DATA_TYPE_NVLIST:
 	case DATA_TYPE_NVLIST_ARRAY: {
 		size_t nvsize = 0;
 		int old_nvs_op = nvs->nvs_op;
 		int err;

 		/*
 		 * Size the embedded list(s) by temporarily switching the
 		 * stream to NVS_OP_GETSIZE; the previous operation is
 		 * restored before the result is examined.
 		 */
 		nvs->nvs_op = NVS_OP_GETSIZE;
 		if (type == DATA_TYPE_NVLIST)
 			err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
 		else
 			err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
 		nvs->nvs_op = old_nvs_op;

 		if (err != 0)
 			return (EINVAL);

 		nvp_sz += nvsize;
 		break;
 	}

 	default:
 		return (EINVAL);
 	}

 	if (nvp_sz > INT32_MAX)
 		return (EINVAL);

 	*size = nvp_sz;

 	return (0);
 }
 
 
 /*
  * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
  * the largest nvpair that could be encoded in the buffer.
  *
  * See comments above nvpair_xdr_op() for the format of xdr encoding.
  * The size of a xdr packed nvpair without any data is 5 words.
  *
  * Using the size of the data directly as an estimate would be ok
  * in all cases except one.  If the data type is of DATA_TYPE_STRING_ARRAY
  * then the actual nvpair has space for an array of pointers to index
  * the strings.  These pointers are not encoded into the packed xdr buffer.
  *
  * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
  * of length 0, then each string is encoded in xdr format as a single word.
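-  * Therefore when expanded to an nvpair there will be 2.25 word used for
-  * each string.  (a int64_t allocated for pointer usage, and a single char
+  * Therefore when expanded to an nvpair there will be 2.25 words used for
+  * each string.  (an int64_t allocated for pointer usage, and a single char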
  * for the null termination.)
  *
  * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
  */
 #define	NVS_XDR_HDR_LEN		((size_t)(5 * 4))
 #define	NVS_XDR_DATA_LEN(y)	(((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
 					0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
 #define	NVS_XDR_MAX_LEN(x)	(NVP_SIZE_CALC(1, 0) + \
 					(NVS_XDR_DATA_LEN(x) * 2) + \
 					NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
 
 static int
 nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
 {
 	XDR 	*xdr = nvs->nvs_private;
 	int32_t	encode_len, decode_len;
 
 	switch (nvs->nvs_op) {
 	case NVS_OP_ENCODE: {
 		size_t nvsize;
 
 		if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
 			return (EFAULT);
 
 		decode_len = nvp->nvp_size;
 		encode_len = nvsize;
 		if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
 			return (EFAULT);
 
 		return (nvs_xdr_nvp_op(nvs, nvp));
 	}
 	case NVS_OP_DECODE: {
 		struct xdr_bytesrec bytesrec;
 
 		/* get the encode and decode size */
 		if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
 			return (EFAULT);
 		*size = decode_len;
 
 		/* are we at the end of the stream? */
 		if (*size == 0)
 			return (0);
 
 		/* sanity check the size parameter */
 		if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
 			return (EFAULT);
 
 		if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
 			return (EFAULT);
 		break;
 	}
 
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static const struct nvs_ops nvs_xdr_ops = {
 	.nvs_nvlist = nvs_xdr_nvlist,
 	.nvs_nvpair = nvs_xdr_nvpair,
 	.nvs_nvp_op = nvs_xdr_nvp_op,
 	.nvs_nvp_size = nvs_xdr_nvp_size,
 	.nvs_nvl_fini = nvs_xdr_nvl_fini
 };
 
 static int
 nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
 {
 	XDR xdr;
 	int err;
 
 	nvs->nvs_ops = &nvs_xdr_ops;
 
 	if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
 	    *buflen - sizeof (nvs_header_t))) != 0)
 		return (err);
 
 	err = nvs_operation(nvs, nvl, buflen);
 
 	nvs_xdr_destroy(nvs);
 
 	return (err);
 }
 
 EXPORT_SYMBOL(nv_alloc_init);
 EXPORT_SYMBOL(nv_alloc_reset);
 EXPORT_SYMBOL(nv_alloc_fini);
 
 /* list management */
 EXPORT_SYMBOL(nvlist_alloc);
 EXPORT_SYMBOL(nvlist_free);
 EXPORT_SYMBOL(nvlist_size);
 EXPORT_SYMBOL(nvlist_pack);
 EXPORT_SYMBOL(nvlist_unpack);
 EXPORT_SYMBOL(nvlist_dup);
 EXPORT_SYMBOL(nvlist_merge);
 
 EXPORT_SYMBOL(nvlist_xalloc);
 EXPORT_SYMBOL(nvlist_xpack);
 EXPORT_SYMBOL(nvlist_xunpack);
 EXPORT_SYMBOL(nvlist_xdup);
 EXPORT_SYMBOL(nvlist_lookup_nv_alloc);
 
 EXPORT_SYMBOL(nvlist_add_nvpair);
 EXPORT_SYMBOL(nvlist_add_boolean);
 EXPORT_SYMBOL(nvlist_add_boolean_value);
 EXPORT_SYMBOL(nvlist_add_byte);
 EXPORT_SYMBOL(nvlist_add_int8);
 EXPORT_SYMBOL(nvlist_add_uint8);
 EXPORT_SYMBOL(nvlist_add_int16);
 EXPORT_SYMBOL(nvlist_add_uint16);
 EXPORT_SYMBOL(nvlist_add_int32);
 EXPORT_SYMBOL(nvlist_add_uint32);
 EXPORT_SYMBOL(nvlist_add_int64);
 EXPORT_SYMBOL(nvlist_add_uint64);
 EXPORT_SYMBOL(nvlist_add_string);
 EXPORT_SYMBOL(nvlist_add_nvlist);
 EXPORT_SYMBOL(nvlist_add_boolean_array);
 EXPORT_SYMBOL(nvlist_add_byte_array);
 EXPORT_SYMBOL(nvlist_add_int8_array);
 EXPORT_SYMBOL(nvlist_add_uint8_array);
 EXPORT_SYMBOL(nvlist_add_int16_array);
 EXPORT_SYMBOL(nvlist_add_uint16_array);
 EXPORT_SYMBOL(nvlist_add_int32_array);
 EXPORT_SYMBOL(nvlist_add_uint32_array);
 EXPORT_SYMBOL(nvlist_add_int64_array);
 EXPORT_SYMBOL(nvlist_add_uint64_array);
 EXPORT_SYMBOL(nvlist_add_string_array);
 EXPORT_SYMBOL(nvlist_add_nvlist_array);
 EXPORT_SYMBOL(nvlist_next_nvpair);
 EXPORT_SYMBOL(nvlist_prev_nvpair);
 EXPORT_SYMBOL(nvlist_empty);
 EXPORT_SYMBOL(nvlist_add_hrtime);
 
 EXPORT_SYMBOL(nvlist_remove);
 EXPORT_SYMBOL(nvlist_remove_nvpair);
 EXPORT_SYMBOL(nvlist_remove_all);
 
 EXPORT_SYMBOL(nvlist_lookup_boolean);
 EXPORT_SYMBOL(nvlist_lookup_boolean_value);
 EXPORT_SYMBOL(nvlist_lookup_byte);
 EXPORT_SYMBOL(nvlist_lookup_int8);
 EXPORT_SYMBOL(nvlist_lookup_uint8);
 EXPORT_SYMBOL(nvlist_lookup_int16);
 EXPORT_SYMBOL(nvlist_lookup_uint16);
 EXPORT_SYMBOL(nvlist_lookup_int32);
 EXPORT_SYMBOL(nvlist_lookup_uint32);
 EXPORT_SYMBOL(nvlist_lookup_int64);
 EXPORT_SYMBOL(nvlist_lookup_uint64);
 EXPORT_SYMBOL(nvlist_lookup_string);
 EXPORT_SYMBOL(nvlist_lookup_nvlist);
 EXPORT_SYMBOL(nvlist_lookup_boolean_array);
 EXPORT_SYMBOL(nvlist_lookup_byte_array);
 EXPORT_SYMBOL(nvlist_lookup_int8_array);
 EXPORT_SYMBOL(nvlist_lookup_uint8_array);
 EXPORT_SYMBOL(nvlist_lookup_int16_array);
 EXPORT_SYMBOL(nvlist_lookup_uint16_array);
 EXPORT_SYMBOL(nvlist_lookup_int32_array);
 EXPORT_SYMBOL(nvlist_lookup_uint32_array);
 EXPORT_SYMBOL(nvlist_lookup_int64_array);
 EXPORT_SYMBOL(nvlist_lookup_uint64_array);
 EXPORT_SYMBOL(nvlist_lookup_string_array);
 EXPORT_SYMBOL(nvlist_lookup_nvlist_array);
 EXPORT_SYMBOL(nvlist_lookup_hrtime);
 EXPORT_SYMBOL(nvlist_lookup_pairs);
 
 EXPORT_SYMBOL(nvlist_lookup_nvpair);
 EXPORT_SYMBOL(nvlist_exists);
 
 /* processing nvpair */
 EXPORT_SYMBOL(nvpair_name);
 EXPORT_SYMBOL(nvpair_type);
 EXPORT_SYMBOL(nvpair_value_boolean_value);
 EXPORT_SYMBOL(nvpair_value_byte);
 EXPORT_SYMBOL(nvpair_value_int8);
 EXPORT_SYMBOL(nvpair_value_uint8);
 EXPORT_SYMBOL(nvpair_value_int16);
 EXPORT_SYMBOL(nvpair_value_uint16);
 EXPORT_SYMBOL(nvpair_value_int32);
 EXPORT_SYMBOL(nvpair_value_uint32);
 EXPORT_SYMBOL(nvpair_value_int64);
 EXPORT_SYMBOL(nvpair_value_uint64);
 EXPORT_SYMBOL(nvpair_value_string);
 EXPORT_SYMBOL(nvpair_value_nvlist);
 EXPORT_SYMBOL(nvpair_value_boolean_array);
 EXPORT_SYMBOL(nvpair_value_byte_array);
 EXPORT_SYMBOL(nvpair_value_int8_array);
 EXPORT_SYMBOL(nvpair_value_uint8_array);
 EXPORT_SYMBOL(nvpair_value_int16_array);
 EXPORT_SYMBOL(nvpair_value_uint16_array);
 EXPORT_SYMBOL(nvpair_value_int32_array);
 EXPORT_SYMBOL(nvpair_value_uint32_array);
 EXPORT_SYMBOL(nvpair_value_int64_array);
 EXPORT_SYMBOL(nvpair_value_uint64_array);
 EXPORT_SYMBOL(nvpair_value_string_array);
 EXPORT_SYMBOL(nvpair_value_nvlist_array);
 EXPORT_SYMBOL(nvpair_value_hrtime);
diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c
index 4b9cc65d641e..0a2fcf110d7b 100644
--- a/module/os/freebsd/spl/spl_dtrace.c
+++ b/module/os/freebsd/spl/spl_dtrace.c
@@ -1,35 +1,34 @@
 /*
  * Copyright 2014 The FreeBSD Project.
  * All rights reserved.
  *
  * This software was developed by Steven Hartland.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 
-/* CSTYLED */
 SDT_PROBE_DEFINE1(sdt, , , set__error, "int");
diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c
index c84cb7407a9c..7350b8a6d49f 100644
--- a/module/os/freebsd/zfs/sysctl_os.c
+++ b/module/os/freebsd/zfs/sysctl_os.c
@@ -1,886 +1,804 @@
 /*
  * Copyright (c) 2020 iXsystems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
 #include <sys/cmn_err.h>
 #include <sys/stat.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/arc_os.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/sunddi.h>
 #include <sys/policy.h>
 #include <sys/zone.h>
 #include <sys/nvpair.h>
 #include <sys/mount.h>
 #include <sys/taskqueue.h>
 #include <sys/sdt.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 #include <sys/zcp.h>
 #include <sys/zio_checksum.h>
 #include <sys/vdev_removal.h>
 #include <sys/dsl_crypt.h>
 
 #include <sys/zfs_ioctl_compat.h>
 #include <sys/zfs_context.h>
 
 #include <sys/arc_impl.h>
 #include <sys/dsl_pool.h>
 
 #include <sys/vmmeter.h>
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0,
 	"ZFS adaptive replacement cache");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0,
 	"ZFS Block Reference Table");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0,
 	"ZFS disk buf cache");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0,
 	"ZFS multihost protection");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
 
 SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
 	"ZFS livelist condense");
 SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
 SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
 	"ZFS VDEV mirror");
 
 SYSCTL_DECL(_vfs_zfs_version);
 SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD,
 	(ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version");
 
 /* arc.c */
 
 int
 param_set_arc_u64(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 
 	err = sysctl_handle_64(oidp, arg1, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	arc_tuning_update(B_TRUE);
 
 	return (0);
 }
 
 int
 param_set_arc_int(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 
 	err = sysctl_handle_int(oidp, arg1, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	arc_tuning_update(B_TRUE);
 
 	return (0);
 }
 
 int
 param_set_arc_max(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long val;
 	int err;
 
 	val = zfs_arc_max;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (SET_ERROR(err));
 
 	if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min ||
 	    val >= arc_all_memory()))
 		return (SET_ERROR(EINVAL));
 
 	zfs_arc_max = val;
 	arc_tuning_update(B_TRUE);
 
 	/* Update the sysctl to the tuned value */
 	if (val != 0)
 		zfs_arc_max = arc_c_max;
 
 	return (0);
 }
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
 	CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
 	NULL, 0, param_set_arc_max, "LU",
 	"Maximum ARC size in bytes (LEGACY)");
-/* END CSTYLED */
 
 int
 param_set_arc_min(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long val;
 	int err;
 
 	val = zfs_arc_min;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (SET_ERROR(err));
 
 	if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max))
 		return (SET_ERROR(EINVAL));
 
 	zfs_arc_min = val;
 	arc_tuning_update(B_TRUE);
 
 	/* Update the sysctl to the tuned value */
 	if (val != 0)
 		zfs_arc_min = arc_c_min;
 
 	return (0);
 }
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
 	CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
 	NULL, 0, param_set_arc_min, "LU",
 	"Minimum ARC size in bytes (LEGACY)");
-/* END CSTYLED */
 
 extern uint_t zfs_arc_free_target;
 
 int
 param_set_arc_free_target(SYSCTL_HANDLER_ARGS)
 {
 	uint_t val;
 	int err;
 
 	val = zfs_arc_free_target;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < minfree)
 		return (EINVAL);
 	if (val > vm_cnt.v_page_count)
 		return (EINVAL);
 
 	zfs_arc_free_target = val;
 
 	return (0);
 }
 
 /*
  * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on
  * pagedaemon initialization.
  */
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
 	CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	NULL, 0, param_set_arc_free_target, "IU",
 	"Desired number of free pages below which ARC triggers reclaim"
 	" (LEGACY)");
-/* END CSTYLED */
 
 int
 param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
 {
 	int err, val;
 
 	val = arc_no_grow_shift;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < 0 || val >= arc_shrink_shift)
 		return (EINVAL);
 
 	arc_no_grow_shift = val;
 
 	return (0);
 }
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
 	CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
 	NULL, 0, param_set_arc_no_grow_shift, "I",
 	"log2(fraction of ARC which must be free to allow growing) (LEGACY)");
-/* END CSTYLED */
 
 extern uint64_t l2arc_write_max;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max,
 	CTLFLAG_RWTUN, &l2arc_write_max, 0,
 	"Max write bytes per interval (LEGACY)");
-/* END CSTYLED */
 
 extern uint64_t l2arc_write_boost;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost,
 	CTLFLAG_RWTUN, &l2arc_write_boost, 0,
 	"Extra write bytes during device warmup (LEGACY)");
-/* END CSTYLED */
 
 extern uint64_t l2arc_headroom;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom,
 	CTLFLAG_RWTUN, &l2arc_headroom, 0,
 	"Number of max device writes to precache (LEGACY)");
-/* END CSTYLED */
 
 extern uint64_t l2arc_headroom_boost;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost,
 	CTLFLAG_RWTUN, &l2arc_headroom_boost, 0,
 	"Compressed l2arc_headroom multiplier (LEGACY)");
-/* END CSTYLED */
 
 extern uint64_t l2arc_feed_secs;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs,
 	CTLFLAG_RWTUN, &l2arc_feed_secs, 0,
 	"Seconds between L2ARC writing (LEGACY)");
-/* END CSTYLED */
 
 extern uint64_t l2arc_feed_min_ms;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms,
 	CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0,
 	"Min feed interval in milliseconds (LEGACY)");
-/* END CSTYLED */
 
 extern int l2arc_noprefetch;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch,
 	CTLFLAG_RWTUN, &l2arc_noprefetch, 0,
 	"Skip caching prefetched buffers (LEGACY)");
-/* END CSTYLED */
 
 extern int l2arc_feed_again;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again,
 	CTLFLAG_RWTUN, &l2arc_feed_again, 0,
 	"Turbo L2ARC warmup (LEGACY)");
-/* END CSTYLED */
 
 extern int l2arc_norw;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw,
 	CTLFLAG_RWTUN, &l2arc_norw, 0,
 	"No reads during writes (LEGACY)");
-/* END CSTYLED */
 
 static int
 param_get_arc_state_size(SYSCTL_HANDLER_ARGS)
 {
 	arc_state_t *state = (arc_state_t *)arg1;
 	int64_t val;
 
 	val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
 	return (sysctl_handle_64(oidp, &val, 0, req));
 }
 
 extern arc_state_t ARC_anon;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_anon, 0, param_get_arc_state_size, "Q",
 	"size of anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
 	&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
 	"size of evictable metadata in anonymous state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
 	&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
 	"size of evictable data in anonymous state");
-/* END CSTYLED */
 
 extern arc_state_t ARC_mru;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_mru, 0, param_get_arc_state_size, "Q",
 	"size of mru state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
 	&ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
 	"size of evictable metadata in mru state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
 	&ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
 	"size of evictable data in mru state");
-/* END CSTYLED */
 
 extern arc_state_t ARC_mru_ghost;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_mru_ghost, 0, param_get_arc_state_size, "Q",
 	"size of mru ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
 	&ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
 	"size of evictable metadata in mru ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
 	&ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
 	"size of evictable data in mru ghost state");
-/* END CSTYLED */
 
 extern arc_state_t ARC_mfu;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_mfu, 0, param_get_arc_state_size, "Q",
 	"size of mfu state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
 	&ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
 	"size of evictable metadata in mfu state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
 	&ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
 	"size of evictable data in mfu state");
-/* END CSTYLED */
 
 extern arc_state_t ARC_mfu_ghost;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_mfu_ghost, 0, param_get_arc_state_size, "Q",
 	"size of mfu ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
 	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
 	"size of evictable metadata in mfu ghost state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
 	&ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
 	"size of evictable data in mfu ghost state");
-/* END CSTYLED */
 
 extern arc_state_t ARC_uncached;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_uncached, 0, param_get_arc_state_size, "Q",
 	"size of uncached state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
 	&ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
 	"size of evictable metadata in uncached state");
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
 	&ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
 	"size of evictable data in uncached state");
-/* END CSTYLED */
 
 extern arc_state_t ARC_l2c_only;
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size,
 	CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	&ARC_l2c_only, 0, param_get_arc_state_size, "Q",
 	"size of l2c_only state");
-/* END CSTYLED */
 
 /* dbuf.c */
 
 /* dmu.c */
 
 /* dmu_zfetch.c */
 
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)");
 
 extern uint32_t	zfetch_max_distance;
 
-/* BEGIN CSTYLED */
 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance,
 	CTLFLAG_RWTUN, &zfetch_max_distance, 0,
 	"Max bytes to prefetch per stream (LEGACY)");
-/* END CSTYLED */
 
 extern uint32_t	zfetch_max_idistance;
 
-/* BEGIN CSTYLED */
 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance,
 	CTLFLAG_RWTUN, &zfetch_max_idistance, 0,
 	"Max bytes to prefetch indirects for per stream (LEGACY)");
-/* END CSTYLED */
 
 /* dsl_pool.c */
 
 /* dnode.c */
 
 /* dsl_scan.c */
 
 /* metaslab.c */
 
 int
 param_set_active_allocator(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	int rc;
 
-	if (req->newptr == NULL)
-		strlcpy(buf, zfs_active_allocator, sizeof (buf));
+	/* Seed buf: sysctl_handle_string() copies out the old value first. */
+	strlcpy(buf, zfs_active_allocator, sizeof (buf));
 
 	rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (rc || req->newptr == NULL)
 		return (rc);
 	if (strcmp(buf, zfs_active_allocator) == 0)
 		return (0);
 
 	return (param_set_active_allocator_common(buf));
 }
 
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
  * transaction group. Thus, we benefit from having a small space map
  * block size since it allows us to issue more I/O operations scattered
  * around the disk. So a sane default for the space map block size
  * is 8~16K.
  */
 extern int zfs_metaslab_sm_blksz_no_log;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log,
 	CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0,
 	"Block size for space map in pools with log space map disabled.  "
 	"Power of 2 greater than 4096.");
-/* END CSTYLED */
 
 /*
  * When the log space map feature is enabled, we accumulate a lot of
  * changes per metaslab that are flushed once in a while so we benefit
  * from a bigger block size like 128K for the metaslab space maps.
  */
 extern int zfs_metaslab_sm_blksz_with_log;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log,
 	CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0,
 	"Block size for space map in pools with log space map enabled.  "
 	"Power of 2 greater than 4096.");
-/* END CSTYLED */
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 extern uint_t zfs_condense_pct;
 
-/* BEGIN CSTYLED */
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct,
 	CTLFLAG_RWTUN, &zfs_condense_pct, 0,
 	"Condense on-disk spacemap when it is more than this many percents"
 	" of in-memory counterpart");
-/* END CSTYLED */
 
 extern uint_t zfs_remove_max_segment;
 
-/* BEGIN CSTYLED */
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment,
 	CTLFLAG_RWTUN, &zfs_remove_max_segment, 0,
 	"Largest contiguous segment ZFS will attempt to allocate when removing"
 	" a device");
-/* END CSTYLED */
 
 extern int zfs_removal_suspend_progress;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress,
 	CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0,
 	"Ensures certain actions can happen while in the middle of a removal");
-/* END CSTYLED */
 
 /*
  * Minimum size which forces the dynamic allocator to change
-  * it's allocation strategy.  Once the space map cannot satisfy
+  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
 extern uint64_t metaslab_df_alloc_threshold;
 
-/* BEGIN CSTYLED */
 SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold,
 	CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0,
 	"Minimum size which forces the dynamic allocator to change its"
 	" allocation strategy");
-/* END CSTYLED */
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 extern uint_t metaslab_df_free_pct;
 
-/* BEGIN CSTYLED */
 SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct,
 	CTLFLAG_RWTUN, &metaslab_df_free_pct, 0,
 	"The minimum free space, in percent, which must be available in a"
 	" space map to continue allocations in a first-fit fashion");
-/* END CSTYLED */
 
 /* mmp.c */
 
 int
 param_set_multihost_interval(SYSCTL_HANDLER_ARGS)
 {
 	int err;
 
 	err = sysctl_handle_64(oidp, &zfs_multihost_interval, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (spa_mode_global != SPA_MODE_UNINIT)
 		mmp_signal_all_threads();
 
 	return (0);
 }
 
 /* spa.c */
 
 extern int zfs_ccw_retry_interval;
 
-/* BEGIN CSTYLED */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval,
 	CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0,
 	"Configuration cache file write, retry after failure, interval"
 	" (seconds)");
-/* END CSTYLED */
 
 extern uint64_t zfs_max_missing_tvds_cachefile;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile,
 	CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0,
 	"Allow importing pools with missing top-level vdevs in cache file");
-/* END CSTYLED */
 
 extern uint64_t zfs_max_missing_tvds_scan;
 
-/* BEGIN CSTYLED */
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan,
 	CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0,
 	"Allow importing pools with missing top-level vdevs during scan");
-/* END CSTYLED */
 
 /* spa_misc.c */
 
 extern int zfs_flags;
 
 static int
 sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
 {
 	int err, val;
 
 	val = zfs_flags;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	/*
 	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
 	 * arc buffers in the system have the necessary additional
 	 * checksum data.  However, it is safe to disable at any
 	 * time.
 	 */
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		val &= ~ZFS_DEBUG_MODIFY;
 	zfs_flags = val;
 
 	return (0);
 }
 
-/* BEGIN CSTYLED */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
 	CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0,
 	sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
-/* END CSTYLED */
 
 int
 param_set_deadman_synctime(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long val;
 	int err;
 
 	val = zfs_deadman_synctime_ms;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 	zfs_deadman_synctime_ms = val;
 
 	spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
 
 	return (0);
 }
 
 int
 param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long val;
 	int err;
 
 	val = zfs_deadman_ziotime_ms;
 	err = sysctl_handle_64(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 	zfs_deadman_ziotime_ms = val;
 
-	spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms));
+	spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
 
 	return (0);
 }
 
 /*
  * Sysctl handler for the deadman failmode string ("wait", "continue" or
  * "panic").
  *
  * Returns 0 when the value is unchanged or accepted, the error from
  * sysctl_handle_string() if the request itself fails, or the negated
  * result of param_set_deadman_failmode_common() otherwise.
  */
 int
 param_set_deadman_failmode(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16];
 	int rc;
 
 	/*
 	 * Always seed buf with the current setting.  sysctl_handle_string()
 	 * reads the buffer to report the old value even when a new value is
 	 * being written, so it must never see uninitialized stack memory.
 	 */
 	strlcpy(buf, zfs_deadman_failmode, sizeof (buf));
 
 	rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (rc || req->newptr == NULL)
 		return (rc);
 	if (strcmp(buf, zfs_deadman_failmode) == 0)
 		return (0);
 
 	/*
 	 * zfs_deadman_failmode must point at storage that outlives this
 	 * function, so map the request onto a string literal instead of buf.
 	 */
 	if (strcmp(buf, "wait") == 0)
 		zfs_deadman_failmode = "wait";
 	if (strcmp(buf, "continue") == 0)
 		zfs_deadman_failmode = "continue";
 	if (strcmp(buf, "panic") == 0)
 		zfs_deadman_failmode = "panic";
 
 	return (-param_set_deadman_failmode_common(buf));
 }
 
 int
 param_set_slop_shift(SYSCTL_HANDLER_ARGS)
 {
 	int val;
 	int err;
 
 	val = spa_slop_shift;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
 	if (val < 1 || val > 31)
 		return (EINVAL);
 
 	spa_slop_shift = val;
 
 	return (0);
 }
 
 /* spacemap.c */
 
 extern int space_map_ibs;
 
-/* BEGIN CSTYLED */
 /*
  * Publish space_map_ibs under the _vfs_zfs node as a read-write int that
  * may also be set as a loader tunable.
  */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
 	&space_map_ibs, 0, "Space map indirect block shift");
-/* END CSTYLED */
 
 
 /* vdev.c */
 
 int
 param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
 	int val;
 	int err;
 
 	val = zfs_vdev_min_auto_ashift;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (SET_ERROR(err));
 
 	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
 		return (SET_ERROR(EINVAL));
 
 	zfs_vdev_min_auto_ashift = val;
 
 	return (0);
 }
 
-/* BEGIN CSTYLED */
 /*
  * Legacy "min_auto_ashift" node under _vfs_zfs; updates are validated by
  * param_set_min_auto_ashift().
  */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
 	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
 	&zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
 	param_set_min_auto_ashift, "IU",
 	"Min ashift used when creating new top-level vdev. (LEGACY)");
-/* END CSTYLED */
 
 int
 param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
 {
 	int val;
 	int err;
 
 	val = zfs_vdev_max_auto_ashift;
 	err = sysctl_handle_int(oidp, &val, 0, req);
 	if (err != 0 || req->newptr == NULL)
 		return (SET_ERROR(err));
 
 	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
 		return (SET_ERROR(EINVAL));
 
 	zfs_vdev_max_auto_ashift = val;
 
 	return (0);
 }
 
-/* BEGIN CSTYLED */
 /*
  * Legacy "max_auto_ashift" node under _vfs_zfs; updates are validated by
  * param_set_max_auto_ashift().
  */
 SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
 	CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
 	&zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
 	param_set_max_auto_ashift, "IU",
 	"Max ashift used when optimizing for logical -> physical sector size on"
 	" new top-level vdevs. (LEGACY)");
-/* END CSTYLED */
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
  */
 extern int zfs_vdev_dtl_sm_blksz;
 
-/* BEGIN CSTYLED */
 /*
  * Publish zfs_vdev_dtl_sm_blksz under _vfs_zfs.  CTLFLAG_RDTUN makes it
  * read-only at runtime; it can only be set as a loader tunable.
  */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz,
 	CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0,
 	"Block size for DTL space map.  Power of 2 greater than 4096.");
-/* END CSTYLED */
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
  * the end of each transaction can benefit from a higher I/O bandwidth
  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
  */
 extern int zfs_vdev_standard_sm_blksz;
 
-/* BEGIN CSTYLED */
 /*
  * Publish zfs_vdev_standard_sm_blksz under _vfs_zfs.  CTLFLAG_RDTUN makes
  * it read-only at runtime; it can only be set as a loader tunable.
  */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz,
 	CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0,
 	"Block size for standard space map.  Power of 2 greater than 4096.");
-/* END CSTYLED */
 
 extern int vdev_validate_skip;
 
-/* BEGIN CSTYLED */
 /*
  * Publish vdev_validate_skip under _vfs_zfs.  CTLFLAG_RDTUN makes it
  * read-only at runtime; it can only be set as a loader tunable.
  */
 SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip,
 	CTLFLAG_RDTUN, &vdev_validate_skip, 0,
 	"Enable to bypass vdev_validate().");
-/* END CSTYLED */
 
 /* vdev_mirror.c */
 
 /* vdev_queue.c */
 
 extern uint_t zfs_vdev_max_active;
 
-/* BEGIN CSTYLED */
 /*
  * Legacy name: "top_maxinflight" is backed by zfs_vdev_max_active, so
  * writes to this node change that variable directly.
  */
 SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight,
 	CTLFLAG_RWTUN, &zfs_vdev_max_active, 0,
 	"The maximum number of I/Os of all types active for each device."
 	" (LEGACY)");
-/* END CSTYLED */
 
 /* zio.c */
 
-/* BEGIN CSTYLED */
 /*
  * Publish zio_exclude_metadata under the _vfs_zfs_zio node.  CTLFLAG_RDTUN
  * makes it read-only at runtime; it can only be set as a loader tunable.
  */
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata,
 	CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
 	"Exclude metadata buffers from dumps as well");
-/* END CSTYLED */
diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c
index feaca93fb933..195ac58f6f1a 100644
--- a/module/os/freebsd/zfs/zio_crypt.c
+++ b/module/os/freebsd/zfs/zio_crypt.c
@@ -1,1830 +1,1829 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2017, Datto, Inc. All rights reserved.
  */
 
 #include <sys/zio_crypt.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sha2.h>
 #include <sys/hkdf.h>
 
 /*
  * This file is responsible for handling all of the details of generating
  * encryption parameters and performing encryption and authentication.
  *
  * BLOCK ENCRYPTION PARAMETERS:
  * Encryption /Authentication Algorithm Suite (crypt):
  * The encryption algorithm, mode, and key length we are going to use. We
  * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
  * keys. All authentication is currently done with SHA512-HMAC.
  *
  * Plaintext:
  * The unencrypted data that we want to encrypt.
  *
  * Initialization Vector (IV):
  * An initialization vector for the encryption algorithms. This is used to
  * "tweak" the encryption algorithms so that two blocks of the same data are
  * encrypted into different ciphertext outputs, thus obfuscating block patterns.
  * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
  * never reused with the same encryption key. This value is stored unencrypted
  * and must simply be provided to the decryption function. We use a 96 bit IV
  * (as recommended by NIST) for all block encryption. For non-dedup blocks we
  * derive the IV randomly. The first 64 bits of the IV are stored in the second
  * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
  * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
  * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
  * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
  * level 0 blocks is the number of allocated dnodes in that block. The on-disk
  * format supports at most 2^15 slots per L0 dnode block, because the maximum
  * block size is 16MB (2^24). In either case, for level 0 blocks this number
  * will still be smaller than UINT32_MAX so it is safe to store the IV in the
  * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
  * for the dnode code.
  *
  * Master key:
  * This is the most important secret data of an encrypted dataset. It is used
  * along with the salt to generate the actual encryption keys via HKDF. We
  * do not use the master key to directly encrypt any data because there are
  * theoretical limits on how much data can actually be safely encrypted with
  * any encryption mode. The master key is stored encrypted on disk with the
  * user's wrapping key. Its length is determined by the encryption algorithm.
  * For details on how this is stored see the block comment in dsl_crypt.c
  *
  * Salt:
  * Used as an input to the HKDF function, along with the master key. We use a
  * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
  * can be used for encrypting many blocks, so we cache the current salt and the
  * associated derived key in zio_crypt_t so we do not need to derive it again
  * needlessly.
  *
  * Encryption Key:
  * A secret binary key, generated from an HKDF function used to encrypt and
  * decrypt data.
  *
  * Message Authentication Code (MAC)
  * The MAC is an output of authenticated encryption modes such as AES-GCM and
  * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
  * data on disk and return garbage to the application. Effectively, it is a
  * checksum that can not be reproduced by an attacker. We store the MAC in the
  * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
  * regular checksum of the ciphertext which can be used for scrubbing.
  *
  * OBJECT AUTHENTICATION:
  * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
  * they contain some info that always needs to be readable. To prevent this
  * data from being altered, we authenticate this data using SHA512-HMAC. This
  * will produce a MAC (similar to the one produced via encryption) which can
  * be used to verify the object was not modified. HMACs do not require key
  * rotation or IVs, so we can keep up to the full 3 copies of authenticated
  * data.
  *
  * ZIL ENCRYPTION:
  * ZIL blocks have their bp written to disk ahead of the associated data, so we
  * cannot store the MAC there as we normally do. For these blocks the MAC is
  * stored in the embedded checksum within the zil_chain_t header. The salt and
  * IV are generated for the block on bp allocation instead of at encryption
  * time. In addition, ZIL blocks have some pieces that must be left in plaintext
  * for claiming even though all of the sensitive user data still needs to be
  * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
  * pieces of the block need to be encrypted. All data that is not encrypted is
  * authenticated using the AAD mechanisms that the supported encryption modes
  * provide for. In order to preserve the semantics of the ZIL for encrypted
  * datasets, the ZIL is not protected at the objset level as described below.
  *
  * DNODE ENCRYPTION:
  * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
  * in plaintext for scrubbing and claiming, but the bonus buffers might contain
  * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
  * which pieces of the block need to be encrypted. For more details about
  * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
  *
  * OBJECT SET AUTHENTICATION:
  * Up to this point, everything we have encrypted and authenticated has been
  * at level 0 (or -2 for the ZIL). If we did not do any further work the
  * on-disk format would be susceptible to attacks that deleted or rearranged
  * the order of level 0 blocks. Ideally, the cleanest solution would be to
  * maintain a tree of authentication MACs going up the bp tree. However, this
  * presents a problem for raw sends. Send files do not send information about
  * indirect blocks so there would be no convenient way to transfer the MACs and
  * they cannot be recalculated on the receive side without the master key which
  * would defeat one of the purposes of raw sends in the first place. Instead,
  * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
  * from the level below. We also include some portable fields from blk_prop such
  * as the lsize and compression algorithm to prevent the data from being
  * misinterpreted.
  *
  * At the objset level, we maintain 2 separate 256 bit MACs in the
  * objset_phys_t. The first one is "portable" and is the logical root of the
  * MAC tree maintained in the metadnode's bps. The second, is "local" and is
  * used as the root MAC for the user accounting objects, which are also not
  * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
  * of the send file. The useraccounting code ensures that the useraccounting
  * info is not present upon a receive, so the local MAC can simply be cleared
  * out at that time. For more info about objset_phys_t authentication, see
  * zio_crypt_do_objset_hmacs().
  *
  * CONSIDERATIONS FOR DEDUP:
  * In order for dedup to work, blocks that we want to dedup with one another
  * need to use the same IV and encryption key, so that they will have the same
  * ciphertext. Normally, one should never reuse an IV with the same encryption
  * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
  * blocks. In this case, however, since we are using the same plaintext as
  * well all that we end up with is a duplicate of the original ciphertext we
  * already had. As a result, an attacker with read access to the raw disk will
  * be able to tell which blocks are the same but this information is given away
  * by dedup anyway. In order to get the same IVs and encryption keys for
  * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
  * here so that a reproducible checksum of the plaintext is never available to
  * the attacker. The HMAC key is kept alongside the master key, encrypted on
  * disk. The first 64 bits of the HMAC are used in place of the random salt, and
  * the next 96 bits are used as the IV. As a result of this mechanism, dedup
  * will only work within a clone family since encrypted dedup requires use of
  * the same master and HMAC keys.
  */
 
 /*
  * After encrypting many blocks with the same key we may start to run up
  * against the theoretical limits of how much data can securely be encrypted
  * with a single key using the supported encryption modes. The most obvious
  * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
  * the more IVs we generate (which both GCM and CCM modes strictly forbid).
  * This risk actually grows surprisingly quickly over time according to the
  * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
  * generated n IVs with a cryptographically secure RNG, the approximate
  * probability p(n) of a collision is given as:
  *
  * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
  *
  * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
  *
  * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
  * we must not write more than 398,065,730 blocks with the same encryption key.
  * Therefore, we rotate our keys after 400,000,000 blocks have been written by
  * generating a new random 64 bit salt for our HKDF encryption key generation
  * function.
  */
 /*
  * ZFS_KEY_MAX_SALT_USES_DEFAULT is both the initial value of
  * zfs_key_max_salt_uses and its upper bound: ZFS_CURRENT_MAX_SALT_USES
  * takes the smaller of the two, so the variable can lower the rotation
  * threshold but never raise it above the default.
  */
 #define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
 #define	ZFS_CURRENT_MAX_SALT_USES	\
 	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
 static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
 
 /*
  * Fixed-layout record gathering the parts of a block pointer that take
  * part in authentication: the portable subset of blk_prop and the MAC.
  * NOTE(review): the code that fills and hashes this record lies outside
  * this excerpt -- confirm usage there.
  */
 typedef struct blkptr_auth_buf {
 	uint64_t bab_prop;			/* blk_prop - portable mask */
 	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
 	uint64_t bab_pad;			/* reserved for future use */
 } blkptr_auth_buf_t;
 
 /*
  * Per-suite parameters, looked up in this file as zio_crypt_table[crypt].
  * Each row holds: a mechanism name, the cipher mode (ZC_TYPE_*), the key
  * length in bytes, and a name string (presumably the user-visible property
  * value -- confirm against the zio_crypt_info_t definition).  The first
  * three rows are ZC_TYPE_NONE placeholders with a zero key length.
  */
 const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
 	{"",			ZC_TYPE_NONE,	0,	"inherit"},
 	{"",			ZC_TYPE_NONE,	0,	"on"},
 	{"",			ZC_TYPE_NONE,	0,	"off"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	16,	"aes-128-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	24,	"aes-192-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	32,	"aes-256-ccm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	16,	"aes-128-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	24,	"aes-192-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	32,	"aes-256-gcm"}
 };
 
 /*
  * Tear down a key without releasing its crypto session: destroys the salt
  * lock and wipes the whole structure.  Used on the error paths of the
  * init/unwrap routines and as the second half of zio_crypt_key_destroy().
  */
 static void
 zio_crypt_key_destroy_early(zio_crypt_key_t *key)
 {
 	rw_destroy(&key->zk_salt_lock);
 
 	/*
 	 * Clear the session descriptor.  Note this only zeroes the field;
 	 * it does not call freebsd_crypt_freesession().
 	 */
 	memset(&key->zk_session, 0, sizeof (key->zk_session));
 
 	/* zero out sensitive data (this also covers zk_session again) */
 	memset(key, 0, sizeof (zio_crypt_key_t));
 }
 
 /*
  * Fully destroy a key: release its crypto session first, then destroy the
  * salt lock and wipe the structure via zio_crypt_key_destroy_early().
  */
 void
 zio_crypt_key_destroy(zio_crypt_key_t *key)
 {
 
 	freebsd_crypt_freesession(&key->zk_session);
 	zio_crypt_key_destroy_early(key);
 }
 
 /*
  * Initialize *key as a brand new key for the encryption suite `crypt`.
  *
  * Fills the guid, master key, HMAC key and salt with random bytes, derives
  * the current encryption key from the master key and salt via HKDF, and
  * opens a crypto session for it.
  *
  * Returns 0 on success, ENOTSUP if the suite is neither GCM nor CCM, or
  * the error reported by the failing step.  On any failure after the lock
  * has been initialized the key is torn down and zeroed before returning.
  */
 int
 zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
 {
 	int ret;
 	uint_t keydata_len;
 	const zio_crypt_info_t *ci;
 
 	ASSERT3P(key, !=, NULL);
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	/* Reject unsupported suites before touching *key. */
 	ci = &zio_crypt_table[crypt];
 	if (ci->ci_crypt_type != ZC_TYPE_GCM &&
 	    ci->ci_crypt_type != ZC_TYPE_CCM)
 		return (ENOTSUP);
 
 	keydata_len = ci->ci_keylen;
 	memset(key, 0, sizeof (zio_crypt_key_t));
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	/* fill keydata buffers and salt with random data */
 	ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_master_keydata, keydata_len);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	/* derive the current key from the master key */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 	    keydata_len);
 	if (ret != 0)
 		goto error;
 
 	/* point the key descriptors at their backing key material */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	/*
 	 * The HMAC key descriptor must reference the HMAC key bytes, exactly
 	 * as zio_crypt_key_unwrap() sets it up.  Pointing it at the
 	 * descriptor itself would make a freshly created key compute
 	 * different HMACs than the same key after it has been unwrapped.
 	 */
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	ret = freebsd_crypt_newsession(&key->zk_session, ci,
 	    &key->zk_current_key);
 	if (ret != 0)
 		goto error;
 
 	key->zk_crypt = crypt;
 	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 error:
 	zio_crypt_key_destroy_early(key);
 	return (ret);
 }
 
 /*
  * Rotate the salt of `key`: draw a new random salt, re-derive the current
  * encryption key from the master key with it, and replace the crypto
  * session.  If another thread already rotated the salt (the use count is
  * back under the limit) nothing is changed.
  *
  * Returns 0 on success or when no rotation was needed, otherwise the error
  * from the step that failed.
  */
 static int
 zio_crypt_key_change_salt(zio_crypt_key_t *key)
 {
 	uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
 	uint8_t new_salt[ZIO_DATA_SALT_LEN];
 	int ret;
 
 	/* Draw the candidate salt before taking the lock. */
 	ret = random_get_bytes(new_salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		return (ret);
 
 	rw_enter(&key->zk_salt_lock, RW_WRITER);
 
 	/* Only rotate if nobody beat us to it. */
 	if (key->zk_salt_count >= ZFS_CURRENT_MAX_SALT_USES) {
 		/* derive the current key from the master key and new salt */
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len,
 		    NULL, 0, new_salt, ZIO_DATA_SALT_LEN,
 		    key->zk_current_keydata, keydata_len);
 		if (ret == 0) {
 			/* install the salt and restart the usage count */
 			memcpy(key->zk_salt, new_salt, ZIO_DATA_SALT_LEN);
 			key->zk_salt_count = 0;
 
 			/* the session is bound to the old key; replace it */
 			freebsd_crypt_freesession(&key->zk_session);
 			ret = freebsd_crypt_newsession(&key->zk_session,
 			    &zio_crypt_table[key->zk_crypt],
 			    &key->zk_current_key);
 		}
 	}
 
 	rw_exit(&key->zk_salt_lock);
 
 	return (ret);
 }
 
 /*
  * Copy the key's current salt into `salt` and count the use.  Once the use
  * count reaches ZFS_CURRENT_MAX_SALT_USES a salt rotation is triggered; see
  * the comment above the zfs_key_max_salt_uses definition for the rationale.
  *
  * Returns 0 on success, or the error from zio_crypt_key_change_salt().
  */
 int
 zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
 {
 	boolean_t rotate;
 
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
 	rotate = (atomic_inc_64_nv(&key->zk_salt_count) >=
 	    ZFS_CURRENT_MAX_SALT_USES);
 	rw_exit(&key->zk_salt_lock);
 
 	if (!rotate)
 		return (0);
 
 	return (zio_crypt_key_change_salt(key));
 }
 
 /*
  * NOTE(review): neither global is referenced in the visible part of this
  * file; presumably debugging aids -- confirm before relying on them.
  */
 void *failed_decrypt_buf;
 int failed_decrypt_size;
 
 /*
  * This function handles all encryption and decryption in zfs. When
  * encrypting it expects puio to reference the plaintext and cuio to
  * reference the ciphertext. cuio must have enough space for the
  * ciphertext + room for a MAC. datalen should be the length of the
  * plaintext / ciphertext alone.
  */
 /*
  * FreeBSD OpenCrypto (FOC) implementation.
  *
  * Unlike the ICP, FOC works on a single buffer that serves as both input
  * and output.  Callers therefore copy the source into the destination up
  * front, and the one uio handed in here must already carry the AAD in
  * front and the tag/MAC at the end.
  *
  * Because the auth data is part of the iovec array, only its length is
  * passed: auth_len == 0 means there is no AAD.
  *
  * Returns 0 on success and ENOTSUP for a suite that is neither GCM nor
  * CCM.  Any failure from freebsd_crypt_uio() is reported as EIO when
  * encrypting and ECKSUM when decrypting.
  */
 static int
 zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess,
     uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen,
     zfs_uio_t *uio, uint_t auth_len)
 {
 	const zio_crypt_info_t *ci = &zio_crypt_table[crypt];
 	int ret;
 
 	switch (ci->ci_crypt_type) {
 	case ZC_TYPE_GCM:
 	case ZC_TYPE_CCM:
 		break;
 	default:
 		return (ENOTSUP);
 	}
 
 	ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf,
 	    datalen, auth_len);
 	if (ret == 0)
 		return (0);
 
 #ifdef FCRYPTO_DEBUG
 	printf("%s(%d):  Returning error %s\n",
 	    __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM");
 #endif
 	return (SET_ERROR(encrypt ? EIO : ECKSUM));
 }
 
 /*
  * Encrypt ("wrap") the master and HMAC keys of `key` with the wrapping key
  * `cwkey`.
  *
  * A fresh IV is written to `iv`, the wrapped key material to `keydata_out`
  * and `hmac_keydata_out`, and the authentication tag to `mac`.  The guid
  * (plus suite and version for current-format keys) is bound in as AAD.
  *
  * Returns 0 on success or the error from the failing step.
  */
 int
 zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
     uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
 {
 	uint64_t crypt = key->zk_crypt;
 	uint64_t aad[3];
 	uint_t aad_len, enc_len, keydata_len;
 	/*
 	 * With OpenCrypto in FreeBSD, the same buffer is used for
 	 * input and output.  Also, the AAD (for AES-GCM at least)
 	 * needs to logically go in front.
 	 */
 	iovec_t iovecs[4];
 	struct uio cuio_s;
 	zfs_uio_t cuio;
 	int ret;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	zfs_uio_init(&cuio, &cuio_s);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
 
 	/* generate iv for wrapping the master and hmac key */
 	ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
 	if (ret != 0)
 		return (ret);
 
 	/*
 	 * Only one buffer is supported, so stage the plaintext keys in the
 	 * output buffers; they are encrypted in place below.
 	 */
 	memcpy(keydata_out, key->zk_master_keydata, keydata_len);
 	memcpy(hmac_keydata_out, key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
 
 	/*
 	 * Although we don't support writing to the old format, we do
 	 * support rewrapping the key so that the user can move and
 	 * quarantine datasets on the old format.  Version 0 authenticates
 	 * the guid only; the current version adds the suite and version.
 	 */
 	aad[0] = LE_64(key->zk_guid);
 	if (key->zk_version == 0) {
 		aad_len = sizeof (uint64_t);
 	} else {
 		ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(key->zk_version);
 		aad_len = 3 * sizeof (uint64_t);
 	}
 
 	/* Layout: AAD | master key | HMAC key | MAC */
 	iovecs[0].iov_base = aad;
 	iovecs[0].iov_len = aad_len;
 	iovecs[1].iov_base = keydata_out;
 	iovecs[1].iov_len = keydata_len;
 	iovecs[2].iov_base = hmac_keydata_out;
 	iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
 	iovecs[3].iov_base = mac;
 	iovecs[3].iov_len = WRAPPING_MAC_LEN;
 
 	GET_UIO_STRUCT(&cuio)->uio_iov = iovecs;
 	zfs_uio_iovcnt(&cuio) = 4;
 	zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
 
 	/* encrypt the keys and store the resulting ciphertext and mac */
 	return (zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey,
 	    iv, enc_len, &cuio, aad_len));
 }
 
 /*
  * Decrypt ("unwrap") previously wrapped key material into *key.
  *
  * `keydata` and `hmac_keydata` hold the wrapped master and HMAC keys, `iv`
  * and `mac` the IV and tag stored with them, and `crypt`, `version` and
  * `guid` the values bound in as AAD.  On success *key has its lock
  * initialized, a fresh random salt, a derived current key and an open
  * crypto session.
  *
  * Returns 0 on success.  On failure the key is torn down and zeroed and
  * the error from the failing step is returned.
  */
 int
 zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
     uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
     uint8_t *mac, zio_crypt_key_t *key)
 {
 	uint64_t aad[3];
 	uint_t aad_len, enc_len, keydata_len;
 	/*
 	 * With OpenCrypto in FreeBSD, the same buffer is used for
 	 * input and output.  Also, the AAD (for AES-GCM at least)
 	 * needs to logically go in front.
 	 */
 	iovec_t iovecs[4];
 	struct uio cuio_s;
 	zfs_uio_t cuio;
 	int ret;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	zfs_uio_init(&cuio, &cuio_s);
 
 	/*
 	 * Only one buffer is supported, so stage the wrapped keys in their
 	 * final location inside *key; they are decrypted in place below.
 	 */
 	memcpy(key->zk_master_keydata, keydata, keydata_len);
 	memcpy(key->zk_hmac_keydata, hmac_keydata, SHA512_HMAC_KEYLEN);
 
 	/* Version 0 authenticates the guid only. */
 	aad[0] = LE_64(guid);
 	if (version == 0) {
 		aad_len = sizeof (uint64_t);
 	} else {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(version);
 		aad_len = 3 * sizeof (uint64_t);
 	}
 
 	/* Layout: AAD | master key | HMAC key | MAC */
 	iovecs[0].iov_base = aad;
 	iovecs[0].iov_len = aad_len;
 	iovecs[1].iov_base = key->zk_master_keydata;
 	iovecs[1].iov_len = keydata_len;
 	iovecs[2].iov_base = key->zk_hmac_keydata;
 	iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
 	iovecs[3].iov_base = mac;
 	iovecs[3].iov_len = WRAPPING_MAC_LEN;
 
 	GET_UIO_STRUCT(&cuio)->uio_iov = iovecs;
 	zfs_uio_iovcnt(&cuio) = 4;
 	zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
 
 	/* decrypt the keys and store the result in the output buffers */
 	ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey,
 	    iv, enc_len, &cuio, aad_len);
 
 	/* generate a fresh salt */
 	if (ret == 0)
 		ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 
 	/* derive the current key from the master key */
 	if (ret == 0) {
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len,
 		    NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN,
 		    key->zk_current_keydata, keydata_len);
 	}
 	if (ret != 0)
 		goto error;
 
 	/* point the key descriptors at their backing key material */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	ret = freebsd_crypt_newsession(&key->zk_session,
 	    &zio_crypt_table[crypt], &key->zk_current_key);
 	if (ret != 0)
 		goto error;
 
 	key->zk_crypt = crypt;
 	key->zk_version = version;
 	key->zk_guid = guid;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 error:
 	zio_crypt_key_destroy_early(key);
 	return (ret);
 }
 
 /*
  * Fill `ivbuf` with ZIO_DATA_IV_LEN random bytes.  If the generator fails
  * the buffer is zeroed and the generator's error is returned.
  */
 int
 zio_crypt_generate_iv(uint8_t *ivbuf)
 {
 	int ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
 
 	/* never hand back a partially filled IV */
 	if (ret != 0)
 		memset(ivbuf, 0, ZIO_DATA_IV_LEN);
 
 	return (ret);
 }
 
 /*
  * Compute the MAC of `data` using the key's HMAC key and copy the first
  * `digestlen` bytes of it into `digestbuf`.  `digestlen` must not exceed
  * SHA512_DIGEST_LENGTH.  Always returns 0.
  */
 int
 zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
     uint8_t *digestbuf, uint_t digestlen)
 {
 	uint8_t full_digest[SHA512_DIGEST_LENGTH];
 
 	ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
 
 	/* always produce the full digest, then truncate for the caller */
 	crypto_mac(&key->zk_hmac_key, data, datalen,
 	    full_digest, SHA512_DIGEST_LENGTH);
 	memcpy(digestbuf, full_digest, digestlen);
 
 	return (0);
 }
 
 /*
  * Derive a deterministic salt and IV from the HMAC of `data`: the first
  * ZIO_DATA_SALT_LEN digest bytes become the salt and the following
  * ZIO_DATA_IV_LEN bytes become the IV.
  *
  * Returns 0 on success or the error from zio_crypt_do_hmac().
  */
 int
 zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
     uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
 {
 	uint8_t digest[SHA512_DIGEST_LENGTH];
 	const uint8_t *salt_part = digest;
 	const uint8_t *iv_part = digest + ZIO_DATA_SALT_LEN;
 	int ret;
 
 	ret = zio_crypt_do_hmac(key, data, datalen,
 	    digest, SHA512_DIGEST_LENGTH);
 	if (ret != 0)
 		return (ret);
 
 	memcpy(salt, salt_part, ZIO_DATA_SALT_LEN);
 	memcpy(ivbuf, iv_part, ZIO_DATA_IV_LEN);
 
 	return (0);
 }
 
 /*
  * The following functions are used to encode and decode encryption parameters
  * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
  * byte strings, which normally means that these strings would not need to deal
  * with byteswapping at all. However, both blkptr_t and zil_header_t may be
  * byteswapped by lower layers and so we must "undo" that byteswap here upon
  * decoding and encoding in a non-native byteorder. These functions require
  * that the byteorder bit is correct before being called.
  */
 /*
  * Store the salt and IV in an encrypted bp: the salt goes in
  * blk_dva[2].dva_word[0], the first 8 IV bytes in dva_word[1], and the
  * remaining 4 IV bytes are set through BP_SET_IV2().  Each word is
  * byteswapped first when the bp is in non-native byte order.
  */
 void
 zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt64, iv64;
 	uint32_t iv32;
 
 	ASSERT(BP_IS_ENCRYPTED(bp));
 
 	/* pull the byte strings into integers without aliasing casts */
 	memcpy(&salt64, salt, sizeof (uint64_t));
 	memcpy(&iv64, iv, sizeof (uint64_t));
 	memcpy(&iv32, iv + sizeof (uint64_t), sizeof (uint32_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt64 = BSWAP_64(salt64);
 		iv64 = BSWAP_64(iv64);
 		iv32 = BSWAP_32(iv32);
 	}
 
 	bp->blk_dva[2].dva_word[0] = salt64;
 	bp->blk_dva[2].dva_word[1] = iv64;
 	BP_SET_IV2(bp, iv32);
 }
 
 /*
  * Extract the salt and IV from a protected bp (the inverse of
  * zio_crypt_encode_params_bp()).  For a bp that is only authenticated,
  * both outputs are zero-filled.
  */
 void
 zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt64, iv64;
 	uint32_t iv32;
 
 	ASSERT(BP_IS_PROTECTED(bp));
 
 	/* for convenience, so callers don't need to check */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		memset(salt, 0, ZIO_DATA_SALT_LEN);
 		memset(iv, 0, ZIO_DATA_IV_LEN);
 		return;
 	}
 
 	salt64 = bp->blk_dva[2].dva_word[0];
 	iv64 = bp->blk_dva[2].dva_word[1];
 	iv32 = (uint32_t)BP_GET_IV2(bp);
 
 	/* undo the byteswap applied to a non-native bp */
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt64 = BSWAP_64(salt64);
 		iv64 = BSWAP_64(iv64);
 		iv32 = BSWAP_32(iv32);
 	}
 
 	memcpy(salt, &salt64, sizeof (uint64_t));
 	memcpy(iv, &iv64, sizeof (uint64_t));
 	memcpy(iv + sizeof (uint64_t), &iv32, sizeof (uint32_t));
 }
 
 /*
  * Store a 16-byte MAC in blk_cksum.zc_word[2] and zc_word[3] of the bp,
  * byteswapping each word when the bp is in non-native byte order.  Must
  * not be used on DMU_OT_OBJSET block pointers.
  */
 void
 zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t mac_lo, mac_hi;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
 
 	memcpy(&mac_lo, mac, sizeof (uint64_t));
 	memcpy(&mac_hi, mac + sizeof (uint64_t), sizeof (uint64_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		mac_lo = BSWAP_64(mac_lo);
 		mac_hi = BSWAP_64(mac_hi);
 	}
 
 	bp->blk_cksum.zc_word[2] = mac_lo;
 	bp->blk_cksum.zc_word[3] = mac_hi;
 }
 
 /*
  * Read the 16-byte MAC back out of blk_cksum.zc_word[2] and zc_word[3]
  * (the inverse of zio_crypt_encode_mac_bp()).  For DMU_OT_OBJSET block
  * pointers the output is zero-filled instead.
  */
 void
 zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t mac_lo, mac_hi;
 
 	ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
 
 	/* for convenience, so callers don't need to check */
 	if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		memset(mac, 0, ZIO_DATA_MAC_LEN);
 		return;
 	}
 
 	mac_lo = bp->blk_cksum.zc_word[2];
 	mac_hi = bp->blk_cksum.zc_word[3];
 
 	/* undo the byteswap applied to a non-native bp */
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		mac_lo = BSWAP_64(mac_lo);
 		mac_hi = BSWAP_64(mac_hi);
 	}
 
 	memcpy(mac, &mac_lo, sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &mac_hi, sizeof (uint64_t));
 }
 
 /*
  * Store a 16-byte MAC in words 2 and 3 of the embedded checksum of the
  * zil_chain_t at the start of `data`.  No byteswapping is performed.
  */
 void
 zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
 {
 	zil_chain_t *chain = data;
 	uint64_t *mac_words = &chain->zc_eck.zec_cksum.zc_word[2];
 
 	memcpy(&mac_words[0], mac, sizeof (uint64_t));
 	memcpy(&mac_words[1], mac + sizeof (uint64_t), sizeof (uint64_t));
 }
 
 /*
  * Read the 16-byte MAC out of words 2 and 3 of the embedded checksum of
  * the zil_chain_t at the start of `data`.
  */
 void
 zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
 {
 	const zil_chain_t *chain = data;
 	const uint64_t *mac_words = &chain->zc_eck.zec_cksum.zc_word[2];
 
 	/*
 	 * The ZIL MAC is embedded in the block it protects, and that block
 	 * has not been byteswapped by the time we get here, so the MAC can
 	 * be copied out as-is.
 	 */
 	memcpy(mac, &mac_words[0], sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &mac_words[1], sizeof (uint64_t));
 }
 
 /*
  * Walk a block of dnodes (src_abd) and copy only the bonus buffers of
  * dnodes with an encrypted bonus type to the same offsets in dst.
  * datalen is the size of both src_abd and dst (the whole dnode block,
  * not just the bonus buffers).
  */
 void
 zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
 {
 	uint_t slot = 0, nslots = datalen >> DNODE_SHIFT;
 	dnode_phys_t *src_dnodes, *dst_dnodes, *dnp;
 	uint8_t *src;
 
 	src = abd_borrow_buf_copy(src_abd, datalen);
 
 	src_dnodes = (dnode_phys_t *)src;
 	dst_dnodes = (dnode_phys_t *)dst;
 
 	while (slot < nslots) {
 		dnp = &src_dnodes[slot];
 
 		if (dnp->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
 		    dnp->dn_bonuslen != 0) {
 			memcpy(DN_BONUS(&dst_dnodes[slot]), DN_BONUS(dnp),
 			    DN_MAX_BONUS_LEN(dnp));
 		}
 
 		/* a dnode may span extra slots; step over all of them */
 		slot += dnp->dn_extra_slots + 1;
 	}
 
 	abd_return_buf(src_abd, src, datalen);
 }
 
 /*
  * This function decides which fields of blk_prop are covered by the
  * various on-disk MAC algorithms: every field that must not be
  * authenticated is overwritten with a constant here. The set of fields
  * depends on the on-disk format version.
  */
 static void
 zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
 {
 	int avoidlint = SPA_MINBLOCKSIZE;
 
 	if (version != 0) {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 
 		/*
 		 * With the hole_birth feature a hole bp may still carry
 		 * values in these fields. Clear blk_prop entirely so raw
 		 * sends behave the same with or without that feature.
 		 */
 		if (BP_IS_HOLE(bp)) {
 			bp->blk_prop = 0ULL;
 			return;
 		}
 
 		/*
 		 * Level 0 keeps byteorder, compression and psize under the
 		 * MAC so that data blocks cannot be reinterpreted (e.g. an
 		 * attacker flipping the compression bits to have raw lz4
 		 * data returned to the user). Higher levels cannot enforce
 		 * this because raw sends carry no information about
 		 * indirect blocks, so the values may differ on the receive
 		 * side. This opens no new attack vector: any change to a
 		 * higher level bp must still verify the correct order of
 		 * the layer below it.
 		 */
 		if (BP_GET_LEVEL(bp) != 0) {
 			BP_SET_BYTEORDER(bp, 0);
 			BP_SET_COMPRESS(bp, 0);
 
 			/*
 			 * A psize of zero would trip asserts; any constant
 			 * value works.
 			 */
 			BP_SET_PSIZE(bp, avoidlint);
 		}
 
 		BP_SET_DEDUP(bp, 0);
 		BP_SET_CHECKSUM(bp, 0);
 		return;
 	}
 
 	/*
 	 * Version 0 did not zero out every non-portable field as it should
 	 * have. The old behavior is preserved so that pools written with
 	 * that version can still be imported read-only.
 	 */
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_CHECKSUM(bp, 0);
 	BP_SET_PSIZE(bp, avoidlint);
 }
 
 /*
  * Fill bab with the authenticated representation of bp: its MAC, its
  * portable blk_prop bits in little endian, and zeroed padding. The bp
  * itself is not modified; all work happens on a local copy, which is
  * byteswapped first when should_bswap is set. *bab_len receives the
  * number of bytes of bab that take part in authentication.
  */
 static void
 zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
     blkptr_auth_buf_t *bab, uint_t *bab_len)
 {
 	blkptr_t scratch = *bp;
 	uint_t len = sizeof (blkptr_auth_buf_t);
 
 	if (should_bswap)
 		byteswap_uint64_array(&scratch, sizeof (blkptr_t));
 
 	ASSERT(BP_USES_CRYPT(&scratch) || BP_IS_HOLE(&scratch));
 	ASSERT0(BP_IS_EMBEDDED(&scratch));
 
 	/*
 	 * The MAC has to be decoded before the non-portable bits are
 	 * cleared, because clearing them also wipes the endianness that
 	 * the decode depends on.
 	 */
 	zio_crypt_decode_mac_bp(&scratch, bab->bab_mac);
 
 	/* blk_prop is always MAC'd in LE to ensure portability */
 	zio_crypt_bp_zero_nonportable_blkprop(&scratch, version);
 	bab->bab_prop = LE_64(scratch.blk_prop);
 	bab->bab_pad = 0ULL;
 
 	/* version 0 did not include the padding */
 	if (version == 0)
 		len -= sizeof (uint64_t);
 	*bab_len = len;
 }
 
 /*
  * Feed the authenticated representation of bp (as produced by
  * zio_crypt_bp_auth_init()) into the HMAC context ctx. Always returns 0.
  */
 static int
 zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_bytes;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_bytes);
 	crypto_mac_update(ctx, &auth, auth_bytes);
 
 	return (0);
 }
 
 /*
  * Feed the authenticated representation of bp (as produced by
  * zio_crypt_bp_auth_init()) into the SHA2 context ctx.
  */
 static void
 zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_bytes;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_bytes);
 	SHA2Update(ctx, &auth, auth_bytes);
 }
 
 /*
  * Append the authenticated representation of bp (as produced by
  * zio_crypt_bp_auth_init()) to the AAD buffer at *aadp, then advance
  * both the write cursor *aadp and the running length *aad_len by the
  * number of bytes appended.
  */
 static void
 zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_bytes;
 	uint8_t *cursor = *aadp;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_bytes);
 	memcpy(cursor, &auth, auth_bytes);
 
 	*aadp = cursor + auth_bytes;
 	*aad_len += auth_bytes;
 }
 
 /*
  * Add one dnode to the HMAC context ctx: first the core of the dnode (all
  * bytes before dn_blkptr, with the non-portable flag bits and dn_used
  * masked out), then every block pointer in dn_blkptr, and finally the
  * spill block pointer when DNODE_FLAG_SPILL_BLKPTR is set. Returns 0 on
  * success or the first non-zero value returned by a bp update.
  */
 static int
 zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, dnode_phys_t *dnp)
 {
 	int err;
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	uint8_t core[offsetof(dnode_phys_t, dn_blkptr)];
 	dnode_phys_t *core_dnp = (dnode_phys_t *)core;
 
 	/* work on a private copy of the core so dnp stays untouched */
 	memcpy(core, dnp, sizeof (core));
 	if (le_bswap) {
 		core_dnp->dn_datablkszsec = BSWAP_16(core_dnp->dn_datablkszsec);
 		core_dnp->dn_bonuslen = BSWAP_16(core_dnp->dn_bonuslen);
 		core_dnp->dn_maxblkid = BSWAP_64(core_dnp->dn_maxblkid);
 		core_dnp->dn_used = BSWAP_64(core_dnp->dn_used);
 	}
 	core_dnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 	core_dnp->dn_used = 0;
 
 	crypto_mac_update(ctx, core_dnp, sizeof (core));
 
 	for (int b = 0; b < dnp->dn_nblkptr; b++) {
 		err = zio_crypt_bp_do_hmac_updates(ctx, version,
 		    should_bswap, &dnp->dn_blkptr[b]);
 		if (err != 0)
 			return (err);
 	}
 
 	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
 		return (0);
 
 	return (zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap,
 	    DN_SPILL_BLKPTR(dnp)));
 }
 
 /*
  * objset_phys_t blocks introduce a number of exceptions to the normal
  * authentication process. objset_phys_t's contain 2 separate HMACS for
  * protecting the integrity of their data. The portable_mac protects the
  * metadnode. This MAC can be sent with a raw send and protects against
  * reordering of data within the metadnode. The local_mac protects the user
  * accounting objects which are not sent from one system to another.
  *
  * In addition, objset blocks are the only blocks that can be modified and
  * written to disk without the key loaded under certain circumstances. During
  * zil_claim() we need to be able to update the zil_header_t to complete
  * claiming log blocks and during raw receives we need to write out the
  * portable_mac from the send file. Both of these actions are possible
  * because these fields are not protected by either MAC so neither one will
  * need to modify the MACs without the key. However, when the modified blocks
  * are written out they will be byteswapped into the host machine's native
  * endianness which will modify fields protected by the MAC. As a result, MAC
  * calculation for objset blocks works slightly differently from other block
  * types. Where other block types MAC the data in whatever endianness is
  * written to disk, objset blocks always MAC little endian version of their
  * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
  * and le_bswap indicates whether a byteswap is needed to get this block
  * into little endian format.
  *
  * Parameters:
  *   key          - supplies the HMAC key (zk_hmac_key) and the on-disk
  *                  format version (zk_version).
  *   data         - the objset_phys_t to authenticate.
  *   datalen      - size of data; compared against OBJSET_PHYS_SIZE_V1/V2/V3
  *                  to decide which accounting dnodes are considered.
  *   should_bswap - see above.
  *   portable_mac - output, receives ZIO_OBJSET_MAC_LEN bytes.
  *   local_mac    - output, receives ZIO_OBJSET_MAC_LEN bytes (all zero when
  *                  the local MAC does not apply).
  *
  * Returns 0 on success. If a dnode update fails, both output MACs are
  * zeroed and the error is returned.
  */
 int
 zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
     boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
 {
 	int ret;
 	struct hmac_ctx hash_ctx;
 	struct hmac_ctx *ctx = &hash_ctx;
 	objset_phys_t *osp = data;
 	uint64_t intval;
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	/* full-length digests; only the leading ZIO_OBJSET_MAC_LEN bytes are kept */
 	uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
 	uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
 
 
 	/* calculate the portable MAC from the portable fields and metadnode */
 	crypto_mac_init(ctx, &key->zk_hmac_key);
 
 	/* add in the os_type */
 	intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
 	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
 
 	/*
 	 * add in the portable os_flags: bring the flags to host order so the
 	 * mask can be applied, then swap again on hosts where
 	 * ZFS_HOST_BYTEORDER is zero before feeding them to the MAC
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
 
 	/* add in fields from the metadnode */
 	ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 	    should_bswap, &osp->os_meta_dnode);
 	if (ret)
 		goto error;
 
 	crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH);
 
 	/* truncate the digest to the stored MAC length */
 	memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN);
 
 	/*
 	 * This is necessary here as we check next whether
 	 * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to
 	 * decide if the local_mac should be zeroed out. That flag will always
 	 * be set by dmu_objset_id_quota_upgrade_cb() and
 	 * dmu_objset_userspace_upgrade_cb() if useraccounting has been
 	 * completed.
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	boolean_t uacct_incomplete =
 	    !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 
 	/*
 	 * The local MAC protects the user, group and project accounting.
 	 * If these objects are not present, the local MAC is zeroed out.
 	 */
 	if (uacct_incomplete ||
 	    (datalen >= OBJSET_PHYS_SIZE_V3 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
 		/* portable_mac has already been written above and is kept */
 		memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 		return (0);
 	}
 
 	/* calculate the local MAC from the userused and groupused dnodes */
 	crypto_mac_init(ctx, &key->zk_hmac_key);
 
 	/* add in the non-portable os_flags (complement of the portable mask) */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	crypto_mac_update(ctx, &intval, sizeof (uint64_t));
 
 	/* XXX check dnode type ... */
 	/* add in fields from the user accounting dnodes */
 	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_userused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_groupused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	/* the project dnode is only considered for V3-sized objsets */
 	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
 	    datalen >= OBJSET_PHYS_SIZE_V3) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_projectused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
 
 	memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN);
 
 	return (0);
 
 error:
 	/* never hand back a partially computed MAC */
 	memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN);
 	memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 	return (ret);
 }
 
 /*
  * Free the iovec array attached to uio, if there is one. The array size
  * is taken from the uio's iovec count.
  */
 static void
 zio_crypt_destroy_uio(zfs_uio_t *uio)
 {
 	if (GET_UIO_STRUCT(uio)->uio_iov == NULL)
 		return;
 
 	kmem_free(GET_UIO_STRUCT(uio)->uio_iov,
 	    zfs_uio_iovcnt(uio) * sizeof (iovec_t));
 }
 
 /*
  * Parse an uncompressed indirect block and compute a SHA-512 based
  * checksum over the portable fields of every bp it contains. The portable
  * fields are the MAC and all of blk_prop except the dedup, checksum and
  * psize bits. For an explanation of the purpose of this, see the comment
  * block on object set authentication.
  *
  * When generate is set the first ZIO_DATA_MAC_LEN bytes of the digest are
  * written to cksum and 0 is returned. Otherwise the digest is compared
  * against cksum and ECKSUM is returned on mismatch.
  */
 static int
 zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
     uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
 {
 	blkptr_t *bps = buf;
 	int nbps = datalen >> SPA_BLKPTRSHIFT;
 	SHA2_CTX sha;
 	uint8_t digest[SHA512_DIGEST_LENGTH];
 
 	/* checksum all of the MACs from the layer below */
 	SHA2Init(SHA512, &sha);
 	for (int idx = 0; idx < nbps; idx++) {
 		zio_crypt_bp_do_indrect_checksum_updates(&sha, version,
 		    byteswap, &bps[idx]);
 	}
 	SHA2Final(digest, &sha);
 
 	if (generate) {
 		memcpy(cksum, digest, ZIO_DATA_MAC_LEN);
 		return (0);
 	}
 
 	if (memcmp(digest, cksum, ZIO_DATA_MAC_LEN) == 0)
 		return (0);
 
 #ifdef FCRYPTO_DEBUG
 	printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
 #endif
 	return (SET_ERROR(ECKSUM));
 }
 
 /*
  * Generate or verify the checksum-of-MACs for an indirect block.
  *
  * Callers do not always have easy access to the on-disk format version:
  * it normally lives in the DSL Crypto Key, yet this checksum has to be
  * verifiable even when the key is not loaded. Rather than doing a ZAP
  * lookup for the version on every zio, the current format is tried
  * first and, if that yields ECKSUM, version 0 is tried as well.
  */
 int
 zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	int err = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
 	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
 
 	if (err != ECKSUM)
 		return (err);
 
 	/* only a verification can fail with ECKSUM */
 	ASSERT(!generate);
 	return (zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
 	    datalen, 0, byteswap, cksum));
 }
 
 /*
  * ABD flavor of zio_crypt_do_indirect_mac_checksum(): borrows a linear
  * copy of the abd, runs the checksum on it and hands the buffer back.
  */
 int
 zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	void *linear = abd_borrow_buf_copy(abd, datalen);
 	int err = zio_crypt_do_indirect_mac_checksum(generate, linear,
 	    datalen, byteswap, cksum);
 
 	abd_return_buf(abd, linear, datalen);
 
 	return (err);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting ZIL blocks.
  * We do not check for the older ZIL chain because the encryption feature
  * was not available before the newer ZIL chain was introduced. The goal
  * here is to encrypt everything except the blkptr_t of a lr_write_t and
  * the zil_chain_t header. Everything that is not encrypted is authenticated.
  */
 /*
  * The OpenCrypto used in FreeBSD does not use separate source and
  * destination buffers; instead, the same buffer is used.  Further, to
  * accommodate some of the drivers, the authbuf needs to be logically before
  * the data.  This means that we need to copy the source to the destination,
  * and set up an extra iovec_t at the beginning to handle the authbuf.
  * It also means we'll only return one zfs_uio_t.
  */
 
 /*
  * Parameters:
  *   encrypt   - B_TRUE: plainbuf is the source and cipherbuf the
  *               destination; B_FALSE: the other way around.
  *   datalen   - size of plainbuf and cipherbuf; also the size of the AAD
  *               scratch buffer allocated here with zio_buf_alloc().
  *   byteswap  - whether zc_nused, lrc_txtype and lrc_reclen must be
  *               byteswapped before they are interpreted.
  *   puio      - unused.
  *   out_uio   - receives the iovec array and its element count.
  *   enc_len   - receives the total length of all iovecs to be encrypted.
  *   authbuf / auth_len - receive the AAD buffer and the number of bytes
  *               used in it.
  *   no_crypt  - set to B_TRUE when no data iovec was produced.
  *
  * Resulting iovec layout: [0] is the AAD, [1 .. n-2] are the regions of
  * the destination buffer to encrypt / decrypt, [n-1] is an empty slot
  * reserved for the MAC. Always returns 0.
  */
 static int
 zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
     zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
     boolean_t *no_crypt)
 {
 	(void) puio;
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
 	iovec_t *dst_iovecs;
 	zil_chain_t *zilc;
 	lr_t *lr;
 	uint64_t txtype, lr_len, nused;
 	uint_t crypt_len, nr_iovecs, vec;
 	uint_t aad_len = 0, total_len = 0;
 
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 	}
 	/* the crypto operates in place on dst, so start from a full copy */
 	memcpy(dst, src, datalen);
 
 	/* Find the start and end record of the log block. */
 	zilc = (zil_chain_t *)src;
 	slrp = src + sizeof (zil_chain_t);
 	aadp = aadbuf;
 	nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
 	ASSERT3U(nused, >=, sizeof (zil_chain_t));
 	ASSERT3U(nused, <=, datalen);
 	blkend = src + nused;
 
 	/*
 	 * Calculate the number of encrypted iovecs we will need.
 	 */
 
 	/* We need at least two iovecs -- one for the AAD, one for the MAC. */
 	nr_iovecs = 2;
 
 	/*
 	 * NOTE(review): lr_len comes from the block itself and is only
 	 * bounded by the assertions below; if those are compiled out, a zero
 	 * or oversized lrc_reclen would presumably not be caught here --
 	 * confirm that records are validated elsewhere.
 	 */
 	for (; slrp < blkend; slrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (byteswap) {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		} else {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		}
 		ASSERT3U(lr_len, >=, sizeof (lr_t));
 		ASSERT3U(lr_len, <=, blkend - slrp);
 
 		/* one iovec per record, plus one for trailing TX_WRITE data */
 		nr_iovecs++;
 		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
 			nr_iovecs++;
 	}
 
 	dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
 
 	/*
 	 * Copy the plain zil header over and authenticate everything except
 	 * the checksum that will store our MAC. If we are writing the data
 	 * the embedded checksum will not have been calculated yet, so we don't
 	 * authenticate that.
 	 */
 	memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t));
 	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 
 	/* rewind to the first record; dlrp tracks the same offset in dst */
 	slrp = src + sizeof (zil_chain_t);
 	dlrp = dst + sizeof (zil_chain_t);
 
 	/*
 	 * Loop over records again, filling in iovecs.
 	 */
 
 	/* The first iovec will contain the authbuf. */
 	vec = 1;
 
 	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (!byteswap) {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		} else {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		}
 
 		/* copy the common lr_t */
 		memcpy(dlrp, slrp, sizeof (lr_t));
 		memcpy(aadp, slrp, sizeof (lr_t));
 		aadp += sizeof (lr_t);
 		aad_len += sizeof (lr_t);
 
 		/*
 		 * If this is a TX_WRITE record we want to encrypt everything
 		 * except the bp if exists. If the bp does exist we want to
 		 * authenticate it.
 		 */
 		if (txtype == TX_WRITE) {
 			/* encrypt the fields between the lr_t and the bp */
 			const size_t o = offsetof(lr_write_t, lr_blkptr);
 			crypt_len = o - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 
 			/* copy the bp now since it will not be encrypted */
 			memcpy(dlrp + o, slrp + o, sizeof (blkptr_t));
 			memcpy(aadp, slrp + o, sizeof (blkptr_t));
 			aadp += sizeof (blkptr_t);
 			aad_len += sizeof (blkptr_t);
 			vec++;
 			total_len += crypt_len;
 
 			/* anything past the lr_write_t is encrypted as well */
 			if (lr_len != sizeof (lr_write_t)) {
 				crypt_len = lr_len - sizeof (lr_write_t);
 				dst_iovecs[vec].iov_base = (char *)
 				    dlrp + sizeof (lr_write_t);
 				dst_iovecs[vec].iov_len = crypt_len;
 				vec++;
 				total_len += crypt_len;
 			}
 		} else if (txtype == TX_CLONE_RANGE) {
 			/* encrypt up to lr_nbps; the rest is authenticated */
 			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
 			crypt_len = o - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 
 			/* copy the bps now since they will not be encrypted */
 			memcpy(dlrp + o, slrp + o, lr_len - o);
 			memcpy(aadp, slrp + o, lr_len - o);
 			aadp += lr_len - o;
 			aad_len += lr_len - o;
 			vec++;
 			total_len += crypt_len;
 		} else {
 			/* every other record: encrypt all but the lr_t */
 			crypt_len = lr_len - sizeof (lr_t);
 			dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t);
 			dst_iovecs[vec].iov_len = crypt_len;
 			vec++;
 			total_len += crypt_len;
 		}
 	}
 
 	/* The last iovec will contain the MAC. */
 	ASSERT3U(vec, ==, nr_iovecs - 1);
 
 	/* AAD */
 	dst_iovecs[0].iov_base = aadbuf;
 	dst_iovecs[0].iov_len = aad_len;
 	/* MAC */
 	dst_iovecs[vec].iov_base = 0;
 	dst_iovecs[vec].iov_len = 0;
 
 	/* vec == 1 means the block held no records at all */
 	*no_crypt = (vec == 1);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 	GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
 	zfs_uio_iovcnt(out_uio) = nr_iovecs;
 
 	return (0);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting dnode blocks.
  *
  * Only the bonus buffers of dnodes with an encrypted bonus type are
  * encrypted; everything else in the block is copied to the destination
  * as plaintext and authenticated through the AAD buffer.
  *
  * Parameters:
  *   encrypt   - B_TRUE: plainbuf is the source and cipherbuf the
  *               destination; B_FALSE: the other way around.
  *   version   - on-disk format version, passed on to the bp AAD helper.
  *   datalen   - size of plainbuf and cipherbuf; also the size of the AAD
  *               scratch buffer allocated here with zio_buf_alloc().
  *   byteswap  - passed on to the bp AAD helper.
  *   puio      - not referenced by this function.
  *   out_uio   - receives the iovec array and its element count.
  *   enc_len   - receives the total length of all iovecs to be encrypted.
  *   authbuf / auth_len - receive the AAD buffer and the number of bytes
  *               used in it.
  *   no_crypt  - set to B_TRUE when no bonus buffer needs encryption.
  *
  * Resulting iovec layout: [0] is the AAD, [1 .. n-2] are the bonus
  * buffers in the destination to encrypt / decrypt, [n-1] is an empty slot
  * reserved for the MAC. Always returns 0.
  */
 static int
 zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
     uint_t *auth_len, boolean_t *no_crypt)
 {
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 	uint8_t *src, *dst, *aadp;
 	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
 	iovec_t *dst_iovecs;
 	uint_t nr_iovecs, crypt_len, vec;
 	uint_t aad_len = 0, total_len = 0;
 	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
 
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 	}
 	/* the crypto operates in place on dst, so start from a full copy */
 	memcpy(dst, src, datalen);
 
 	sdnp = (dnode_phys_t *)src;
 	ddnp = (dnode_phys_t *)dst;
 	aadp = aadbuf;
 
 	/*
 	 * Count the number of iovecs we will need to do the encryption by
 	 * counting the number of bonus buffers that need to be encrypted.
 	 */
 
 	/* We need at least two iovecs -- one for the AAD, one for the MAC. */
 	nr_iovecs = 2;
 
 	/* each step skips the dnode plus the extra slots it occupies */
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		/*
 		 * This block may still be byteswapped. However, all of the
 		 * values we use are either uint8_t's (for which byteswapping
 		 * is a noop) or a * != 0 check, which will work regardless
 		 * of whether or not we byteswap.
 		 */
 		if (sdnp[i].dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
 		    sdnp[i].dn_bonuslen != 0) {
 			nr_iovecs++;
 		}
 	}
 
 	dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
 
 	/*
 	 * Iterate through the dnodes again, this time filling in the uios
 	 * we allocated earlier. We also concatenate any data we want to
 	 * authenticate onto aadbuf.
 	 */
 
 	/* The first iovec will contain the authbuf. */
 	vec = 1;
 
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		dnp = &sdnp[i];
 
 		/* copy over the core fields and blkptrs (kept as plaintext) */
 		memcpy(&ddnp[i], dnp,
 		    (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp),
 			    sizeof (blkptr_t));
 		}
 
 		/*
 		 * Handle authenticated data. We authenticate everything in
 		 * the dnode that can be brought over when we do a raw send.
 		 * This includes all of the core fields as well as the MACs
 		 * stored in the bp checksums and all of the portable bits
 		 * from blk_prop. We include the dnode padding here in case it
 		 * ever gets used in the future. Some dn_flags and dn_used are
 		 * not portable so we mask those values out of the
 		 * authenticated data.
 		 */
 		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
 		memcpy(aadp, dnp, crypt_len);
 		/* edit the copy inside the AAD buffer, not the source dnode */
 		adnp = (dnode_phys_t *)aadp;
 		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 		adnp->dn_used = 0;
 		aadp += crypt_len;
 		aad_len += crypt_len;
 
 		for (j = 0; j < dnp->dn_nblkptr; j++) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, &dnp->dn_blkptr[j]);
 		}
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, DN_SPILL_BLKPTR(dnp));
 		}
 
 		/*
 		 * If this bonus buffer needs to be encrypted, we prepare an
 		 * iovec_t. The encryption / decryption functions will fill
 		 * this in for us with the encrypted or decrypted data.
 		 * Otherwise we add the bonus buffer to the authenticated
 		 * data buffer and copy it over to the destination. The
 		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
 		 * we can guarantee alignment with the AES block size
 		 * (128 bits).
 		 */
 		crypt_len = DN_MAX_BONUS_LEN(dnp);
 		if (dnp->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
 		    dnp->dn_bonuslen != 0) {
 			dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]);
 			dst_iovecs[vec].iov_len = crypt_len;
 
 			vec++;
 			total_len += crypt_len;
 		} else {
 			memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len);
 			memcpy(aadp, DN_BONUS(dnp), crypt_len);
 			aadp += crypt_len;
 			aad_len += crypt_len;
 		}
 	}
 
 	/* The last iovec will contain the MAC. */
 	ASSERT3U(vec, ==, nr_iovecs - 1);
 
 	/* AAD */
 	dst_iovecs[0].iov_base = aadbuf;
 	dst_iovecs[0].iov_len = aad_len;
 	/* MAC */
 	dst_iovecs[vec].iov_base = 0;
 	dst_iovecs[vec].iov_len = 0;
 
 	/* vec == 1 means no bonus buffer qualified for encryption */
 	*no_crypt = (vec == 1);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 	GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
 	zfs_uio_iovcnt(out_uio) = nr_iovecs;
 
 	return (0);
 }
 
 /*
  * Build the uio for an ordinary (non-ZIL, non-dnode) block. The source
  * buffer is copied in full to the destination buffer, which is then
  * described by a single iovec so that the crypto can run on it in place.
  * A second, zeroed iovec is allocated as the slot for the MAC.
  *
  *   encrypt  - B_TRUE: plainbuf is the source and cipherbuf the
  *              destination; B_FALSE: the other way around.
  *   datalen  - size of both buffers.
  *   puio     - unused.
  *   out_uio  - receives the iovec array and its element count.
  *   enc_len  - receives datalen (or 0 on failure).
  *
  * Returns 0 on success. If the iovec allocation fails, out_uio is left
  * empty and ENOMEM is returned.
  */
 static int
 zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio,
     uint_t *enc_len)
 {
 	(void) puio;
 	const uint_t nr_cipher = 2;
 	iovec_t *cipher_iovecs;
 	void *src, *dst;
 
 	/*
 	 * NOTE(review): with KM_SLEEP this allocation presumably cannot
 	 * fail; the check is kept so the behavior stays unchanged.
 	 */
 	cipher_iovecs = kmem_zalloc(nr_cipher * sizeof (iovec_t),
 	    KM_SLEEP);
 	if (cipher_iovecs == NULL) {
 		*enc_len = 0;
 		GET_UIO_STRUCT(out_uio)->uio_iov = NULL;
 		zfs_uio_iovcnt(out_uio) = 0;
 		return (SET_ERROR(ENOMEM));
 	}
 
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 	}
 
 	/* the crypto operates in place on dst */
 	memcpy(dst, src, datalen);
 	cipher_iovecs[0].iov_base = dst;
 	cipher_iovecs[0].iov_len = datalen;
 
 	*enc_len = datalen;
 	GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs;
 	zfs_uio_iovcnt(out_uio) = nr_cipher;
 
 	return (0);
 }
 
 /*
  * Set up the uio that zio_do_crypt_data() hands to the crypto layer.
  * Most blocks go through zio_crypt_init_uios_normal(); ZIL and dnode
  * blocks need special handling to pick out the pieces that are to be
  * encrypted. Those special cases use authbuf to carry the additional
  * authenticated data (AAD) for the encryption modes; for normal blocks
  * authbuf is set to NULL. In every case the last iovec of cuio is
  * pointed at mac with a length of ZIO_DATA_MAC_LEN.
  *
  * Returns 0 on success or the error reported by the selected handler.
  */
 static int
 zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
     uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
 {
 	int err;
 	iovec_t *mac_iov;
 
 	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
 
 	/* pick the handler that matches the object type */
 	if (ot == DMU_OT_INTENT_LOG) {
 		err = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
 		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
 		    no_crypt);
 	} else if (ot == DMU_OT_DNODE) {
 		err = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
 		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
 		    auth_len, no_crypt);
 	} else {
 		err = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
 		    datalen, puio, cuio, enc_len);
 		*authbuf = NULL;
 		*auth_len = 0;
 		*no_crypt = B_FALSE;
 	}
 
 	if (err != 0)
 		return (err);
 
 	/* populate the uios */
 	zfs_uio_segflg(cuio) = UIO_SYSSPACE;
 
 	/* the handlers leave the final iovec free for the MAC */
 	mac_iov =
 	    ((iovec_t *)&(GET_UIO_STRUCT(cuio)->
 	    uio_iov[zfs_uio_iovcnt(cuio) - 1]));
 	mac_iov->iov_len = ZIO_DATA_MAC_LEN;
 	mac_iov->iov_base = (void *)mac;
 
 	return (0);
 }
 
 /*
  * Copy of the ciphertext from the most recent failed decryption and its
  * size in bytes; both are maintained by zio_do_crypt_data().
  */
 void *failed_decrypt_buf;
 int failed_decrypt_size;
 
 /*
  * Primary encryption / decryption entrypoint for zio data.
  *
  *   encrypt   - B_TRUE to encrypt plainbuf into cipherbuf, B_FALSE to
  *               decrypt cipherbuf into plainbuf.
  *   key       - supplies the cipher suite, master key, current key,
  *               current salt and the lock protecting the salt.
  *   ot        - object type of the block; selects how the uio is built.
  *   byteswap  - passed through to the uio setup.
  *   salt, iv, mac - per-block salt, IV and MAC buffers.
  *   datalen   - size of plainbuf and cipherbuf.
  *   no_crypt  - output, set by the uio setup.
  *
  * Returns 0 on success. A failure while building the uio is returned
  * as-is; later failures are returned through SET_ERROR() after all
  * temporary state has been released.
  */
 int
 zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
     dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
     uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
     boolean_t *no_crypt)
 {
 	int ret;
 	boolean_t locked = B_FALSE;
 	uint64_t crypt = key->zk_crypt;
 	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
 	uint_t enc_len, auth_len;
 	zfs_uio_t puio, cuio;
 	struct uio puio_s, cuio_s;
 	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
 	crypto_key_t tmp_ckey, *ckey = NULL;
 	freebsd_crypt_session_t *tmpl = NULL;
 	uint8_t *authbuf = NULL;
 
 	/* zeroed uios make zio_crypt_destroy_uio() safe on every path */
 	memset(&puio_s, 0, sizeof (puio_s));
 	memset(&cuio_s, 0, sizeof (cuio_s));
 	zfs_uio_init(&puio, &puio_s);
 	zfs_uio_init(&cuio, &cuio_s);
 
 #ifdef FCRYPTO_DEBUG
 	printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
 	    __FUNCTION__,
 	    encrypt ? "encrypt" : "decrypt",
 	    key, salt, ot, iv, mac, datalen,
 	    byteswap ? "byteswap" : "native_endian", plainbuf,
 	    cipherbuf, no_crypt);
 
 	printf("\tkey = {");
 	for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
 		printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
 	printf("}\n");
 #endif
 	/* create uios for encryption */
 	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
 	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
 	    &authbuf, &auth_len, no_crypt);
 	if (ret != 0)
 		return (ret);
 
 	/*
 	 * If the needed key is the current one, just use it. Otherwise we
 	 * need to generate a temporary one from the given salt + master key.
 	 * If we are encrypting, we must return a copy of the current salt
 	 * so that it can be stored in the blkptr_t.
 	 */
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	locked = B_TRUE;
 
 	if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
 		/* current key: the salt lock stays held across the crypto op */
 		ckey = &key->zk_current_key;
 		tmpl = &key->zk_session;
 	} else {
 		/* the temporary key is private, so the lock can be dropped */
 		rw_exit(&key->zk_salt_lock);
 		locked = B_FALSE;
 
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
 		if (ret != 0)
 			goto error;
 		tmp_ckey.ck_data = enc_keydata;
 		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 		ckey = &tmp_ckey;
 		tmpl = NULL;
 	}
 
 	/* perform the encryption / decryption */
 	ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
 	    ckey, iv, enc_len, &cuio, auth_len);
 	if (ret != 0)
 		goto error;
 	if (locked) {
 		rw_exit(&key->zk_salt_lock);
 	}
 
 	if (authbuf != NULL)
 		zio_buf_free(authbuf, datalen);
 	/* scrub the derived key material from the stack */
 	if (ckey == &tmp_ckey)
 		memset(enc_keydata, 0, keydata_len);
 	zio_crypt_destroy_uio(&puio);
 	zio_crypt_destroy_uio(&cuio);
 
 	return (0);
 
 error:
 	/*
 	 * Keep a copy of the ciphertext that failed to decrypt, replacing
 	 * any previously saved copy.
 	 * NOTE(review): these globals are updated without any locking
 	 * visible here -- confirm whether concurrent failures can race.
 	 */
 	if (!encrypt) {
 		if (failed_decrypt_buf != NULL)
 			kmem_free(failed_decrypt_buf, failed_decrypt_size);
 		failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
 		failed_decrypt_size = datalen;
 		memcpy(failed_decrypt_buf, cipherbuf, datalen);
 	}
 	if (locked)
 		rw_exit(&key->zk_salt_lock);
 	if (authbuf != NULL)
 		zio_buf_free(authbuf, datalen);
 	/*
 	 * NOTE(review): when hkdf_sha512() itself fails, ckey is still NULL
 	 * and enc_keydata is not scrubbed -- confirm this is acceptable.
 	 */
 	if (ckey == &tmp_ckey)
 		memset(enc_keydata, 0, keydata_len);
 	zio_crypt_destroy_uio(&puio);
 	zio_crypt_destroy_uio(&cuio);
 	return (SET_ERROR(ret));
 }
 
 /*
  * ABD front end for zio_do_crypt_data(): borrows linear buffers for the
  * plaintext and ciphertext abds, runs the crypto on them and returns the
  * buffers again. The buffers are handed back in the same way whether or
  * not the crypto succeeded; a failure is reported through SET_ERROR().
  */
 int
 zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
     boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
     uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
 {
 	int err;
 	void *plain, *cipher;
 
 	/* borrow phase */
 	if (encrypt) {
 		plain = abd_borrow_buf_copy(pabd, datalen);
 		cipher = abd_borrow_buf(cabd, datalen);
 	} else {
 		plain = abd_borrow_buf(pabd, datalen);
 		cipher = abd_borrow_buf_copy(cabd, datalen);
 	}
 
 	err = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
 	    datalen, plain, cipher, no_crypt);
 
 	/* return phase, identical on success and on failure */
 	if (encrypt) {
 		abd_return_buf(pabd, plain, datalen);
 		abd_return_buf_copy(cabd, cipher, datalen);
 	} else {
 		abd_return_buf_copy(pabd, plain, datalen);
 		abd_return_buf(cabd, cipher, datalen);
 	}
 
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	return (0);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
-/* CSTYLED */
 module_param(zfs_key_max_salt_uses, ulong, 0644);
 MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
 	"can be used for generating encryption keys before it is rotated");
 #endif
diff --git a/module/os/linux/spl/spl-err.c b/module/os/linux/spl/spl-err.c
index 29781b9515b2..81e520547dd7 100644
--- a/module/os/linux/spl/spl-err.c
+++ b/module/os/linux/spl/spl-err.c
@@ -1,124 +1,123 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  Solaris Porting Layer (SPL) Error Implementation.
  */
 
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 
 /*
  * It is often useful to have a panic actually crash the node so that you
  * are notified of the event and can collect a crash dump for later
  * analysis.  This is opt-in: spl_panic_halt defaults to 0, in which case
  * the node is not halted.
  */
 static unsigned int spl_panic_halt;
-/* CSTYLED */
 module_param(spl_panic_halt, uint, 0644);
 MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
 
 /*
  * Log the pid of the current task, then dump the stack of the calling
  * context with dump_stack().
  */
 void
 spl_dumpstack(void)
 {
 	printk("Showing stack for process %d\n", current->pid);
 	dump_stack();
 }
 EXPORT_SYMBOL(spl_dumpstack);
 
 /*
  * Report a fatal condition raised at file:line in func().  The formatted
  * message and its location are logged at KERN_EMERG.  When spl_panic_halt
  * is set the kernel is panicked; otherwise the stack is dumped and the
  * calling thread is parked forever.  This function does not return.
  */
 void
 spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	char msg[MAXMSGLEN];
 	const char *base;
 	va_list ap;
 
 	/* Report only the last path component of the file name. */
 	base = strrchr(file, '/');
 	base = (base == NULL) ? file : base + 1;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	printk(KERN_EMERG "%s", msg);
 	printk(KERN_EMERG "PANIC at %s:%d:%s()\n", base, line, func);
 	if (spl_panic_halt)
 		panic("%s", msg);
 
 	spl_dumpstack();
 
 	/* Park the thread so its state stays available for debugging. */
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	for (;;)
 		schedule();
 
 	/* Unreachable */
 }
 EXPORT_SYMBOL(spl_panic);
 
 /*
  * va_list flavour of cmn_err(): format the message and emit it according
  * to the severity in ce.  CE_IGNORE (and any unrecognised level) prints
  * nothing.  CE_PANIC either panics the kernel (spl_panic_halt set) or
  * dumps the stack and parks the calling thread forever.
  */
 void
 vcmn_err(int ce, const char *fmt, va_list ap)
 {
 	char msg[MAXMSGLEN];
 
 	vsnprintf(msg, MAXMSGLEN, fmt, ap);
 
 	if (ce == CE_CONT) {
 		printk("%s", msg);
 	} else if (ce == CE_NOTE) {
 		printk(KERN_NOTICE "NOTICE: %s\n", msg);
 	} else if (ce == CE_WARN) {
 		printk(KERN_WARNING "WARNING: %s\n", msg);
 	} else if (ce == CE_PANIC) {
 		printk(KERN_EMERG "PANIC: %s\n", msg);
 		if (spl_panic_halt)
 			panic("%s", msg);
 
 		spl_dumpstack();
 
 		/* Park the thread so its state stays available for debugging */
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		for (;;)
 			schedule();
 	}
 } /* vcmn_err() */
 EXPORT_SYMBOL(vcmn_err);
 
 /*
  * printf-style front end for vcmn_err(): collects the variable arguments
  * and forwards them together with the severity `ce`.
  */
 void
 cmn_err(int ce, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vcmn_err(ce, fmt, ap);
 	va_end(ap);
 } /* cmn_err() */
 EXPORT_SYMBOL(cmn_err);
diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c
index 6a95d77ac278..e13914221a6a 100644
--- a/module/os/linux/spl/spl-generic.c
+++ b/module/os/linux/spl/spl-generic.c
@@ -1,903 +1,902 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  Solaris Porting Layer (SPL) Generic Implementation.
  */
 
 #include <sys/isa_defs.h>
 #include <sys/sysmacros.h>
 #include <sys/systeminfo.h>
 #include <sys/vmsystm.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/vmem.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/taskq.h>
 #include <sys/tsd.h>
 #include <sys/zmod.h>
 #include <sys/debug.h>
 #include <sys/proc.h>
 #include <sys/kstat.h>
 #include <sys/file.h>
 #include <sys/sunddi.h>
 #include <linux/ctype.h>
 #include <sys/disp.h>
 #include <sys/random.h>
 #include <sys/string.h>
 #include <linux/kmod.h>
 #include <linux/mod_compat.h>
 #include <sys/cred.h>
 #include <sys/vnode.h>
 #include <sys/misc.h>
 #include <linux/mod_compat.h>
 
 unsigned long spl_hostid = 0;
 EXPORT_SYMBOL(spl_hostid);
 
-/* CSTYLED */
 module_param(spl_hostid, ulong, 0644);
 MODULE_PARM_DESC(spl_hostid, "The system hostid.");
 
 proc_t p0;
 EXPORT_SYMBOL(p0);
 
 /*
  * xoshiro256++ 1.0 PRNG by David Blackman and Sebastiano Vigna
  *
  * "Scrambled Linear Pseudorandom Number Generators∗"
  * https://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf
  *
  * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
  * is to provide bytes containing random numbers. It is mapped to /dev/urandom
  * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
  * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
  * we can implement it using a fast PRNG that we seed using Linux' actual
  * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
  * with an independent seed so that all calls to random_get_pseudo_bytes() are
  * free of atomic instructions.
  *
  * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
  * to generate words larger than 256 bits will paradoxically be limited to
  * `2^256 - 1` possibilities. This is because we have a sequence of `2^256 - 1`
  * 256-bit words and selecting the first will implicitly select the second. If
  * a caller finds this behavior undesirable, random_get_bytes() should be used
  * instead.
  *
  * XXX: Linux interrupt handlers that trigger within the critical section
  * formed by `s[3] = xp[3];` and `xp[0] = s[0];` and call this function will
  * see the same numbers. Nothing in the code currently calls this in an
  * interrupt handler, so this is considered to be okay. If that becomes a
  * problem, we could create a set of per-cpu variables for interrupt handlers
  * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
  * true.
  */
 static void __percpu *spl_pseudo_entropy;
 
 /*
  * rotl()/spl_rand_next()/spl_rand_jump() are copied from the following CC-0
  * licensed file:
  *
  * https://prng.di.unimi.it/xoshiro256plusplus.c
  */
 
 /*
  * Rotate the 64-bit value x left by k bits.
  *
  * The count is reduced modulo 64 and both shift amounts are kept in the
  * range 0..63, so k == 0 (or any multiple of 64) returns x unchanged
  * instead of performing an undefined 64-bit shift by 64.
  */
 static inline uint64_t rotl(const uint64_t x, int k)
 {
 	const unsigned int n = (unsigned int)k & 63;
 
 	return ((x << n) | (x >> ((64 - n) & 63)));
 }
 
 /*
  * Advance the four-word generator state in s[] by one step and return the
  * output word computed from the state as it was on entry.
  */
 static inline uint64_t
 spl_rand_next(uint64_t *s)
 {
 	uint64_t a = s[0];
 	uint64_t b = s[1];
 	uint64_t c = s[2];
 	uint64_t d = s[3];
 	const uint64_t out = rotl(a + d, 23) + a;
 	const uint64_t shifted = b << 17;
 
 	c ^= a;
 	d ^= b;
 	b ^= c;
 	a ^= d;
 	c ^= shifted;
 	d = rotl(d, 45);
 
 	s[0] = a;
 	s[1] = b;
 	s[2] = c;
 	s[3] = d;
 
 	return (out);
 }
 
 /*
  * Jump the generator state in s[] ahead.  For every bit of the JUMP
  * polynomial, lowest bit of the first word first, the current state is
  * XOR-ed into an accumulator when the bit is set and the generator is then
  * stepped once; the accumulator becomes the new state.
  */
 static inline void
 spl_rand_jump(uint64_t *s)
 {
 	static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba,
 	    0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };
 
 	uint64_t acc[4] = { 0, 0, 0, 0 };
 	size_t word;
 	int bit, k;
 
 	for (word = 0; word < sizeof (JUMP) / sizeof (*JUMP); word++) {
 		for (bit = 0; bit < 64; bit++) {
 			if ((JUMP[word] >> bit) & 1ULL) {
 				for (k = 0; k < 4; k++)
 					acc[k] ^= s[k];
 			}
 			(void) spl_rand_next(s);
 		}
 	}
 
 	for (k = 0; k < 4; k++)
 		s[k] = acc[k];
 }
 
 /*
  * Fill ptr[0..len) with pseudo-random bytes drawn from this CPU's
  * generator state.  The state is copied to the stack, stepped once per
  * (up to) eight output bytes, and written back before the per-CPU pointer
  * is released.  Always returns 0.
  */
 int
 random_get_pseudo_bytes(uint8_t *ptr, size_t len)
 {
 	uint64_t *xp, s[4];
 
 	ASSERT(ptr);
 
 	xp = get_cpu_ptr(spl_pseudo_entropy);
 
 	/* Work on a private copy of the per-CPU state. */
 	s[0] = xp[0];
 	s[1] = xp[1];
 	s[2] = xp[2];
 	s[3] = xp[3];
 
 	while (len) {
 		union {
 			uint64_t ui64;
 			uint8_t byte[sizeof (uint64_t)];
 		}entropy;
 		/* Number of bytes taken from this 64-bit word (at most 8). */
 		int i = MIN(len, sizeof (uint64_t));
 
 		len -= i;
 		entropy.ui64 = spl_rand_next(s);
 
 		/*
 		 * xoshiro256++ has low entropy lower bytes, so we copy the
 		 * higher order bytes first.
 		 */
 		while (i--)
 #ifdef _ZFS_BIG_ENDIAN
 			*ptr++ = entropy.byte[i];
 #else
 			*ptr++ = entropy.byte[7 - i];
 #endif
 	}
 
 	/* Publish the advanced state back to the per-CPU slot. */
 	xp[0] = s[0];
 	xp[1] = s[1];
 	xp[2] = s[2];
 	xp[3] = s[3];
 
 	put_cpu_ptr(spl_pseudo_entropy);
 
 	return (0);
 }
 
 
 EXPORT_SYMBOL(random_get_pseudo_bytes);
 
 #if BITS_PER_LONG == 32
 
 /*
  * Support 64/64 => 64 division on a 32-bit platform.  While the kernel
  * provides a div64_u64() function for this we do not use it because the
  * implementation is flawed.  There are cases which return incorrect
  * results as late as linux-2.6.35.  Until this is fixed upstream the
  * spl must provide its own implementation.
  *
  * This implementation is a slightly modified version of the algorithm
  * proposed by the book 'Hacker's Delight'.  The original source can be
  * found here and is available for use without restriction.
  *
  * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
  */
 
 /*
  * Count the leading zero bits of a 64-bit value; returns 64 for zero.
  */
 static int
 nlz64(uint64_t x)
 {
 	int zeros = 0;
 	int step;
 
 	if (x == 0)
 		return (64);
 
 	/*
 	 * Binary search from 32 bits down to 1: whenever the top `step` bits
 	 * are all clear, count them and shift them out.
 	 */
 	for (step = 32; step != 0; step >>= 1) {
 		if ((x >> (64 - step)) == 0) {
 			zeros += step;
 			x <<= step;
 		}
 	}
 
 	return (zeros);
 }
 
 /*
  * Newer kernels have a div_u64() function but we define our own
  * to simplify portability between kernel versions.
  *
  * Divides the 64-bit u by the 32-bit v.  do_div() updates u in place and
  * its own result is discarded; the updated u is returned as the quotient.
  */
 static inline uint64_t
 __div_u64(uint64_t u, uint32_t v)
 {
 	(void) do_div(u, v);
 	return (u);
 }
 
 /*
  * Turn off missing prototypes warning for these functions. They are
  * replacements for libgcc-provided functions and will never be called
  * directly.
  */
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wmissing-prototypes"
 #endif
 
 /*
  * Implementation of 64-bit unsigned division for 32-bit machines.
  *
  * First the procedure takes care of the case in which the divisor is a
  * 32-bit quantity. There are two subcases: (1) If the left half of the
  * dividend is less than the divisor, one execution of do_div() is all that
  * is required (overflow is not possible). (2) Otherwise it does two
  * divisions, using the grade school method.
  *
  * When the divisor does not fit in 32 bits, it is normalized so that its
  * most significant bit is set, a quotient estimate is taken from a single
  * 64/32 division, and the estimate is then corrected as described by the
  * comments in that branch.
  */
 uint64_t
 __udivdi3(uint64_t u, uint64_t v)
 {
 	uint64_t u0, u1, v1, q0, q1, k;
 	int n;
 
 	if (v >> 32 == 0) {			// If v < 2**32:
 		if (u >> 32 < v) {		// If u/v cannot overflow,
 			return (__div_u64(u, v)); // just do one division.
 		} else {			// If u/v would overflow:
 			u1 = u >> 32;		// Break u into two halves.
 			u0 = u & 0xFFFFFFFF;
 			q1 = __div_u64(u1, v);	// First quotient digit.
 			k  = u1 - q1 * v;	// First remainder, < v.
 			u0 += (k << 32);
 			q0 = __div_u64(u0, v);	// Second quotient digit.
 			return ((q1 << 32) + q0);
 		}
 	} else {				// If v >= 2**32:
 		n = nlz64(v);			// 0 <= n <= 31.
 		v1 = (v << n) >> 32;		// Normalize divisor, MSB is 1.
 		u1 = u >> 1;			// To ensure no overflow.
 		q1 = __div_u64(u1, v1);		// Get quotient estimate.
 		q0 = (q1 << n) >> 31;		// Undo normalization and
 						// division of u by 2.
 		if (q0 != 0)			// Make q0 correct or
 			q0 = q0 - 1;		// too small by 1.
 		if ((u - q0 * v) >= v)
 			q0 = q0 + 1;		// Now q0 is correct.
 
 		return (q0);
 	}
 }
 EXPORT_SYMBOL(__udivdi3);
 
 #ifndef abs64
 /* CSTYLED */
 #define	abs64(x)	({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
 #endif
 
 /*
  * Implementation of 64-bit signed division for 32-bit machines: divide the
  * magnitudes with __udivdi3() and negate the quotient when the operands
  * have different signs.
  */
 int64_t
 __divdi3(int64_t u, int64_t v)
 {
 	/* All ones when exactly one operand is negative, otherwise zero. */
 	const int64_t sign = (u ^ v) >> 63;
 	const int64_t mag = __udivdi3(abs64(u), abs64(v));
 
 	return ((mag ^ sign) - sign);
 }
 EXPORT_SYMBOL(__divdi3);
 
 /*
  * Implementation of 64-bit unsigned modulo for 32-bit machines, derived
  * from the quotient: dividend - divisor * (dividend / divisor).
  */
 uint64_t
 __umoddi3(uint64_t dividend, uint64_t divisor)
 {
 	const uint64_t quotient = __udivdi3(dividend, divisor);
 
 	return (dividend - (divisor * quotient));
 }
 EXPORT_SYMBOL(__umoddi3);
 
 /*
  * 64-bit signed modulo for 32-bit machines.  The remainder of the
  * magnitudes is computed with __umoddi3() and takes the sign of n.
  */
 int64_t
 __moddi3(int64_t n, int64_t d)
 {
 	const boolean_t negative = (n < 0) ? B_TRUE : B_FALSE;
 	int64_t rem;
 
 	if (negative)
 		n = -n;
 	if (d < 0)
 		d = -d;
 
 	rem = __umoddi3(n, d);
 
 	if (negative)
 		return (-rem);
 
 	return (rem);
 }
 EXPORT_SYMBOL(__moddi3);
 
 /*
  * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
  * Returns n / d and, when r is non-NULL, stores n % d in *r.
  */
 uint64_t
 __udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
 {
 	const uint64_t quot = __udivdi3(n, d);
 
 	if (r != NULL)
 		*r = n - d * quot;
 
 	return (quot);
 }
 EXPORT_SYMBOL(__udivmoddi4);
 
 /*
  * Implementation of 64-bit signed division/modulo for 32-bit machines.
  * Works on the magnitudes via __udivmoddi4(); the quotient is negated when
  * the operand signs differ and the remainder takes the sign of n.  The
  * remainder is stored in *r when r is non-NULL.
  */
 int64_t
 __divmoddi4(int64_t n, int64_t d, int64_t *r)
 {
 	const boolean_t neg_n = (n < 0) ? B_TRUE : B_FALSE;
 	const boolean_t neg_d = (d < 0) ? B_TRUE : B_FALSE;
 	int64_t quot, rem;
 
 	if (neg_n)
 		n = -n;
 	if (neg_d)
 		d = -d;
 
 	quot = __udivmoddi4(n, d, (uint64_t *)&rem);
 
 	if (neg_n != neg_d)
 		quot = -quot;
 	if (neg_n)
 		rem = -rem;
 	if (r)
 		*r = rem;
 
 	return (quot);
 }
 EXPORT_SYMBOL(__divmoddi4);
 
 #if defined(__arm) || defined(__arm__)
 /*
  * Implementation of 64-bit (un)signed division for 32-bit arm machines.
  *
  * Run-time ABI for the ARM Architecture (page 20).  A pair of (unsigned)
  * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
  * and the remainder in {r2, r3}.  The return type is specifically left
  * set to 'void' to ensure the compiler does not overwrite these registers
  * during the return.  All results are in registers as per ABI
  */
 void
 __aeabi_uldivmod(uint64_t u, uint64_t v)
 {
 	uint64_t res;
 	uint64_t mod;
 
 	/* Quotient and remainder are computed by two separate calls. */
 	res = __udivdi3(u, v);
 	mod = __umoddi3(u, v);
 	{
 		/* Low/high halves of the quotient (r0, r1) and remainder (r2, r3). */
 		register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
 		register uint32_t r1 asm("r1") = (res >> 32);
 		register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
 		register uint32_t r3 asm("r3") = (mod >> 32);
 
 		/*
 		 * Empty asm that names all four registers as both inputs and
 		 * outputs, so the values assigned above are treated as used.
 		 */
 		asm volatile(""
 		    : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)  /* output */
 		    : "r"(r0), "r"(r1), "r"(r2), "r"(r3));    /* input */
 
 		return; /* r0; */
 	}
 }
 EXPORT_SYMBOL(__aeabi_uldivmod);
 
 /*
  * Signed 64-bit division/modulo helper for 32-bit arm.  The quotient is
  * placed in {r0, r1} and the remainder in {r2, r3}.
  */
 void
 __aeabi_ldivmod(int64_t u, int64_t v)
 {
 	int64_t res;
 	uint64_t mod;
 
 	res =  __divdi3(u, v);
 	/*
 	 * The remainder has to come from the signed modulo.  Feeding the
 	 * operands to __umoddi3() reinterprets negative values as huge
 	 * unsigned ones and yields a remainder that does not match the
 	 * signed quotient above (e.g. -7 % 2 would come out as 1, not -1).
 	 */
 	mod = (uint64_t)__moddi3(u, v);
 	{
 		/* Low/high halves of the quotient (r0, r1) and remainder (r2, r3). */
 		register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
 		register uint32_t r1 asm("r1") = (res >> 32);
 		register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
 		register uint32_t r3 asm("r3") = (mod >> 32);
 
 		asm volatile(""
 		    : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)  /* output */
 		    : "r"(r0), "r"(r1), "r"(r2), "r"(r3));    /* input */
 
 		return; /* r0; */
 	}
 }
 EXPORT_SYMBOL(__aeabi_ldivmod);
 #endif /* __arm || __arm__ */
 
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
 
 #endif /* BITS_PER_LONG */
 
 /*
  * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
  * ddi_strtol(9F) man page.  I have not verified the behavior of these
  * functions against their Solaris counterparts.  It is possible that I
  * may have misinterpreted the man page or the man page is incorrect.
  */
 int ddi_strtol(const char *, char **, int, long *);
 int ddi_strtoull(const char *, char **, int, unsigned long long *);
 int ddi_strtoll(const char *, char **, int, long long *);
 
 /*
  * Generate ddi_strto<type>(): parse an integer of type `valtype` from str.
  *
  * Leading white space and one optional sign are skipped.  With base 0 the
  * base is taken from the text that follows the sign: "0x"/"0X" followed by
  * a hex digit selects 16, any other leading '0' selects 8 (so a lone "0"
  * parses as zero), and anything else selects 10.  Parsing stops at the
  * first character that is not a digit of the base.
  *
  * Returns 0 after storing the value in *result (and the stop position in
  * *endptr when endptr is non-NULL), EINVAL when str holds nothing but white
  * space, or ERANGE when the magnitude does not fit in valtype.  On error
  * *result and *endptr are left untouched.
  */
 #define	define_ddi_strtox(type, valtype)				\
 int ddi_strto##type(const char *str, char **endptr,			\
     int base, valtype *result)						\
 {									\
 	/* Largest value of valtype, for signed and unsigned types */	\
 	const valtype max_value = ((valtype)-1 > 0) ? (valtype)-1 :	\
 	    (valtype)((1ULL << (sizeof (valtype) * 8 - 1)) - 1);	\
 	valtype value = 0;						\
 	char *ptr = (char *)str;					\
 	int digit, minus = 0;						\
 									\
 	/* strchr() also matches the terminator, so rule it out first */ \
 	while (*ptr != '\0' && strchr(" \t\n\r\f", *ptr) != NULL)	\
 		++ptr;							\
 									\
 	if (*ptr == '\0')						\
 		return (EINVAL);					\
 									\
 	if (*ptr == '-') {						\
 		minus = 1;						\
 		++ptr;							\
 	} else if (*ptr == '+') {					\
 		++ptr;							\
 	}								\
 									\
 	/* Auto-detect base from the prefix that follows the sign */	\
 	if (!base) {							\
 		if (ptr[0] != '0') {					\
 			base = 10; /* decimal */			\
 		} else if (tolower((unsigned char)ptr[1]) == 'x' &&	\
 		    isxdigit((unsigned char)ptr[2])) {			\
 			base = 16; /* hex */				\
 			ptr += 2;					\
 		} else {						\
 			base = 8; /* octal, the '0' is a digit */	\
 		}							\
 	}								\
 									\
 	while (1) {							\
 		const unsigned char c = (unsigned char)*ptr;		\
 									\
 		if (isdigit(c))						\
 			digit = c - '0';				\
 		else if (isalpha(c))					\
 			digit = tolower(c) - 'a' + 10;			\
 		else							\
 			break;						\
 									\
 		if (digit >= base)					\
 			break;						\
 									\
 		/* Check before accumulating so nothing can wrap */	\
 		if (value > (max_value - digit) / base)			\
 			return (ERANGE);				\
 		value = value * base + digit;				\
 									\
 		ptr++;							\
 	}								\
 									\
 	*result = minus ? -value : value;				\
 									\
 	if (endptr)							\
 		*endptr = ptr;						\
 									\
 	return (0);							\
 }
 
 define_ddi_strtox(l, long)
 define_ddi_strtox(ull, unsigned long long)
 define_ddi_strtox(ll, long long)
 
 EXPORT_SYMBOL(ddi_strtol);
 EXPORT_SYMBOL(ddi_strtoll);
 EXPORT_SYMBOL(ddi_strtoull);
 
 /*
  * Copy len bytes from `from` to `to`.  When FKIOCTL is set in flags the
  * request is a fake ioctl() issued by the kernel, `from` is a kernel
  * address, and a plain memcpy() is used; otherwise the copy is delegated
  * to copyin() and its result is returned.
  */
 int
 ddi_copyin(const void *from, void *to, size_t len, int flags)
 {
 	if (!(flags & FKIOCTL))
 		return (copyin(from, to, len));
 
 	memcpy(to, from, len);
 
 	return (0);
 }
 EXPORT_SYMBOL(ddi_copyin);
 
 /*
  * Generate the accessors for a module parameter of the given type:
  *
  *   spl_param_get_<type>()  formats the value behind kp->arg with `fmt`
  *                           followed by a newline into buf, bounded by
  *                           PAGE_SIZE, and returns scnprintf()'s result.
  *   spl_param_set_<type>()  parses buf with kstrto<type>() using base 0
  *                           and stores the result behind kp->arg.
  *   spl_param_ops_<type>    kernel_param_ops table wiring the two together.
  *
  * All three symbols are exported.
  */
 #define	define_spl_param(type, fmt)					\
 int									\
 spl_param_get_##type(char *buf, zfs_kernel_param_t *kp)			\
 {									\
 	return (scnprintf(buf, PAGE_SIZE, fmt "\n",			\
 	    *(type *)kp->arg));						\
 }									\
 int									\
 spl_param_set_##type(const char *buf, zfs_kernel_param_t *kp)		\
 {									\
 	return (kstrto##type(buf, 0, (type *)kp->arg));			\
 }									\
 const struct kernel_param_ops spl_param_ops_##type = {			\
 	.set = spl_param_set_##type,					\
 	.get = spl_param_get_##type,					\
 };									\
 EXPORT_SYMBOL(spl_param_get_##type);					\
 EXPORT_SYMBOL(spl_param_set_##type);					\
 EXPORT_SYMBOL(spl_param_ops_##type);
 
 define_spl_param(s64, "%lld")
 define_spl_param(u64, "%llu")
 
 /*
  * Post a KOBJ_CHANGE uevent to userspace for the given block device
  * whenever a new vdev is added to the pool.  This is needed to sync blkid
  * information with udev, which the zed daemon uses during device hotplug
  * to identify the vdev.  A failure to send the event is only logged.
  */
 void
 spl_signal_kobj_evt(struct block_device *bdev)
 {
 #if defined(HAVE_BDEV_KOBJ) || defined(HAVE_PART_TO_DEV)
 	struct kobject *disk_kobj;
 	int ret;
 
 #ifdef HAVE_BDEV_KOBJ
 	disk_kobj = bdev_kobj(bdev);
 #else
 	disk_kobj = &part_to_dev(bdev->bd_part)->kobj;
 #endif
 	if (!disk_kobj)
 		return;
 
 	ret = kobject_uevent(disk_kobj, KOBJ_CHANGE);
 	if (ret) {
 		pr_warn("ZFS: Sending event '%d' to kobject: '%s'"
 		    " (%p): failed(ret:%d)\n", KOBJ_CHANGE,
 		    kobject_name(disk_kobj), disk_kobj, ret);
 	}
 #else
 /*
  * This is encountered if neither bdev_kobj() nor part_to_dev() is available
  * in the kernel - likely due to an API change that needs to be chased down.
  */
 #error "Unsupported kernel: unable to get struct kobj from bdev"
 #endif
 }
 EXPORT_SYMBOL(spl_signal_kobj_evt);
 
 /*
  * Copy len bytes from `from` to `to`.  When FKIOCTL is set in flags the
  * request is a fake ioctl() issued by the kernel, `to` is treated as a
  * kernel address, and a plain memcpy() is used; otherwise the copy is
  * delegated to copyout() and its result is returned.
  */
 int
 ddi_copyout(const void *from, void *to, size_t len, int flags)
 {
 	if (!(flags & FKIOCTL))
 		return (copyout(from, to, len));
 
 	memcpy(to, from, len);
 
 	return (0);
 }
 EXPORT_SYMBOL(ddi_copyout);
 
 /*
  * Fetch the basic attributes of an open file into *stat.  Returns 0 on
  * success, otherwise the negated return value of vfs_getattr().
  */
 static int
 spl_getattr(struct file *filp, struct kstat *stat)
 {
 	int err;
 
 	ASSERT(filp);
 	ASSERT(stat);
 
 	err = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS,
 	    AT_STATX_SYNC_AS_STAT);
 
 	return (err != 0 ? -err : 0);
 }
 
 /*
  * Read the unique system identifier from the /etc/hostid file.
  *
  * The behavior of /usr/bin/hostid on Linux systems with the
  * regular eglibc and coreutils is:
  *
  *   1. Generate the value if the /etc/hostid file does not exist
  *      or if the /etc/hostid file is less than four bytes in size.
  *
  *   2. If the /etc/hostid file is at least 4 bytes, then return
  *      the first four bytes [0..3] in native endian order.
  *
  *   3. Always ignore bytes [4..] if they exist in the file.
  *
  * Only the first four bytes are significant, even on systems that
  * have a 64-bit word size.
  *
  * See:
  *
  *   eglibc: sysdeps/unix/sysv/linux/gethostid.c
  *   coreutils: src/hostid.c
  *
  * Notes:
  *
  * The /etc/hostid file on Solaris is a text file that often reads:
  *
  *   # DO NOT EDIT
  *   "0123456789"
  *
  * Directly copying this file to Linux results in a constant
  * hostid of 4f442023 because the default comment constitutes
  * the first four bytes of the file.
  *
  */
 
 static char *spl_hostid_path = HW_HOSTID_PATH;
 module_param(spl_hostid_path, charp, 0444);
 MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
 
 /*
  * Read the hostid from the file named by spl_hostid_path.
  *
  * On success stores the first bytes of the file, masked with
  * HW_HOSTID_MASK, in *hostid and returns 0.  Returns ENOENT when the file
  * cannot be opened, the spl_getattr() error when its attributes cannot be
  * read, EINVAL when it is smaller than sizeof (HW_HOSTID_MASK), and EIO
  * when the read fails.  *hostid is only written on success.
  */
 static int
 hostid_read(uint32_t *hostid)
 {
 	struct file *filp;
 	struct kstat stat;
 	uint64_t size;
 	uint32_t value = 0;
 	loff_t off = 0;
 	int rc;
 
 	filp = filp_open(spl_hostid_path, 0, 0);
 	if (IS_ERR(filp))
 		return (ENOENT);
 
 	rc = spl_getattr(filp, &stat);
 	if (rc != 0)
 		goto out;
 
 	size = stat.size;
 	// cppcheck-suppress sizeofwithnumericparameter
 	if (size < sizeof (HW_HOSTID_MASK)) {
 		rc = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * Read directly into the variable like eglibc does.
 	 * Short reads are okay; native behavior is preserved.
 	 */
 	if (kernel_read(filp, &value, sizeof (value), &off) < 0) {
 		rc = EIO;
 		goto out;
 	}
 
 	/* Mask down to 32 bits like coreutils does. */
 	*hostid = (value & HW_HOSTID_MASK);
 
 out:
 	/* Single exit: the file is closed on every path past the open. */
 	filp_close(filp, 0);
 
 	return (rc);
 }
 
 /*
  * Return the system hostid.  Preferentially use the spl_hostid module option
  * when set, otherwise use the value in the /etc/hostid file.
  */
 uint32_t
 zone_get_hostid(void *zone)
 {
 	uint32_t hostid;
 
 	ASSERT3P(zone, ==, NULL);
 
 	if (spl_hostid != 0)
 		return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
 
 	if (hostid_read(&hostid) == 0)
 		return (hostid);
 
 	return (0);
 }
 EXPORT_SYMBOL(zone_get_hostid);
 
 /*
  * Bring up the kmem layer and then the vmem layer.  If the vmem layer
  * fails, the kmem layer is torn down again.  Returns 0 or the error from
  * the failing initializer.
  */
 static int
 spl_kvmem_init(void)
 {
 	int rc;
 
 	rc = spl_kmem_init();
 	if (rc != 0)
 		return (rc);
 
 	rc = spl_vmem_init();
 	if (rc != 0)
 		spl_kmem_fini();
 
 	return (rc);
 }
 
 /*
  * Allocate and seed the per-cpu pseudo random state.
  *
  * The seed is 256 bits (four 64-bit words) taken from the system random
  * number generator.  In the improbable case that the seed is all zero we
  * fall back to values derived from jiffies, unless jiffies is also zero,
  * in which case a preprogrammed seed is used.  The seed is advanced with
  * spl_rand_jump() before it is stored for each possible CPU, so every CPU
  * starts from a different point of the sequence.
  *
  * Returns 0 on success or -ENOMEM when the per-cpu area cannot be
  * allocated.
  */
 static int __init
 spl_random_init(void)
 {
 	uint64_t seed[4];
 	int cpu = 0;
 
 	spl_pseudo_entropy = __alloc_percpu(4 * sizeof (uint64_t),
 	    sizeof (uint64_t));
 
 	if (!spl_pseudo_entropy)
 		return (-ENOMEM);
 
 	get_random_bytes(seed, sizeof (seed));
 
 	if ((seed[0] | seed[1] | seed[2] | seed[3]) == 0) {
 		if (jiffies == 0) {
 			(void) memcpy(seed, "improbable seed", 16);
 		} else {
 			seed[0] = jiffies;
 			seed[1] = ~0 - jiffies;
 			seed[2] = ~jiffies;
 			seed[3] = jiffies - ~0;
 		}
 		printk("SPL: get_random_bytes() returned 0 "
 		    "when generating random seed. Setting initial seed to "
 		    "0x%016llx%016llx%016llx%016llx.\n",
 		    cpu_to_be64(seed[0]), cpu_to_be64(seed[1]),
 		    cpu_to_be64(seed[2]), cpu_to_be64(seed[3]));
 	}
 
 	for_each_possible_cpu(cpu) {
 		uint64_t *state = per_cpu_ptr(spl_pseudo_entropy, cpu);
 		int w;
 
 		spl_rand_jump(seed);
 
 		for (w = 0; w < 4; w++)
 			state[w] = seed[w];
 	}
 
 	return (0);
 }
 
 /* Release the per-cpu state allocated by spl_random_init(). */
 static void
 spl_random_fini(void)
 {
 	free_percpu(spl_pseudo_entropy);
 }
 
 /* Tear down the vmem and kmem layers, in the reverse of their init order. */
 static void
 spl_kvmem_fini(void)
 {
 	spl_vmem_fini();
 	spl_kmem_fini();
 }
 
 /*
  * Module entry point: bring up each SPL subsystem in order.  If any step
  * fails, every subsystem initialized before it is torn down in reverse
  * order and the failing step's error is returned.
  */
 static int __init
 spl_init(void)
 {
 	int rc;
 
 	rc = spl_random_init();
 	if (rc != 0)
 		return (rc);
 
 	rc = spl_kvmem_init();
 	if (rc != 0)
 		goto undo_random;
 
 	rc = spl_tsd_init();
 	if (rc != 0)
 		goto undo_kvmem;
 
 	rc = spl_proc_init();
 	if (rc != 0)
 		goto undo_tsd;
 
 	rc = spl_kstat_init();
 	if (rc != 0)
 		goto undo_proc;
 
 	rc = spl_taskq_init();
 	if (rc != 0)
 		goto undo_kstat;
 
 	rc = spl_kmem_cache_init();
 	if (rc != 0)
 		goto undo_taskq;
 
 	rc = spl_zlib_init();
 	if (rc != 0)
 		goto undo_kmem_cache;
 
 	rc = spl_zone_init();
 	if (rc != 0)
 		goto undo_zlib;
 
 	return (rc);
 
 undo_zlib:
 	spl_zlib_fini();
 undo_kmem_cache:
 	spl_kmem_cache_fini();
 undo_taskq:
 	spl_taskq_fini();
 undo_kstat:
 	spl_kstat_fini();
 undo_proc:
 	spl_proc_fini();
 undo_tsd:
 	spl_tsd_fini();
 undo_kvmem:
 	spl_kvmem_fini();
 undo_random:
 	spl_random_fini();
 
 	return (rc);
 }
 
 /*
  * Module exit point: tear down every subsystem in the reverse of the order
  * in which spl_init() brought them up.
  */
 static void __exit
 spl_fini(void)
 {
 	spl_zone_fini();
 	spl_zlib_fini();
 	spl_kmem_cache_fini();
 	spl_taskq_fini();
 	spl_kstat_fini();
 	spl_proc_fini();
 	spl_tsd_fini();
 	spl_kvmem_fini();
 	spl_random_fini();
 }
 
 module_init(spl_init);
 module_exit(spl_fini);
 
 MODULE_DESCRIPTION("Solaris Porting Layer");
 MODULE_AUTHOR(ZFS_META_AUTHOR);
 MODULE_LICENSE("GPL");
 MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c
index 7e806bd5699c..33c7d0879741 100644
--- a/module/os/linux/spl/spl-kmem-cache.c
+++ b/module/os/linux/spl/spl-kmem-cache.c
@@ -1,1446 +1,1444 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #define	SPL_KMEM_CACHE_IMPLEMENTING
 
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/taskq.h>
 #include <sys/timer.h>
 #include <sys/vmem.h>
 #include <sys/wait.h>
 #include <sys/string.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/prefetch.h>
 
 /*
  * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
  * with smp_mb__{before,after}_atomic() because they were redundant. This is
  * only used inside our SLAB allocator, so we implement an internal wrapper
  * here to give us smp_mb__{before,after}_atomic() on older kernels.
  */
 #ifndef smp_mb__before_atomic
 #define	smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
 #endif
 
 #ifndef smp_mb__after_atomic
 #define	smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
 #endif
 
-/* BEGIN CSTYLED */
 /*
  * Cache magazines are an optimization designed to minimize the cost of
  * allocating memory.  They do this by keeping a per-cpu cache of recently
  * freed objects, which can then be reallocated without taking a lock. This
  * can improve performance on highly contended caches.  However, because
  * objects in magazines will prevent otherwise empty slabs from being
  * immediately released this may not be ideal for low memory machines.
  *
  * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
  * magazine size.  When this value is set to 0 the magazine size will be
  * automatically determined based on the object size.  Otherwise magazines
  * will be limited to 2-256 objects per magazine (i.e per cpu).  Magazines
  * may never be entirely disabled in this implementation.
  */
 static unsigned int spl_kmem_cache_magazine_size = 0;
 module_param(spl_kmem_cache_magazine_size, uint, 0444);
 MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
 	"Default magazine size (2-256), set automatically (0)");
 
 /*
  * Target number of objects per slab.  spl_slab_size() multiplies this by
  * the per-object footprint to obtain the desired slab size.
  */
 static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
 
 /*
  * Upper bound on the size of a single slab, in MB.  spl_slab_size()
  * converts it to bytes and reduces the object count of any slab which
  * would otherwise exceed it.
  */
 static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
 module_param(spl_kmem_cache_max_size, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
 
 /*
  * For small objects the Linux slab allocator should be used to make the most
  * efficient use of the memory.  However, large objects are not supported by
  * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
  * of 16K was determined to be optimal for architectures using 4K pages and
  * to also work well on architectures using larger 64K page sizes.
  */
 static unsigned int spl_kmem_cache_slab_limit =
     SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE;
 module_param(spl_kmem_cache_slab_limit, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
 	"Objects less than N bytes use the Linux slab");
 
 /*
  * The number of threads available to allocate new slabs for caches.  This
  * should not need to be tuned but it is available for performance analysis.
  */
 static unsigned int spl_kmem_cache_kmem_threads = 4;
 module_param(spl_kmem_cache_kmem_threads, uint, 0444);
 MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
 	"Number of spl_kmem_cache threads");
-/* END CSTYLED */
 
 /*
  * Slab allocation interfaces
  *
  * While the Linux slab implementation was inspired by the Solaris
  * implementation I cannot use it to emulate the Solaris APIs.  I
  * require two features which are not provided by the Linux slab.
  *
  * 1) Constructors AND destructors.  Recent versions of the Linux
  *    kernel have removed support for destructors.  This is a deal
  *    breaker for the SPL which contains particularly expensive
  *    initializers for mutexes, condition variables, etc.  We also
  *    require a minimal level of cleanup for these data types unlike
  *    many Linux data types which do not need to be explicitly destroyed.
  *
  * 2) Virtual address space backed slab.  Callers of the Solaris slab
  *    expect it to work well for both small and very large allocations.
  *    Because of memory fragmentation the Linux slab which is backed
  *    by kmalloc'ed memory performs very badly when confronted with
  *    large numbers of large allocations.  Basing the slab on the
  *    virtual address space removes the need for contiguous pages
  *    and greatly improves performance for large allocations.
  *
  * For these reasons, the SPL has its own slab implementation with
  * the needed features.  It is not as highly optimized as either the
  * Solaris or Linux slabs, but it should get me most of what is
  * needed until it can be optimized or obsoleted by another approach.
  *
  * One serious concern I do have about this method is the relatively
  * small virtual address space on 32bit arches.  This will seriously
  * constrain the size of the slab caches and their performance.
  */
 
 struct list_head spl_kmem_cache_list;   /* List of caches */
 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 static taskq_t *spl_kmem_cache_taskq;   /* Task queue for aging / reclaim */
 
 /* Forward declaration: spl_cache_flush() calls this ahead of its definition */
 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 
 /*
  * Allocate 'size' bytes of backing memory for a slab of cache 'skc' using
  * spl_vmalloc().  'flags' are KM_* flags; __GFP_RECLAIMABLE is added for
  * KMC_RECLAIMABLE caches.  Returns the allocation, or NULL on failure.
  */
 static void *
 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 {
 	gfp_t gfp = kmem_flags_convert(flags);
 	void *mem;
 
 	if (skc->skc_flags & KMC_RECLAIMABLE)
 		gfp |= __GFP_RECLAIMABLE;
 
 	mem = spl_vmalloc(size, gfp | __GFP_HIGHMEM);
 
 	/* Resulting allocated memory will be page aligned */
 	ASSERT(IS_P2ALIGNED(mem, PAGE_SIZE));
 
 	return (mem);
 }
 
 /*
  * Release slab backing memory of 'size' bytes at 'ptr' with vfree(),
  * crediting the freed pages to the current task's reclaim state when it
  * has one.
  */
 static void
 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 {
 	int pages = size >> PAGE_SHIFT;
 
 	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 
 	/*
 	 * The Linux direct reclaim path uses this out of band value to
 	 * determine if forward progress is being made.  Normally this is
 	 * incremented by kmem_freepages() which is part of the various
 	 * Linux slab implementations.  However, since we are using none
 	 * of that infrastructure we are responsible for incrementing it.
 	 */
 	if (current->reclaim_state) {
 #ifdef	HAVE_RECLAIM_STATE_RECLAIMED
 		current->reclaim_state->reclaimed += pages;
 #else
 		current->reclaim_state->reclaimed_slab += pages;
 #endif
 	}
 
 	vfree(ptr);
 }
 
 /*
  * Bytes reserved at the start of a slab for its spl_kmem_slab_t header,
  * rounded up to the cache's object alignment.
  */
 static inline uint32_t
 spl_sks_size(spl_kmem_cache_t *skc)
 {
 	uint32_t hdr_bytes;
 
 	hdr_bytes = P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
 	    skc->skc_obj_align, uint32_t);
 
 	return (hdr_bytes);
 }
 
 /*
  * Bytes consumed by one object within a slab: the aligned object payload
  * plus the aligned spl_kmem_obj_t which tracks it.
  */
 static inline uint32_t
 spl_obj_size(spl_kmem_cache_t *skc)
 {
 	uint32_t align = skc->skc_obj_align;
 	uint32_t payload, tracking;
 
 	payload = P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t);
 	tracking = P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t);
 
 	return (payload + tracking);
 }
 
 /*
  * Return the cache's skc_obj_total counter.
  * NOTE(review): despite the name this reports the total-object counter
  * rather than skc_obj_alloc -- confirm that is what callers expect.
  */
 uint64_t
 spl_kmem_cache_inuse(kmem_cache_t *cache)
 {
 	return (cache->skc_obj_total);
 }
 EXPORT_SYMBOL(spl_kmem_cache_inuse);
 
 /*
  * Return the object size the cache was created with (skc_obj_size); the
  * alignment padding and tracking overhead are not included.
  */
 uint64_t
 spl_kmem_cache_entry_size(kmem_cache_t *cache)
 {
 	return (cache->skc_obj_size);
 }
 EXPORT_SYMBOL(spl_kmem_cache_entry_size);
 
 /*
  * Map an object address to its spl_kmem_obj_t, which is located directly
  * after the object's alignment-padded payload.
  */
 static inline spl_kmem_obj_t *
 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 {
 	uint32_t payload = P2ROUNDUP_TYPED(skc->skc_obj_size,
 	    skc->skc_obj_align, uint32_t);
 
 	return (obj + payload);
 }
 
 /*
  * It's important that we pack the spl_kmem_obj_t structure and the
  * actual objects in to one large address space to minimize the number
  * of calls to the allocator.  It is far better to do a few large
  * allocations and then subdivide it ourselves.  Now which allocator
  * we use requires balancing a few trade offs.
  *
  * For small objects we use kmem_alloc() because as long as you are
  * only requesting a small number of pages (ideally just one) its cheap.
  * However, when you start requesting multiple pages with kmem_alloc()
  * it gets increasingly expensive since it requires contiguous pages.
  * For this reason we shift to vmem_alloc() for slabs of large objects
  * which removes the need for contiguous pages.  We do not use
  * vmem_alloc() in all cases because there is significant locking
  * overhead in __get_vm_area_node().  This function takes a single
  * global lock when acquiring an available virtual address range which
  * serializes all vmem_alloc()'s for all slab caches.  Using slightly
  * different allocation functions for small and large objects should
  * give us the best of both worlds.
  *
  * +------------------------+
  * | spl_kmem_slab_t --+-+  |
  * | skc_obj_size    <-+ |  |
  * | spl_kmem_obj_t      |  |
  * | skc_obj_size    <---+  |
  * | spl_kmem_obj_t      |  |
  * | ...                 v  |
  * +------------------------+
  */
 static spl_kmem_slab_t *
 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 {
 	spl_kmem_slab_t *slab;
 	void *mem;
 	uint32_t hdr_size, stride;
 
 	mem = kv_alloc(skc, skc->skc_slab_size, flags);
 	if (mem == NULL)
 		return (NULL);
 
 	/* The slab header occupies the very start of the allocation */
 	slab = (spl_kmem_slab_t *)mem;
 	slab->sks_magic = SKS_MAGIC;
 	slab->sks_objs = skc->skc_slab_objs;
 	slab->sks_age = jiffies;
 	slab->sks_cache = skc;
 	INIT_LIST_HEAD(&slab->sks_list);
 	INIT_LIST_HEAD(&slab->sks_free_list);
 	slab->sks_ref = 0;
 
 	hdr_size = spl_sks_size(skc);
 	stride = spl_obj_size(skc);
 
 	/* Carve the remainder into objects and queue each one as free */
 	for (int i = 0; i < slab->sks_objs; i++) {
 		void *obj = mem + hdr_size + (i * stride);
 		spl_kmem_obj_t *sko;
 
 		ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 		sko = spl_sko_from_obj(skc, obj);
 		sko->sko_addr = obj;
 		sko->sko_magic = SKO_MAGIC;
 		sko->sko_slab = slab;
 		INIT_LIST_HEAD(&sko->sko_list);
 		list_add_tail(&sko->sko_list, &slab->sks_free_list);
 	}
 
 	return (slab);
 }
 
 /*
  * Remove a slab from complete or partial list, it must be called with
  * the 'skc->skc_lock' held but the actual free must be performed
  * outside the lock to prevent deadlocking on vmem addresses.
  *
  * sks		slab to detach; must have no referenced objects (sks_ref == 0)
  * sks_list	caller's private list which receives the slab
  * sko_list	caller's private list which receives the slab's free objects
  *
  * No memory is released here; the caller frees whatever was moved to the
  * private lists.
  */
 static void
 spl_slab_free(spl_kmem_slab_t *sks,
     struct list_head *sks_list, struct list_head *sko_list)
 {
 	spl_kmem_cache_t *skc;
 
 	ASSERT(sks->sks_magic == SKS_MAGIC);
 	ASSERT(sks->sks_ref == 0);
 
 	skc = sks->sks_cache;
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 
 	/*
 	 * Update slab/objects counters in the cache, then remove the
 	 * slab from the skc->skc_partial_list.  Finally add the slab
 	 * and all its objects in to the private work lists where the
 	 * destructors will be called and the memory freed to the system.
 	 */
 	skc->skc_obj_total -= sks->sks_objs;
 	skc->skc_slab_total--;
 	list_del(&sks->sks_list);
 	list_add(&sks->sks_list, sks_list);
 	list_splice_init(&sks->sks_free_list, sko_list);
 }
 
 /*
  * Reclaim empty slabs at the end of the partial list.
  *
  * Slabs with no referenced objects are detached under skc->skc_lock and
  * their memory is then returned with kv_free() after the lock is dropped.
  */
 static void
 spl_slab_reclaim(spl_kmem_cache_t *skc)
 {
 	spl_kmem_slab_t *sks = NULL, *m = NULL;
 	spl_kmem_obj_t *sko = NULL, *n = NULL;
 	LIST_HEAD(sks_list);
 	LIST_HEAD(sko_list);
 
 	/*
 	 * Empty slabs and objects must be moved to a private list so they
 	 * can be safely freed outside the spin lock.  All empty slabs are
 	 * at the end of skc->skc_partial_list, therefore once a non-empty
 	 * slab is found we can stop scanning.
 	 */
 	spin_lock(&skc->skc_lock);
 	list_for_each_entry_safe_reverse(sks, m,
 	    &skc->skc_partial_list, sks_list) {
 
 		if (sks->sks_ref > 0)
 			break;
 
 		spl_slab_free(sks, &sks_list, &sko_list);
 	}
 	spin_unlock(&skc->skc_lock);
 
 	/*
 	 * Both loops below run outside skc->skc_lock.  The first only
 	 * sanity checks the magic of every detached object; no destructor
 	 * is invoked here.  The second returns each detached slab, and with
 	 * it the objects it contains, to the system.
 	 */
 
 	list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
 		ASSERT(sko->sko_magic == SKO_MAGIC);
 	}
 
 	list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
 		ASSERT(sks->sks_magic == SKS_MAGIC);
 		kv_free(skc, sks, skc->skc_slab_size);
 	}
 }
 
 /*
  * Look up the emergency object whose address is 'obj' in the red black
  * tree rooted at 'root'.  Returns its tracking entry, or NULL when the
  * address is not in the tree.
  */
 static spl_kmem_emergency_t *
 spl_emergency_search(struct rb_root *root, void *obj)
 {
 	unsigned long key = (unsigned long)obj;
 	struct rb_node *cur;
 
 	for (cur = root->rb_node; cur != NULL; ) {
 		spl_kmem_emergency_t *entry =
 		    container_of(cur, spl_kmem_emergency_t, ske_node);
 
 		if (key == entry->ske_obj)
 			return (entry);
 
 		/* Entries are ordered by object address */
 		cur = (key < entry->ske_obj) ? cur->rb_left : cur->rb_right;
 	}
 
 	return (NULL);
 }
 
 /*
  * Insert 'ske' into the red black tree rooted at 'root', keyed by its
  * object address.  Returns 1 when the entry was linked in, or 0 when an
  * entry with the same address already exists (the tree is left untouched).
  */
 static int
 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
 {
 	struct rb_node **link = &(root->rb_node);
 	struct rb_node *parent = NULL;
 	unsigned long key = ske->ske_obj;
 
 	/* Descend to the empty slot where the new node belongs */
 	while (*link != NULL) {
 		spl_kmem_emergency_t *entry =
 		    container_of(*link, spl_kmem_emergency_t, ske_node);
 
 		if (key == entry->ske_obj)
 			return (0);
 
 		parent = *link;
 		if (key < entry->ske_obj)
 			link = &(parent->rb_left);
 		else
 			link = &(parent->rb_right);
 	}
 
 	rb_link_node(&ske->ske_node, parent, link);
 	rb_insert_color(&ske->ske_node, root);
 
 	return (1);
 }
 
 /*
  * Allocate a single emergency object directly from the page allocator and
  * track it in the cache's red black tree so it can be found when freed.
  *
  * Returns 0 with *obj set on success, -EEXIST when a partial slab has
  * appeared in the meantime (nothing is allocated), -ENOMEM when either
  * allocation fails, or -EINVAL when the address is already being tracked.
  */
 static int
 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 {
 	gfp_t lflags = kmem_flags_convert(flags);
 	spl_kmem_emergency_t *ske;
 	int order = get_order(skc->skc_obj_size);
 	int have_partial, inserted;
 
 	/* Last chance use a partial slab if one now exists */
 	spin_lock(&skc->skc_lock);
 	have_partial = !list_empty(&skc->skc_partial_list);
 	spin_unlock(&skc->skc_lock);
 	if (have_partial)
 		return (-EEXIST);
 
 	if (skc->skc_flags & KMC_RECLAIMABLE)
 		lflags |= __GFP_RECLAIMABLE;
 
 	ske = kmalloc(sizeof (*ske), lflags);
 	if (ske == NULL)
 		return (-ENOMEM);
 
 	ske->ske_obj = __get_free_pages(lflags, order);
 	if (ske->ske_obj == 0) {
 		kfree(ske);
 		return (-ENOMEM);
 	}
 
 	/* Publish the object and account for it under the cache lock */
 	spin_lock(&skc->skc_lock);
 	inserted = spl_emergency_insert(&skc->skc_emergency_tree, ske);
 	if (likely(inserted)) {
 		skc->skc_obj_total++;
 		skc->skc_obj_emergency++;
 		if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
 			skc->skc_obj_emergency_max = skc->skc_obj_emergency;
 	}
 	spin_unlock(&skc->skc_lock);
 
 	if (unlikely(!inserted)) {
 		free_pages(ske->ske_obj, order);
 		kfree(ske);
 		return (-EINVAL);
 	}
 
 	*obj = (void *)ske->ske_obj;
 
 	return (0);
 }
 
 /*
  * Find the emergency object 'obj' in the cache's red black tree, remove it
  * and release its pages.  Returns 0 on success or -ENOENT when 'obj' is
  * not a tracked emergency object.
  */
 static int
 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 {
 	int order = get_order(skc->skc_obj_size);
 	spl_kmem_emergency_t *entry;
 
 	/* Unlink and adjust the counters under the cache lock */
 	spin_lock(&skc->skc_lock);
 	entry = spl_emergency_search(&skc->skc_emergency_tree, obj);
 	if (entry != NULL) {
 		rb_erase(&entry->ske_node, &skc->skc_emergency_tree);
 		skc->skc_obj_emergency--;
 		skc->skc_obj_total--;
 	}
 	spin_unlock(&skc->skc_lock);
 
 	if (entry != NULL) {
 		/* The memory itself is released outside the lock */
 		free_pages(entry->ske_obj, order);
 		kfree(entry);
 		return (0);
 	}
 
 	return (-ENOENT);
 }
 
 /*
  * Release objects from the per-cpu magazine back to their slab.  The flush
  * argument contains the max number of entries to remove from the magazine.
  *
  * Objects are taken from the front of skm_objs[] (index 0 upward); the
  * entries which remain are then shifted down to the start of the array.
  * skc->skc_lock is taken and released internally.
  */
 static void
 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 {
 	spin_lock(&skc->skc_lock);
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	/* Never flush more than the magazine currently holds */
 	int count = MIN(flush, skm->skm_avail);
 	for (int i = 0; i < count; i++)
 		spl_cache_shrink(skc, skm->skm_objs[i]);
 
 	/* Source and destination overlap, hence memmove() */
 	skm->skm_avail -= count;
 	memmove(skm->skm_objs, &(skm->skm_objs[count]),
 	    sizeof (void *) * skm->skm_avail);
 
 	spin_unlock(&skc->skc_lock);
 }
 
 /*
  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
  * for very small objects we may end up with more than this so as not
  * to waste space in the minimal allocation of a single page.
  *
  * skc		cache being sized
  * objs	out: number of objects per slab
  * size	out: slab size in bytes, including the slab header
  *
  * Returns 0 on success, or -ENOSPC when not even a single object fits
  * within the spl_kmem_cache_max_size limit; the outputs are then left
  * unmodified.
  */
 static int
 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 {
 	uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
 
 	sks_size = spl_sks_size(skc);
 	obj_size = spl_obj_size(skc);
 	max_size = (spl_kmem_cache_max_size * 1024 * 1024);
 	tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
 
 	if (tgt_size <= max_size) {
 		tgt_objs = (tgt_size - sks_size) / obj_size;
 	} else {
 		/*
 		 * spl_kmem_cache_max_size is writable at runtime, so the
 		 * limit may be smaller than the slab header itself (e.g. 0).
 		 * Bail out before the unsigned subtraction below wraps and
 		 * yields a nonsensical object count and slab size.
 		 */
 		if (max_size <= sks_size)
 			return (-ENOSPC);
 
 		tgt_objs = (max_size - sks_size) / obj_size;
 		tgt_size = (tgt_objs * obj_size) + sks_size;
 	}
 
 	if (tgt_objs == 0)
 		return (-ENOSPC);
 
 	*objs = tgt_objs;
 	*size = tgt_size;
 
 	return (0);
 }
 
 /*
  * Pick the number of objects a per-cpu magazine may hold.  A non-zero
  * spl_kmem_cache_magazine_size tunable wins, clamped to the range 2-256.
  * Otherwise the size is chosen from the per-object footprint: the larger
  * the object, the fewer of them are cached per magazine.
  */
 static int
 spl_magazine_size(spl_kmem_cache_t *skc)
 {
 	uint32_t footprint = spl_obj_size(skc);
 
 	if (spl_kmem_cache_magazine_size > 0)
 		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
 
 	/* Per-magazine sizes below assume a 4Kib page size */
 	if (footprint > (PAGE_SIZE * 256))
 		return (4);	/* Minimum 4Mib per-magazine */
 
 	if (footprint > (PAGE_SIZE * 32))
 		return (16);	/* Minimum 2Mib per-magazine */
 
 	if (footprint > (PAGE_SIZE))
 		return (64);	/* Minimum 256Kib per-magazine */
 
 	if (footprint > (PAGE_SIZE / 4))
 		return (128);	/* Minimum 128Kib per-magazine */
 
 	return (256);
 }
 
 /*
  * Allocate and initialize an empty magazine for 'cpu', placing it on that
  * cpu's memory node.  The magazine has room for skc_mag_size object
  * pointers.  Returns NULL when the allocation fails.
  */
 static spl_kmem_magazine_t *
 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
 {
 	spl_kmem_magazine_t *mag;
 	int bytes = sizeof (spl_kmem_magazine_t) +
 	    sizeof (void *) * skc->skc_mag_size;
 
 	mag = kmalloc_node(bytes, GFP_KERNEL, cpu_to_node(cpu));
 	if (mag == NULL)
 		return (NULL);
 
 	mag->skm_magic = SKM_MAGIC;
 	mag->skm_avail = 0;
 	mag->skm_size = skc->skc_mag_size;
 	mag->skm_refill = skc->skc_mag_refill;
 	mag->skm_cache = skc;
 	mag->skm_cpu = cpu;
 
 	return (mag);
 }
 
 /*
  * Free a per-cpu magazine associated with a specific core.  The magazine
  * must already be empty (skm_avail == 0); any objects it holds have to be
  * flushed by the caller first.
  */
 static void
 spl_magazine_free(spl_kmem_magazine_t *skm)
 {
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 	ASSERT(skm->skm_avail == 0);
 	kfree(skm);
 }
 
 /*
  * Create all per-cpu magazines of reasonable sizes.
  *
  * Allocates the skc_mag pointer array, derives the magazine size and
  * refill count for the cache, and then allocates one magazine for every
  * possible cpu.
  *
  * Returns 0 on success or -ENOMEM when any allocation fails, in which
  * case everything allocated here is released again.
  */
 static int
 spl_magazine_create(spl_kmem_cache_t *skc)
 {
 	int i = 0;
 
 	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
 
 	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
 	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
 	/* The array is dereferenced below, so a failure must stop here */
 	if (skc->skc_mag == NULL)
 		return (-ENOMEM);
 
 	skc->skc_mag_size = spl_magazine_size(skc);
 	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 
 	for_each_possible_cpu(i) {
 		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
 		if (!skc->skc_mag[i]) {
 			/*
 			 * Roll back the magazines created so far.  Slots
 			 * which were never populated are still NULL from
 			 * kzalloc() and are skipped.
 			 */
 			for (i--; i >= 0; i--) {
 				if (skc->skc_mag[i] != NULL)
 					spl_magazine_free(skc->skc_mag[i]);
 			}
 
 			kfree(skc->skc_mag);
 			skc->skc_mag = NULL;
 			return (-ENOMEM);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Destroy all per-cpu magazines.
  *
  * Every magazine is first flushed so its cached objects go back to their
  * slabs, then freed; finally the skc_mag pointer array itself is freed.
  */
 static void
 spl_magazine_destroy(spl_kmem_cache_t *skc)
 {
 	spl_kmem_magazine_t *skm;
 	int i = 0;
 
 	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
 
 	for_each_possible_cpu(i) {
 		skm = skc->skc_mag[i];
 		/* Flush everything; spl_magazine_free() requires it empty */
 		spl_cache_flush(skc, skm, skm->skm_avail);
 		spl_magazine_free(skm);
 	}
 
 	kfree(skc->skc_mag);
 }
 
 /*
  * Create an object cache based on the following arguments:
  * name		cache name
  * size		cache object size
  * align	cache object alignment
  * ctor		cache object constructor
  * dtor		cache object destructor
  * reclaim	cache object reclaim (unsupported, must be NULL)
  * priv		cache private data for ctor/dtor/reclaim
  * vmp		unused must be NULL
  * flags
  *	KMC_KVMEM       Force kvmem backed SPL cache
  *	KMC_SLAB        Force Linux slab backed cache
  *	KMC_NODEBUG	Disable debugging (unsupported)
  *	KMC_RECLAIMABLE	Memory can be freed under pressure
  *
  * Returns the new cache, which has also been added to the global
  * spl_kmem_cache_list, or NULL on failure.  On failure everything which
  * was allocated along the way is released.
  */
 spl_kmem_cache_t *
 spl_kmem_cache_create(const char *name, size_t size, size_t align,
     spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
     void *priv, void *vmp, int flags)
 {
 	gfp_t lflags = kmem_flags_convert(KM_SLEEP);
 	spl_kmem_cache_t *skc;
 	int rc;
 
 	/*
 	 * Unsupported flags
 	 */
 	ASSERT(vmp == NULL);
 	ASSERT(reclaim == NULL);
 
 	might_sleep();
 
 	skc = kzalloc(sizeof (*skc), lflags);
 	if (skc == NULL)
 		return (NULL);
 
 	skc->skc_magic = SKC_MAGIC;
 	skc->skc_name_size = strlen(name) + 1;
 	skc->skc_name = kmalloc(skc->skc_name_size, lflags);
 	if (skc->skc_name == NULL) {
 		kfree(skc);
 		return (NULL);
 	}
 	strlcpy(skc->skc_name, name, skc->skc_name_size);
 
 	skc->skc_ctor = ctor;
 	skc->skc_dtor = dtor;
 	skc->skc_private = priv;
 	skc->skc_vmp = vmp;
 	skc->skc_linux_cache = NULL;
 	skc->skc_flags = flags;
 	skc->skc_obj_size = size;
 	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
 	atomic_set(&skc->skc_ref, 0);
 
 	INIT_LIST_HEAD(&skc->skc_list);
 	INIT_LIST_HEAD(&skc->skc_complete_list);
 	INIT_LIST_HEAD(&skc->skc_partial_list);
 	skc->skc_emergency_tree = RB_ROOT;
 	spin_lock_init(&skc->skc_lock);
 	init_waitqueue_head(&skc->skc_waitq);
 	skc->skc_slab_fail = 0;
 	skc->skc_slab_create = 0;
 	skc->skc_slab_destroy = 0;
 	skc->skc_slab_total = 0;
 	skc->skc_slab_alloc = 0;
 	skc->skc_slab_max = 0;
 	skc->skc_obj_total = 0;
 	skc->skc_obj_alloc = 0;
 	skc->skc_obj_max = 0;
 	skc->skc_obj_deadlock = 0;
 	skc->skc_obj_emergency = 0;
 	skc->skc_obj_emergency_max = 0;
 
 	rc = percpu_counter_init(&skc->skc_linux_alloc, 0, GFP_KERNEL);
 	if (rc != 0) {
 		/*
 		 * The counter was never set up, so the common 'out' path
 		 * (which destroys it) cannot be used; free the name and
 		 * the cache by hand instead of leaking the name.
 		 */
 		kfree(skc->skc_name);
 		kfree(skc);
 		return (NULL);
 	}
 
 	/*
 	 * Verify the requested alignment restriction is sane.
 	 */
 	if (align) {
 		VERIFY(ISP2(align));
 		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
 		VERIFY3U(align, <=, PAGE_SIZE);
 		skc->skc_obj_align = align;
 	}
 
 	/*
 	 * When no specific type of slab is requested (kmem, vmem, or
 	 * linuxslab) then select a cache type based on the object size
 	 * and default tunables.
 	 */
 	if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) {
 		if (spl_kmem_cache_slab_limit &&
 		    size <= (size_t)spl_kmem_cache_slab_limit) {
 			/*
 			 * Objects smaller than spl_kmem_cache_slab_limit can
 			 * use the Linux slab for better space-efficiency.
 			 */
 			skc->skc_flags |= KMC_SLAB;
 		} else {
 			/*
 			 * All other objects are considered large and are
 			 * placed on kvmem backed slabs.
 			 */
 			skc->skc_flags |= KMC_KVMEM;
 		}
 	}
 
 	/*
 	 * Given the type of slab allocate the required resources.
 	 */
 	if (skc->skc_flags & KMC_KVMEM) {
 		rc = spl_slab_size(skc,
 		    &skc->skc_slab_objs, &skc->skc_slab_size);
 		if (rc)
 			goto out;
 
 		rc = spl_magazine_create(skc);
 		if (rc)
 			goto out;
 	} else {
 		unsigned long slabflags = 0;
 
 		/* A forced KMC_SLAB cache must still respect the limit */
 		if (size > spl_kmem_cache_slab_limit)
 			goto out;
 
 		if (skc->skc_flags & KMC_RECLAIMABLE)
 			slabflags |= SLAB_RECLAIM_ACCOUNT;
 
 		skc->skc_linux_cache = kmem_cache_create_usercopy(
 		    skc->skc_name, size, align, slabflags, 0, size, NULL);
 		if (skc->skc_linux_cache == NULL)
 			goto out;
 	}
 
 	down_write(&spl_kmem_cache_sem);
 	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
 	up_write(&spl_kmem_cache_sem);
 
 	return (skc);
 out:
 	kfree(skc->skc_name);
 	percpu_counter_destroy(&skc->skc_linux_alloc);
 	kfree(skc);
 	return (NULL);
 }
 EXPORT_SYMBOL(spl_kmem_cache_create);
 
 /*
  * Register a move callback for cache defragmentation.
  * XXX: Unimplemented but harmless to stub out for now.
  *
  * The callback is only asserted to be non-NULL; it is not stored and
  * 'skc' is not touched.
  */
 void
 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
     kmem_cbrc_t (move)(void *, void *, size_t, void *))
 {
 	ASSERT(move != NULL);
 }
 EXPORT_SYMBOL(spl_kmem_cache_set_move);
 
 /*
  * Destroy a cache and all objects associated with the cache.
  *
  * The cache is unlinked from the global list, flagged KMC_BIT_DESTROY,
  * its pending task is cancelled, and once no references remain its
  * backing storage (magazines and slabs, or the Linux slab cache) and the
  * cache structure itself are released.  All objects are expected to have
  * been freed by the caller beforehand; this is only checked by ASSERTs.
  */
 void
 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 {
 	DECLARE_WAIT_QUEUE_HEAD(wq);
 	taskqid_t id;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB));
 
 	/* Make the cache unreachable through the global cache list */
 	down_write(&spl_kmem_cache_sem);
 	list_del_init(&skc->skc_list);
 	up_write(&spl_kmem_cache_sem);
 
 	/* Cancel, and wait for, any pending delayed tasks */
 	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	spin_lock(&skc->skc_lock);
 	id = skc->skc_taskqid;
 	spin_unlock(&skc->skc_lock);
 
 	taskq_cancel_id(spl_kmem_cache_taskq, id);
 
 	/*
 	 * Wait until all current callers complete, this is mainly
 	 * to catch the case where a low memory situation triggers a
 	 * cache reaping action which races with this destroy.
 	 *
 	 * NOTE(review): 'wq' is local to this function and nothing visible
 	 * here ever wakes it, so if skc_ref is non-zero on arrival it is
 	 * unclear what ends the wait -- confirm.
 	 */
 	wait_event(wq, atomic_read(&skc->skc_ref) == 0);
 
 	if (skc->skc_flags & KMC_KVMEM) {
 		/* Flush the magazines first so the slabs become empty */
 		spl_magazine_destroy(skc);
 		spl_slab_reclaim(skc);
 	} else {
 		ASSERT(skc->skc_flags & KMC_SLAB);
 		kmem_cache_destroy(skc->skc_linux_cache);
 	}
 
 	spin_lock(&skc->skc_lock);
 
 	/*
 	 * Validate there are no objects in use and free all the
 	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
 	 */
 	ASSERT3U(skc->skc_slab_alloc, ==, 0);
 	ASSERT3U(skc->skc_obj_alloc, ==, 0);
 	ASSERT3U(skc->skc_slab_total, ==, 0);
 	ASSERT3U(skc->skc_obj_total, ==, 0);
 	ASSERT3U(skc->skc_obj_emergency, ==, 0);
 	ASSERT(list_empty(&skc->skc_complete_list));
 
 	ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
 	percpu_counter_destroy(&skc->skc_linux_alloc);
 
 	spin_unlock(&skc->skc_lock);
 
 	kfree(skc->skc_name);
 	kfree(skc);
 }
 EXPORT_SYMBOL(spl_kmem_cache_destroy);
 
 /*
  * Allocate an object from a slab attached to the cache.  This is used to
  * repopulate the per-cpu magazine caches in batches when they run low.
  *
  * The first entry of the slab's free list is taken, so that list must not
  * be empty.  The slab's reference count and the cache's allocation
  * statistics are updated, and the object's address is returned.
  * spl_cache_refill() calls this with skc->skc_lock held.
  */
 static void *
 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
 {
 	spl_kmem_obj_t *sko;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(sks->sks_magic == SKS_MAGIC);
 
 	sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
 	ASSERT(sko->sko_magic == SKO_MAGIC);
 	ASSERT(sko->sko_addr != NULL);
 
 	/* Remove from sks_free_list */
 	list_del_init(&sko->sko_list);
 
 	sks->sks_age = jiffies;
 	sks->sks_ref++;
 	skc->skc_obj_alloc++;
 
 	/* Track max obj usage statistics */
 	if (skc->skc_obj_alloc > skc->skc_obj_max)
 		skc->skc_obj_max = skc->skc_obj_alloc;
 
 	/* Track max slab usage statistics */
 	if (sks->sks_ref == 1) {
 		/* First object handed out: the slab just became in use */
 		skc->skc_slab_alloc++;
 
 		if (skc->skc_slab_alloc > skc->skc_slab_max)
 			skc->skc_slab_max = skc->skc_slab_alloc;
 	}
 
 	return (sko->sko_addr);
 }
 
 /*
  * Generic slab allocation function to run by the global work queues.
  * It is responsible for allocating a new slab and linking it in to the
  * list of partial slabs.  Waking any waiters is left to the caller.
  *
  * On success the cache's slab and object totals are increased and
  * KMC_BIT_DEADLOCKED is cleared.  Returns 0 on success or -ENOMEM when
  * the slab could not be allocated.
  */
 static int
 __spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 {
 	spl_kmem_slab_t *sks;
 
 	/*
 	 * NOTE(review): presumably the fstrans mark keeps the allocation
 	 * from re-entering filesystem reclaim -- confirm against
 	 * spl_fstrans_mark().
 	 */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	sks = spl_slab_alloc(skc, flags);
 	spl_fstrans_unmark(cookie);
 
 	spin_lock(&skc->skc_lock);
 	if (sks) {
 		skc->skc_slab_total++;
 		skc->skc_obj_total += sks->sks_objs;
 		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
 
 		smp_mb__before_atomic();
 		clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
 		smp_mb__after_atomic();
 	}
 	spin_unlock(&skc->skc_lock);
 
 	return (sks == NULL ? -ENOMEM : 0);
 }
 
 /*
  * Taskq callback which performs the asynchronous slab allocation
  * dispatched by spl_cache_grow().
  *
  * data	the spl_kmem_alloc_t built by spl_cache_grow(); freed here
  *
  * Once the allocation attempt is over KMC_BIT_GROWING is cleared and, if
  * a slab was added, the threads sleeping on skc_waitq are woken.
  *
  * The reference spl_cache_grow() took on the cache is dropped only after
  * the last access to 'skc'.  spl_kmem_cache_destroy() releases the cache
  * as soon as it observes skc_ref == 0, so dropping the reference any
  * earlier would allow the flag update and wake-up below to touch memory
  * which has already been freed.
  */
 static void
 spl_cache_grow_work(void *data)
 {
 	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
 	spl_kmem_cache_t *skc = ska->ska_cache;
 
 	int error = __spl_cache_grow(skc, ska->ska_flags);
 
 	smp_mb__before_atomic();
 	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
 	smp_mb__after_atomic();
 	if (error == 0)
 		wake_up_all(&skc->skc_waitq);
 
 	/* Order every access to skc above before releasing the reference */
 	smp_mb__before_atomic();
 	atomic_dec(&skc->skc_ref);
 
 	kfree(ska);
 }
 
 /*
  * Returns non-zero when a new slab should be available.
  *
  * This is the wait condition used by spl_cache_grow(): it becomes true
  * once KMC_BIT_GROWING is no longer set, i.e. no slab allocation is in
  * flight for the cache.
  */
 static int
 spl_cache_grow_wait(spl_kmem_cache_t *skc)
 {
 	return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
 }
 
 /*
  * No available objects on any slabs, create a new slab.  Note that this
  * functionality is disabled for KMC_SLAB caches which are backed by the
  * Linux slab.
  *
  * skc		cache to grow
  * flags	KM_* allocation flags (public flags only)
  * obj		out: set to NULL, or to an emergency object when one was
  *		allocated for immediate use
  *
  * Returns 0 only when an emergency object was placed in *obj.  Otherwise
  * a negative errno is returned: the result of spl_emergency_alloc() for
  * KM_NOSLEEP or deadlocked caches, the wait_on_bit() error or -EAGAIN
  * after waiting for a reap, and -ENOMEM in all remaining cases -- which
  * includes the case where the asynchronous grow was waited for, so the
  * caller is expected to recheck the cache rather than treat it as final.
  */
 static int
 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 {
 	int remaining, rc = 0;
 
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
 
 	*obj = NULL;
 
 	/*
 	 * Since we can't sleep attempt an emergency allocation to satisfy
 	 * the request.  The only alternative is to fail the allocation but
 	 * it's preferable to try.  The use of KM_NOSLEEP is expected to be
 	 * rare.
 	 */
 	if (flags & KM_NOSLEEP)
 		return (spl_emergency_alloc(skc, flags, obj));
 
 	might_sleep();
 
 	/*
 	 * Before allocating a new slab wait for any reaping to complete and
 	 * then return so the local magazine can be rechecked for new objects.
 	 */
 	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
 		rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
 		    TASK_UNINTERRUPTIBLE);
 		return (rc ? rc : -EAGAIN);
 	}
 
 	/*
 	 * Note: It would be nice to reduce the overhead of context switch
 	 * and improve NUMA locality, by trying to allocate a new slab in the
 	 * current process context with KM_NOSLEEP flag.
 	 *
 	 * However, this can't be applied to vmem/kvmem due to a bug that
 	 * spl_vmalloc() doesn't honor gfp flags in page table allocation.
 	 */
 
 	/*
 	 * This is handled by dispatching a work request to the global work
 	 * queue.  This allows us to asynchronously allocate a new slab while
 	 * retaining the ability to safely fall back to a smaller synchronous
 	 * allocations to ensure forward progress is always maintained.
 	 *
 	 * Only the thread which wins KMC_BIT_GROWING dispatches the request;
 	 * everyone else goes straight to the wait below.
 	 */
 	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
 		spl_kmem_alloc_t *ska;
 
 		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
 		if (ska == NULL) {
 			/* Nothing was dispatched; release the GROWING bit */
 			clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
 			smp_mb__after_atomic();
 			wake_up_all(&skc->skc_waitq);
 			return (-ENOMEM);
 		}
 
 		/* Reference is dropped by spl_cache_grow_work() */
 		atomic_inc(&skc->skc_ref);
 		ska->ska_cache = skc;
 		ska->ska_flags = flags;
 		taskq_init_ent(&ska->ska_tqe);
 		taskq_dispatch_ent(spl_kmem_cache_taskq,
 		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
 	}
 
 	/*
 	 * The goal here is to only detect the rare case where a virtual slab
 	 * allocation has deadlocked.  We must be careful to minimize the use
 	 * of emergency objects which are more expensive to track.  Therefore,
 	 * we wait up to HZ / 10 jiffies for the asynchronous allocation and
 	 * if the timeout is reached while it is still in progress the cache
 	 * is flagged as deadlocked.  From this point only new emergency
 	 * objects will be allocated until the asynchronous allocation
 	 * completes and clears the deadlocked flag.
 	 */
 	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
 		rc = spl_emergency_alloc(skc, flags, obj);
 	} else {
 		remaining = wait_event_timeout(skc->skc_waitq,
 		    spl_cache_grow_wait(skc), HZ / 10);
 
 		if (!remaining) {
 			spin_lock(&skc->skc_lock);
 			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
 				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
 				skc->skc_obj_deadlock++;
 			}
 			spin_unlock(&skc->skc_lock);
 		}
 
 		rc = -ENOMEM;
 	}
 
 	return (rc);
 }
 
 /*
  * Refill a per-cpu magazine with objects from the slabs for this cache.
  * Ideally the magazine can be repopulated using existing objects which have
  * been released, however if we are unable to locate enough free objects new
  * slabs of objects will be created.  On success NULL is returned, otherwise
  * the address of a single emergency object is returned for use by the caller.
  *
  * NULL is also returned, without the magazine necessarily having been
  * refilled, when spl_cache_grow() reports an error or when the magazine
  * passed in is no longer the one belonging to the current CPU after the
  * cache was grown.  Callers therefore have to re-check the magazine.
  *
  * skc_lock is taken and released internally.  Interrupts are enabled only
  * around the call to spl_cache_grow() and disabled again afterwards.
  */
 static void *
 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
 {
 	spl_kmem_slab_t *sks;
 	int count = 0, rc, refill;
 	void *obj = NULL;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	/* Never ask for more objects than the magazine has room for */
 	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
 	spin_lock(&skc->skc_lock);
 
 	while (refill > 0) {
 		/* No slabs available we may need to grow the cache */
 		if (list_empty(&skc->skc_partial_list)) {
 			/* The lock is dropped for the duration of the grow */
 			spin_unlock(&skc->skc_lock);
 
 			local_irq_enable();
 			rc = spl_cache_grow(skc, flags, &obj);
 			local_irq_disable();
 
 			/* Emergency object for immediate use by caller */
 			if (rc == 0 && obj != NULL)
 				return (obj);
 
 			/* skc_lock is not held here, so bypass the unlock */
 			if (rc)
 				goto out;
 
 			/* Rescheduled to different CPU skm is not local */
 			if (skm != skc->skc_mag[smp_processor_id()])
 				goto out;
 
 			/*
 			 * Potentially rescheduled to the same CPU but
 			 * allocations may have occurred from this CPU while
 			 * we were sleeping so recalculate max refill.
 			 */
 			refill = MIN(refill, skm->skm_size - skm->skm_avail);
 
 			/* Retake the lock and look at the partial list again */
 			spin_lock(&skc->skc_lock);
 			continue;
 		}
 
 		/* Grab the next available slab */
 		sks = list_entry((&skc->skc_partial_list)->next,
 		    spl_kmem_slab_t, sks_list);
 		ASSERT(sks->sks_magic == SKS_MAGIC);
 		ASSERT(sks->sks_ref < sks->sks_objs);
 		ASSERT(!list_empty(&sks->sks_free_list));
 
 		/*
 		 * Consume as many objects as needed to refill the requested
 		 * cache.  We must also be careful not to overfill it.
 		 */
 		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
 		    ++count) {
 			ASSERT(skm->skm_avail < skm->skm_size);
 			ASSERT(count < skm->skm_size);
 			skm->skm_objs[skm->skm_avail++] =
 			    spl_cache_obj(skc, sks);
 		}
 
 		/* Move slab to skc_complete_list when full */
 		if (sks->sks_ref == sks->sks_objs) {
 			list_del(&sks->sks_list);
 			list_add(&sks->sks_list, &skc->skc_complete_list);
 		}
 	}
 
 	spin_unlock(&skc->skc_lock);
 	/* Reached directly (lock already released) from the grow path above */
 out:
 	return (NULL);
 }
 
 /*
  * Hand a single object back to the slab that owns it and fix up the slab's
  * position on the cache's partial list.
  */
 static void
 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
 {
 	spl_kmem_obj_t *object;
 	spl_kmem_slab_t *slab;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 
 	object = spl_sko_from_obj(skc, obj);
 	ASSERT(object->sko_magic == SKO_MAGIC);
 
 	slab = object->sko_slab;
 	ASSERT(slab->sks_magic == SKS_MAGIC);
 	ASSERT(slab->sks_cache == skc);
 
 	/* Object goes back on the slab's free list, then fix the counters. */
 	list_add(&object->sko_list, &slab->sks_free_list);
 	slab->sks_age = jiffies;
 	slab->sks_ref -= 1;
 	skc->skc_obj_alloc -= 1;
 
 	/*
 	 * The slab was full until this release: put it at the head of
 	 * skc_partial_list.  Adding at the head keeps the partial list in
 	 * quasi-full sorted order, fuller slabs first, emptier ones last.
 	 */
 	if ((slab->sks_objs - 1) == slab->sks_ref) {
 		list_del(&slab->sks_list);
 		list_add(&slab->sks_list, &skc->skc_partial_list);
 	}
 
 	/*
 	 * The slab no longer holds any allocated object: park it at the
 	 * tail of the partial list where reclamation can find it easily.
 	 */
 	if (slab->sks_ref == 0) {
 		list_del(&slab->sks_list);
 		list_add_tail(&slab->sks_list, &skc->skc_partial_list);
 		skc->skc_slab_alloc -= 1;
 	}
 }
 
 /*
  * Allocate an object from the per-cpu magazine, or if the magazine
  * is empty directly allocate from a slab and repopulate the magazine.
  *
  * skc:   cache to allocate from; must not be marked KMC_BIT_DESTROY.
  * flags: KM_* allocation flags, restricted to KM_PUBLIC_MASK.
  *
  * Returns the object, with the cache constructor (if any) already applied.
  * NULL can only be returned when KM_NOSLEEP is set in flags; without it
  * both the KMC_SLAB path and the magazine path keep retrying.
  */
 void *
 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
 {
 	spl_kmem_magazine_t *skm;
 	void *obj = NULL;
 
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	/*
 	 * Allocate directly from a Linux slab.  All optimizations are left
 	 * to the underlying cache we only need to guarantee that KM_SLEEP
 	 * callers will never fail.
 	 */
 	if (skc->skc_flags & KMC_SLAB) {
 		struct kmem_cache *slc = skc->skc_linux_cache;
 		do {
 			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
 		} while ((obj == NULL) && !(flags & KM_NOSLEEP));
 
 		if (obj != NULL) {
 			/*
 			 * Even though we leave everything up to the
 			 * underlying cache we still keep track of
 			 * how many objects we've allocated in it for
 			 * better debuggability.
 			 */
 			percpu_counter_inc(&skc->skc_linux_alloc);
 		}
 		goto ret;
 	}
 
 	local_irq_disable();
 
 restart:
 	/*
 	 * Safe to update per-cpu structure without lock, but
 	 * in the restart case we must be careful to reacquire
 	 * the local magazine since this may have changed
 	 * when we need to grow the cache.
 	 */
 	skm = skc->skc_mag[smp_processor_id()];
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	if (likely(skm->skm_avail)) {
 		/* Object available in CPU cache, use it */
 		obj = skm->skm_objs[--skm->skm_avail];
 	} else {
 		/*
 		 * A non-NULL result is an emergency object to hand straight
 		 * to the caller.  NULL means "look at the magazine again"
 		 * unless the caller asked not to sleep.
 		 */
 		obj = spl_cache_refill(skc, skm, flags);
 		if ((obj == NULL) && !(flags & KM_NOSLEEP))
 			goto restart;
 
 		local_irq_enable();
 		goto ret;
 	}
 
 	local_irq_enable();
 	ASSERT(obj);
 	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 
 ret:
 	if (obj) {
 		/*
 		 * Run the constructor when the cache has one; otherwise
 		 * pre-emptively migrate the object to the CPU L1 cache.
 		 */
 		if (skc->skc_ctor)
 			skc->skc_ctor(obj, skc->skc_private, flags);
 		else
 			prefetchw(obj);
 	}
 
 	return (obj);
 }
 EXPORT_SYMBOL(spl_kmem_cache_alloc);
 
 /*
  * Free an object back to the local per-cpu magazine, there is no
  * guarantee that this is the same magazine the object was originally
  * allocated from.  We may need to flush entries from the magazine
  * back to the slabs to make space.
  *
  * skc: cache the object belongs to; must not be marked KMC_BIT_DESTROY.
  * obj: object to release.  The cache destructor, if any, is run on it
  *      before it is released by any of the paths below.
  */
 void
 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 {
 	spl_kmem_magazine_t *skm;
 	unsigned long flags;
 	int do_reclaim = 0;
 	int do_emergency = 0;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	/*
 	 * Run the destructor
 	 */
 	if (skc->skc_dtor)
 		skc->skc_dtor(obj, skc->skc_private);
 
 	/*
 	 * Free the object back to the underlying Linux slab.
 	 */
 	if (skc->skc_flags & KMC_SLAB) {
 		kmem_cache_free(skc->skc_linux_cache, obj);
 		percpu_counter_dec(&skc->skc_linux_alloc);
 		return;
 	}
 
 	/*
 	 * While a cache has outstanding emergency objects all freed objects
 	 * must be checked.  However, since emergency objects will never use
 	 * a virtual address these objects can be safely excluded as an
 	 * optimization.
 	 */
 	if (!is_vmalloc_addr(obj)) {
 		/* Sample the emergency object count under skc_lock */
 		spin_lock(&skc->skc_lock);
 		do_emergency = (skc->skc_obj_emergency > 0);
 		spin_unlock(&skc->skc_lock);
 
 		/* A return of 0 means obj was an emergency object; done */
 		if (do_emergency && (spl_emergency_free(skc, obj) == 0))
 			return;
 	}
 
 	local_irq_save(flags);
 
 	/*
 	 * Safe to update per-cpu structure without lock, but
 	 * no remote memory allocation tracking is being performed
 	 * it is entirely possible to allocate an object from one
 	 * CPU cache and return it to another.
 	 */
 	skm = skc->skc_mag[smp_processor_id()];
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	/*
 	 * Per-CPU cache full, flush it to make space for this object,
 	 * this may result in an empty slab which can be reclaimed once
 	 * interrupts are re-enabled.
 	 */
 	if (unlikely(skm->skm_avail >= skm->skm_size)) {
 		spl_cache_flush(skc, skm, skm->skm_refill);
 		do_reclaim = 1;
 	}
 
 	/* Available space in cache, use it */
 	skm->skm_objs[skm->skm_avail++] = obj;
 
 	local_irq_restore(flags);
 
 	/* Deferred until here so reclaim runs with interrupts restored */
 	if (do_reclaim)
 		spl_slab_reclaim(skc);
 }
 EXPORT_SYMBOL(spl_kmem_cache_free);
 
 /*
  * Best-effort reap of a single cache.  The current CPU's magazine is
  * flushed and slab reclaim is then run; objects that merely move back to
  * their slabs still have to age out before the slab itself is released,
  * which avoids thrashing between creating and destroying slabs.
  *
  * Nothing is done for KMC_SLAB caches, and a reap that finds another one
  * already in progress (KMC_BIT_REAPING set) simply backs off.
  */
 void
 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
 {
 	spl_kmem_magazine_t *local_mag;
 	unsigned long saved_irq;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	if (skc->skc_flags & KMC_SLAB)
 		return;
 
 	/* Hold a reference on the cache for the duration of the reap. */
 	atomic_inc(&skc->skc_ref);
 
 	/* Only the thread that wins the REAPING bit does the work. */
 	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags) == 0) {
 		/* Empty this CPU's magazine back into the slabs. */
 		local_irq_save(saved_irq);
 		local_mag = skc->skc_mag[smp_processor_id()];
 		spl_cache_flush(skc, local_mag, local_mag->skm_avail);
 		local_irq_restore(saved_irq);
 
 		/* Release the slabs that are now empty. */
 		spl_slab_reclaim(skc);
 
 		clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
 		smp_mb__after_atomic();
 		wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
 	}
 
 	atomic_dec(&skc->skc_ref);
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
 
 /*
  * Stub kept for code consistency with other platforms: it always reports
  * that no reap is active.  Concurrent reaping is already prevented by
  * existing logic, so while this is ugly it should do no harm.
  */
 int
 spl_kmem_cache_reap_active(void)
 {
 	const int reap_in_progress = 0;
 
 	return (reap_in_progress);
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_active);
 
 /*
  * Reap all free slabs from all registered caches.
  */
 void
 spl_kmem_reap(void)
 {
 	spl_kmem_cache_t *skc = NULL;
 
 	down_read(&spl_kmem_cache_sem);
 	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
 		spl_kmem_cache_reap_now(skc);
 	}
 	up_read(&spl_kmem_cache_sem);
 }
 EXPORT_SYMBOL(spl_kmem_reap);
 
 /*
  * Set up the global cache list, its semaphore and the taskq used by the
  * kmem cache code.  Returns 0 on success or -ENOMEM when the taskq could
  * not be created.
  */
 int
 spl_kmem_cache_init(void)
 {
 	init_rwsem(&spl_kmem_cache_sem);
 	INIT_LIST_HEAD(&spl_kmem_cache_list);
 
 	spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
 	    spl_kmem_cache_kmem_threads, maxclsyspri,
 	    spl_kmem_cache_kmem_threads * 8, INT_MAX,
 	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
 	return (spl_kmem_cache_taskq != NULL ? 0 : -ENOMEM);
 }
 
 /*
  * Tear down the taskq that spl_kmem_cache_init() created.
  */
 void
 spl_kmem_cache_fini(void)
 {
 	taskq_destroy(spl_kmem_cache_taskq);
 }
diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c
index cae304d33bc3..3e8361184d57 100644
--- a/module/os/linux/spl/spl-kmem.c
+++ b/module/os/linux/spl/spl-kmem.c
@@ -1,629 +1,627 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <sys/debug.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/vmem.h>
 
-/* BEGIN CSTYLED */
 /*
  * As a general rule kmem_alloc() allocations should be small, preferably
  * just a few pages since they must be physically contiguous.  Therefore, a
  * rate limited warning will be printed to the console for any kmem_alloc()
  * which exceeds a reasonable threshold.
  *
  * The default warning threshold is set to sixteen pages but capped at 64K to
  * accommodate systems using large pages.  This value was selected to be small
  * enough to ensure the largest allocations are quickly noticed and fixed.
  * But large enough to avoid logging any warnings when an allocation size is
  * larger than optimal but not a serious concern.  Since this value is tunable,
  * developers are encouraged to set it lower when testing so any new largish
  * allocations are quickly caught.  These warnings may be disabled by setting
  * the threshold to zero.
  */
 unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
 module_param(spl_kmem_alloc_warn, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_alloc_warn,
 	"Warning threshold in bytes for a kmem_alloc()");
 EXPORT_SYMBOL(spl_kmem_alloc_warn);
 
 /*
  * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
  * Allocations which are marginally smaller than this limit may succeed but
  * should still be avoided due to the expense of locating a contiguous range
  * of free pages.  Therefore, a maximum kmem size with a reasonable safety
  * margin of 4x is set.  Kmem_alloc() allocations larger than this maximum
  * will quickly fail.  Vmem_alloc() allocations less than or equal to this
  * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
  */
 unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
 module_param(spl_kmem_alloc_max, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_alloc_max,
 	"Maximum size in bytes for a kmem_alloc()");
 EXPORT_SYMBOL(spl_kmem_alloc_max);
-/* END CSTYLED */
 
 /*
  * Always returns 0 in this implementation.
  */
 int
 kmem_debugging(void)
 {
 	return (0);
 }
 EXPORT_SYMBOL(kmem_debugging);
 
 /*
  * Format into a freshly allocated string using a va_list.  The allocation
  * is requested with KM_SLEEP semantics and is retried until kvasprintf()
  * succeeds, so this function never returns NULL.  The caller's va_list is
  * left untouched; each attempt works on its own copy.
  */
 char *
 kmem_vasprintf(const char *fmt, va_list ap)
 {
 	char *result = NULL;
 
 	while (result == NULL) {
 		va_list attempt;
 
 		va_copy(attempt, ap);
 		result = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt,
 		    attempt);
 		va_end(attempt);
 	}
 
 	return (result);
 }
 EXPORT_SYMBOL(kmem_vasprintf);
 
 /*
  * printf-style formatting into a freshly allocated string.  As with
  * kmem_vasprintf() the allocation is retried until it succeeds, so the
  * result is never NULL.  The argument list is restarted for every attempt.
  */
 char *
 kmem_asprintf(const char *fmt, ...)
 {
 	char *result;
 
 	for (;;) {
 		va_list args;
 
 		va_start(args, fmt);
 		result = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, args);
 		va_end(args);
 
 		if (result != NULL)
 			break;
 	}
 
 	return (result);
 }
 EXPORT_SYMBOL(kmem_asprintf);
 
 /*
  * Duplicate a NUL-terminated string into memory obtained from kmalloc().
  *
  * str:   string to copy; must not be NULL.
  * flags: KM_* flags, translated with kmem_flags_convert().
  *
  * Returns the copy, or NULL when the allocation fails.  The length is kept
  * in a size_t so that the value returned by strlen() is never narrowed.
  */
 static char *
 __strdup(const char *str, int flags)
 {
 	size_t len = strlen(str) + 1;	/* include the terminating NUL */
 	char *ptr;
 
 	ptr = kmalloc(len, kmem_flags_convert(flags));
 	if (ptr != NULL)
 		memcpy(ptr, str, len);
 
 	return (ptr);
 }
 
 /*
  * Duplicate a string using KM_SLEEP allocation flags.  The result is
  * whatever __strdup() returns for those flags.
  */
 char *
 kmem_strdup(const char *str)
 {
 	return (__strdup(str, KM_SLEEP));
 }
 EXPORT_SYMBOL(kmem_strdup);
 
 /*
  * Release a string with kfree().
  */
 void
 kmem_strfree(char *str)
 {
 	kfree(str);
 }
 EXPORT_SYMBOL(kmem_strfree);
 
 /*
  * Allocate size bytes, preferring physically contiguous memory and falling
  * back to virtually contiguous memory where the flags allow it.
  *
  * size:   number of bytes requested.
  * lflags: Linux gfp_t flags (already converted, not KM_* flags).
  *
  * Returns the allocation or NULL on failure.
  */
 void *
 spl_kvmalloc(size_t size, gfp_t lflags)
 {
 	/*
 	 * GFP_KERNEL allocations can safely use kvmalloc which may
 	 * improve performance by avoiding a) high latency caused by
 	 * vmalloc's on-access allocation, b) performance loss due to
 	 * MMU memory address mapping and c) vmalloc locking overhead.
 	 * This has the side-effect that the slab statistics will
 	 * incorrectly report this as a vmem allocation, but that is
 	 * purely cosmetic.
 	 */
 	if ((lflags & GFP_KERNEL) == GFP_KERNEL)
 		return (kvmalloc(size, lflags));
 
 	/* Flags for the kmalloc attempt only; lflags itself stays unchanged */
 	gfp_t kmalloc_lflags = lflags;
 
 	if (size > PAGE_SIZE) {
 		/*
 		 * We need to set __GFP_NOWARN here since spl_kvmalloc is not
 		 * only called by spl_kmem_alloc_impl but can be called
 		 * directly with custom lflags, too. In that case
 		 * kmem_flags_convert does not get called, which would
 		 * implicitly set __GFP_NOWARN.
 		 */
 		kmalloc_lflags |= __GFP_NOWARN;
 
 		/*
 		 * N.B. __GFP_RETRY_MAYFAIL is supported only for large
 		 * (>32kB) allocations.
 		 *
 		 * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
 		 * for !costly requests because there is no other way to tell
 		 * the allocator that we want to fail rather than retry
 		 * endlessly.
 		 */
 		if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
 		    (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
 			kmalloc_lflags |= __GFP_NORETRY;
 		}
 	}
 
 	/*
 	 * We first try kmalloc - even for big sizes - and fall back to
 	 * spl_vmalloc if that fails.
 	 *
 	 * For non-__GFP_RECLAIM allocations we always stick to
 	 * kmalloc_node, and fail when kmalloc is not successful (returns
 	 * NULL).
 	 * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
 	 * internally uses GFP_KERNEL allocations.
 	 */
 	void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
 	if (ptr || size <= PAGE_SIZE ||
 	    (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
 		return (ptr);
 	}
 
 	/* The fallback uses the caller's original flags, not kmalloc_lflags */
 	return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
 }
 
 /*
  * General purpose unified implementation of kmem_alloc(). It is an
  * amalgamation of Linux and Illumos allocator design. It should never be
  * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
  * relatively portable.  Consumers may only access this function through
  * wrappers that enforce the common flags to ensure portability.
  *
  * size:  number of bytes requested.
  * flags: KM_* flags; converted to Linux flags with kmem_flags_convert().
  * node:  NUMA node, used only on the plain kmalloc_node() path.
  *
  * Returns the allocation.  NULL is returned when size exceeds
  * spl_kmem_alloc_max and KM_VMEM is not set, or when an attempt fails and
  * KM_NOSLEEP is set; otherwise the allocation is retried in a loop.
  */
 inline void *
 spl_kmem_alloc_impl(size_t size, int flags, int node)
 {
 	gfp_t lflags = kmem_flags_convert(flags);
 	void *ptr;
 
 	/*
 	 * Log abnormally large allocations and rate limit the console output.
 	 * Allocations larger than spl_kmem_alloc_warn should be performed
 	 * through the vmem_alloc()/vmem_zalloc() interfaces.
 	 */
 	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
 	    !(flags & KM_VMEM)) {
 		printk(KERN_WARNING
 		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
 		    "https://github.com/openzfs/zfs/issues/new\n",
 		    (unsigned long)size, flags);
 		dump_stack();
 	}
 
 	/*
 	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
 	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
 	 */
 	do {
 		/*
 		 * Calling kmalloc_node() when the size > spl_kmem_alloc_max
 		 * is unsafe.  This must fail for all kmem_alloc() and
 		 * kmem_zalloc() callers.
 		 *
 		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
 		 * to use spl_vmalloc().  However, in general use of
 		 * spl_vmalloc() is strongly discouraged because a global lock
 		 * must be acquired.  Contention on this lock can significantly
 		 * impact performance so frequently manipulating the virtual
 		 * address space is strongly discouraged.
 		 */
 		if (size > spl_kmem_alloc_max) {
 			if (flags & KM_VMEM) {
 				ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
 			} else {
 				return (NULL);
 			}
 		} else {
 			/*
 			 * We use kmalloc when doing kmem_alloc(KM_NOSLEEP),
 			 * because kvmalloc/vmalloc may sleep.  We also use
 			 * kmalloc on systems with limited kernel VA space (e.g.
 			 * 32-bit), which have HIGHMEM.  Otherwise we use
 			 * kvmalloc, which tries to get contiguous physical
 			 * memory (fast, like kmalloc) and falls back on using
 			 * virtual memory to stitch together pages (slow, like
 			 * vmalloc).
 			 */
 			/* Both preprocessor arms open the same if/else below */
 #ifdef CONFIG_HIGHMEM
 			if (flags & KM_VMEM) {
 #else
 			if ((flags & KM_VMEM) || !(flags & KM_NOSLEEP)) {
 #endif
 				ptr = spl_kvmalloc(size, lflags);
 			} else {
 				ptr = kmalloc_node(size, lflags, node);
 			}
 		}
 
 		/* Success, or a failure the KM_NOSLEEP caller must handle */
 		if (likely(ptr) || (flags & KM_NOSLEEP))
 			return (ptr);
 
 		/*
 		 * Try hard to satisfy the allocation. However, when progress
 		 * cannot be made, the allocation is allowed to fail.
 		 */
 		if ((lflags & GFP_KERNEL) == GFP_KERNEL)
 			lflags |= __GFP_RETRY_MAYFAIL;
 
 		/*
 		 * Use cond_resched() instead of congestion_wait() to avoid
 		 * deadlocking systems where there are no block devices.
 		 */
 		cond_resched();
 	} while (1);
 
 	/* Not reached: the loop above only exits through its returns */
 	return (NULL);
 }
 
 /*
  * Release a buffer with vfree() when it is a vmalloc address and with
  * kfree() otherwise.  The size argument is not used by this function.
  */
 inline void
 spl_kmem_free_impl(const void *buf, size_t size)
 {
 	if (is_vmalloc_addr(buf))
 		vfree(buf);
 	else
 		kfree(buf);
 }
 
 /*
  * Memory allocation and accounting for kmem_*() style allocations.  When
  * DEBUG_KMEM is enabled the total memory allocated will be tracked and
  * any memory leaked will be reported during module unload.
  *
  * ./configure --enable-debug-kmem
  */
 #ifdef DEBUG_KMEM
 
 /*
  * Shim layer memory accounting.
  *
  * kmem_alloc_used is the running byte total adjusted by
  * spl_kmem_alloc_debug() and spl_kmem_free_debug(); kmem_alloc_max records
  * the largest value that total has been observed to reach.  Only the type
  * of the counter depends on HAVE_ATOMIC64_T, kmem_alloc_max is declared
  * identically in both branches.
  */
 #ifdef HAVE_ATOMIC64_T
 atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
 unsigned long long kmem_alloc_max = 0;
 #else  /* HAVE_ATOMIC64_T */
 atomic_t kmem_alloc_used = ATOMIC_INIT(0);
 unsigned long long kmem_alloc_max = 0;
 #endif /* HAVE_ATOMIC64_T */
 
 EXPORT_SYMBOL(kmem_alloc_used);
 EXPORT_SYMBOL(kmem_alloc_max);
 
 /*
  * Accounting wrapper around spl_kmem_alloc_impl().  A successful
  * allocation adds size to kmem_alloc_used and, when the running total
  * exceeds kmem_alloc_max, raises that high-water mark.  A failed
  * allocation leaves the counters alone and returns NULL.
  */
 inline void *
 spl_kmem_alloc_debug(size_t size, int flags, int node)
 {
 	void *buf = spl_kmem_alloc_impl(size, flags, node);
 
 	if (buf == NULL)
 		return (NULL);
 
 	kmem_alloc_used_add(size);
 	if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) {
 		kmem_alloc_max = kmem_alloc_used_read();
 	}
 
 	return (buf);
 }
 
 /*
  * Accounting wrapper around spl_kmem_free_impl(): size is subtracted from
  * kmem_alloc_used before the buffer is released.
  */
 inline void
 spl_kmem_free_debug(const void *ptr, size_t size)
 {
 	kmem_alloc_used_sub(size);
 	spl_kmem_free_impl(ptr, size);
 }
 
 /*
  * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
  * but also the location of every alloc and free.  When the SPL module is
  * unloaded a list of all leaked addresses and where they were allocated
  * will be dumped to the console.  Enabling this feature has a significant
  * impact on performance but it makes finding memory leaks straight forward.
  *
  * Not surprisingly with debugging enabled the xmem_locks are very highly
  * contended particularly on xfree().  If we want to run with this detailed
  * debugging enabled for anything other than debugging  we need to minimize
  * the contention by moving to a lock per xmem_table entry model.
  *
  * ./configure --enable-debug-kmem-tracking
  */
 #ifdef DEBUG_KMEM_TRACKING
 
 #include <linux/hash.h>
 #include <linux/ctype.h>
 
 /* The tracking hash table has 1 << KMEM_HASH_BITS (1024) buckets */
 #define	KMEM_HASH_BITS		10
 #define	KMEM_TABLE_SIZE		(1 << KMEM_HASH_BITS)
 
 /*
  * One record per tracked allocation.  Each record is linked both into a
  * kmem_table bucket, selected by hashing kd_addr, and onto kmem_list.
  */
 typedef struct kmem_debug {
 	struct hlist_node kd_hlist;	/* Hash node linkage */
 	struct list_head kd_list;	/* List of all allocations */
 	void *kd_addr;			/* Allocation pointer */
 	size_t kd_size;			/* Allocation size */
 	const char *kd_func;		/* Allocation function */
 	int kd_line;			/* Allocation line */
 } kmem_debug_t;
 
 /* kmem_lock is held while kmem_table and kmem_list are modified */
 static spinlock_t kmem_lock;
 static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
 static struct list_head kmem_list;
 
 /*
  * Find the tracking record for addr, unlink it from both its hash bucket
  * and the global allocation list, and return it.  Returns NULL when no
  * record matches.  The search and the unlink are done with lock held and
  * interrupts saved.
  */
 static kmem_debug_t *
 kmem_del_init(spinlock_t *lock, struct hlist_head *table,
     int bits, const void *addr)
 {
 	struct hlist_head *bucket;
 	struct hlist_node *pos = NULL;
 	struct kmem_debug *entry;
 	kmem_debug_t *found = NULL;
 	unsigned long irq_state;
 
 	spin_lock_irqsave(lock, irq_state);
 
 	bucket = &table[hash_ptr((void *)addr, bits)];
 	hlist_for_each(pos, bucket) {
 		entry = list_entry(pos, struct kmem_debug, kd_hlist);
 		if (entry->kd_addr != addr)
 			continue;
 
 		/* Match: detach it and stop walking the bucket. */
 		hlist_del_init(&entry->kd_hlist);
 		list_del_init(&entry->kd_list);
 		found = entry;
 		break;
 	}
 
 	spin_unlock_irqrestore(lock, irq_state);
 
 	return (found);
 }
 
 /*
  * Allocate size bytes and record the allocation (address, size, calling
  * function and line) in the tracking hash table and list.
  *
  * Returns the new buffer, or NULL if the tracking record, the copy of the
  * function name, or the buffer itself could not be allocated.  Nothing is
  * left allocated on failure.
  */
 inline void *
 spl_kmem_alloc_track(size_t size, int flags,
     const char *func, int line, int node)
 {
 	kmem_debug_t *rec;
 	unsigned long irq_flags;
 	void *buf;
 
 	rec = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
 	if (rec == NULL)
 		return (NULL);
 
 	rec->kd_func = __strdup(func, flags);
 	if (rec->kd_func == NULL)
 		goto free_record;
 
 	buf = spl_kmem_alloc_debug(size, flags, node);
 	if (buf == NULL)
 		goto free_name;
 
 	/* Fill in the record before it becomes visible to other threads. */
 	INIT_HLIST_NODE(&rec->kd_hlist);
 	INIT_LIST_HEAD(&rec->kd_list);
 	rec->kd_addr = buf;
 	rec->kd_size = size;
 	rec->kd_line = line;
 
 	spin_lock_irqsave(&kmem_lock, irq_flags);
 	hlist_add_head(&rec->kd_hlist,
 	    &kmem_table[hash_ptr(buf, KMEM_HASH_BITS)]);
 	list_add_tail(&rec->kd_list, &kmem_list);
 	spin_unlock_irqrestore(&kmem_lock, irq_flags);
 
 	return (buf);
 
 free_name:
 	kfree(rec->kd_func);
 free_record:
 	kfree(rec);
 	return (NULL);
 }
 
 /*
  * Release a tracked allocation: drop its tracking record, then free the
  * buffer through the accounting layer.  A NULL ptr is ignored.
  */
 inline void
 spl_kmem_free_track(const void *ptr, size_t size)
 {
 	kmem_debug_t *rec;
 
 	/* NULL was never entered into the table, nothing to undo. */
 	if (ptr == NULL)
 		return;
 
 	/* Every tracked allocation has a record; the size must agree. */
 	rec = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
 	ASSERT3P(rec, !=, NULL);
 	ASSERT3S(rec->kd_size, ==, size);
 
 	/* The record owns its copy of the function name. */
 	kfree(rec->kd_func);
 	kfree(rec);
 
 	spl_kmem_free_debug(ptr, size);
 }
 #endif /* DEBUG_KMEM_TRACKING */
 #endif /* DEBUG_KMEM */
 
 /*
  * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
  */
 /*
  * Public entry point behind kmem_alloc().  Dispatches to the tracking,
  * accounting or plain implementation depending on which of DEBUG_KMEM and
  * DEBUG_KMEM_TRACKING the module was built with.  func and line are only
  * consumed by the tracking build.
  */
 void *
 spl_kmem_alloc(size_t size, int flags, const char *func, int line)
 {
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 
 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
 	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
 #elif defined(DEBUG_KMEM)
 	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
 #else
 	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
 #endif
 }
 EXPORT_SYMBOL(spl_kmem_alloc);
 
 /*
  * Public entry point behind kmem_zalloc().  Identical to spl_kmem_alloc()
  * except that KM_ZERO is added to the caller's flags before dispatching.
  */
 void *
 spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
 {
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 
 	flags |= KM_ZERO;
 
 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
 	return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
 #elif defined(DEBUG_KMEM)
 	return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
 #else
 	return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
 #endif
 }
 EXPORT_SYMBOL(spl_kmem_zalloc);
 
 /*
  * Public entry point behind kmem_free().  Dispatches to the plain,
  * accounting or tracking release routine depending on which of DEBUG_KMEM
  * and DEBUG_KMEM_TRACKING the module was built with.
  *
  * The helpers are invoked as plain statements: a return statement with an
  * expression is not permitted in a function returning void.
  */
 void
 spl_kmem_free(const void *buf, size_t size)
 {
 #if !defined(DEBUG_KMEM)
 	spl_kmem_free_impl(buf, size);
 #elif !defined(DEBUG_KMEM_TRACKING)
 	spl_kmem_free_debug(buf, size);
 #else
 	spl_kmem_free_track(buf, size);
 #endif
 }
 EXPORT_SYMBOL(spl_kmem_free);
 
 #if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
 /*
  * Render the first bytes of a tracked allocation into str for display.
  *
  * kd:  tracking record; kd_addr and kd_size describe the allocation.
  * str: output buffer, zeroed on entry to this function.
  * len: size of str in bytes; must be at least 17.
  * min: when the first non-printable byte is found at an index greater than
  *      min, the printable prefix collected so far is kept as ASCII.
  *
  * If the bytes examined are all printable, or enough of them are (see min),
  * str holds that ASCII text.  Otherwise str holds a hex dump of the bytes
  * at offsets 0, 2, 4, ... 14 of the allocation.  Offsets that lie beyond
  * kd_size are shown as 00 instead of being read, so allocations shorter
  * than 15 bytes are never read out of bounds.
  *
  * Returns str.
  */
 static char *
 spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
 {
 	int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
 	int i, flag = 1;
 
 	ASSERT(str != NULL && len >= 17);
 	memset(str, 0, len);
 
 	/*
 	 * Check for a fully printable string, and while we are at
 	 * it place the printable characters in the passed buffer.
 	 */
 	for (i = 0; i < size; i++) {
 		str[i] = ((char *)(kd->kd_addr))[i];
 		if (isprint(str[i])) {
 			continue;
 		} else {
 			/*
 			 * Minimum number of printable characters found
 			 * to make it worthwhile to print this as ascii.
 			 */
 			if (i > min)
 				break;
 
 			flag = 0;
 			break;
 		}
 	}
 
 	if (!flag) {
 		const uint8_t *bytes = (const uint8_t *)kd->kd_addr;
 		uint8_t sample[8];
 
 		/* Sample every second byte, staying inside the allocation. */
 		for (i = 0; i < 8; i++) {
 			size_t off = (size_t)i * 2;
 
 			sample[i] = (off < kd->kd_size) ? bytes[off] : 0;
 		}
 
 		/* Bounded by len so a short buffer cannot be overrun. */
 		snprintf(str, len, "%02x%02x%02x%02x%02x%02x%02x%02x",
 		    sample[0], sample[1], sample[2], sample[3],
 		    sample[4], sample[5], sample[6], sample[7]);
 	}
 
 	return (str);
 }
 
 /*
  * Prepare the allocation tracking state: initialise the lock, the list of
  * all allocations and the first size buckets of kmem_table.  Always
  * returns 0.
  */
 static int
 spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
 {
 	int bucket = 0;
 
 	spin_lock_init(lock);
 	INIT_LIST_HEAD(list);
 
 	while (bucket < size) {
 		INIT_HLIST_HEAD(&kmem_table[bucket]);
 		bucket++;
 	}
 
 	return (0);
 }
 
 /*
  * Print one console line for every allocation still present on list,
  * preceded by a column header when the list is not empty.  The whole
  * walk is done with lock held and interrupts saved.
  */
 static void
 spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
 {
 	kmem_debug_t *leak = NULL;
 	unsigned long irq_state;
 	char preview[17];
 
 	spin_lock_irqsave(lock, irq_state);
 
 	if (!list_empty(list)) {
 		printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
 		    "size", "data", "func", "line");
 	}
 
 	list_for_each_entry(leak, list, kd_list) {
 		/* Leading bytes of the allocation, as ASCII or hex. */
 		const char *data = spl_sprintf_addr(leak, preview, 17, 8);
 
 		printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", leak->kd_addr,
 		    (int)leak->kd_size, data,
 		    leak->kd_func, leak->kd_line);
 	}
 
 	spin_unlock_irqrestore(lock, irq_state);
 }
 #endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
 
 /*
  * Module-load initialisation for the kmem shim.  With DEBUG_KMEM the byte
  * counter is reset, and with DEBUG_KMEM_TRACKING the tracking structures
  * are set up as well.  Always returns 0.
  */
 int
 spl_kmem_init(void)
 {
 #if defined(DEBUG_KMEM)
 	kmem_alloc_used_set(0);
 #if defined(DEBUG_KMEM_TRACKING)
 	spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
 #endif /* DEBUG_KMEM_TRACKING */
 #endif /* DEBUG_KMEM */
 
 	return (0);
 }
 
 /*
  * Module-unload reporting for the kmem shim.  Does nothing unless the
  * module was built with DEBUG_KMEM.
  */
 void
 spl_kmem_fini(void)
 {
 #if defined(DEBUG_KMEM)
 	/*
 	 * Report the number of bytes never returned together with the
 	 * high-water mark.  With tracking enabled every leaked address is
 	 * then listed with its size and leading bytes to aid debugging.
 	 * Speed is of no concern here, the module is being unloaded.
 	 */
 	if (kmem_alloc_used_read() != 0) {
 		printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
 		    (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
 	}
 
 #if defined(DEBUG_KMEM_TRACKING)
 	spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
 #endif /* DEBUG_KMEM_TRACKING */
 #endif /* DEBUG_KMEM */
 }
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index 7f4cab5da114..77dd472ea8b1 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -1,1845 +1,1841 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  Solaris Porting Layer (SPL) Task Queue Implementation.
  */
 /*
  * Copyright (c) 2024, Klara Inc.
  * Copyright (c) 2024, Syneto
  */
 
 #include <sys/timer.h>
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/trace_spl.h>
 #include <sys/time.h>
 #include <sys/atomic.h>
 #include <sys/kstat.h>
 #include <linux/cpuhotplug.h>
 
 /*
  * Layout of the named kstat values kept for a taskq.  The member order must
  * stay in sync with taskq_kstats_template, which supplies the exported name
  * and data type of each member.
  */
 typedef struct taskq_kstats {
 	/* static values, for completeness */
 	kstat_named_t tqks_threads_max;
 	kstat_named_t tqks_entry_pool_min;
 	kstat_named_t tqks_entry_pool_max;
 
 	/* gauges (inc/dec counters, current value) */
 	kstat_named_t tqks_threads_active;
 	kstat_named_t tqks_threads_idle;
 	kstat_named_t tqks_threads_total;
 	kstat_named_t tqks_tasks_pending;
 	kstat_named_t tqks_tasks_priority;
 	kstat_named_t tqks_tasks_total;
 	kstat_named_t tqks_tasks_delayed;
 	kstat_named_t tqks_entries_free;
 
 	/* counters (inc only, since taskq creation) */
 	kstat_named_t tqks_threads_created;
 	kstat_named_t tqks_threads_destroyed;
 	kstat_named_t tqks_tasks_dispatched;
 	kstat_named_t tqks_tasks_dispatched_delayed;
 	kstat_named_t tqks_tasks_executed_normal;
 	kstat_named_t tqks_tasks_executed_priority;
 	kstat_named_t tqks_tasks_executed;
 	kstat_named_t tqks_tasks_delayed_requeued;
 	kstat_named_t tqks_tasks_cancelled;
 	kstat_named_t tqks_thread_wakeups;
 	kstat_named_t tqks_thread_wakeups_nowork;
 	kstat_named_t tqks_thread_sleeps;
 } taskq_kstats_t;
 
 static taskq_kstats_t taskq_kstats_template = {
 	{ "threads_max",		KSTAT_DATA_UINT64 },
 	{ "entry_pool_min",		KSTAT_DATA_UINT64 },
 	{ "entry_pool_max",		KSTAT_DATA_UINT64 },
 	{ "threads_active",		KSTAT_DATA_UINT64 },
 	{ "threads_idle",		KSTAT_DATA_UINT64 },
 	{ "threads_total",		KSTAT_DATA_UINT64 },
 	{ "tasks_pending",		KSTAT_DATA_UINT64 },
 	{ "tasks_priority",		KSTAT_DATA_UINT64 },
 	{ "tasks_total",		KSTAT_DATA_UINT64 },
 	{ "tasks_delayed",		KSTAT_DATA_UINT64 },
 	{ "entries_free",		KSTAT_DATA_UINT64 },
 
 	{ "threads_created",		KSTAT_DATA_UINT64 },
 	{ "threads_destroyed",		KSTAT_DATA_UINT64 },
 	{ "tasks_dispatched",		KSTAT_DATA_UINT64 },
 	{ "tasks_dispatched_delayed",	KSTAT_DATA_UINT64 },
 	{ "tasks_executed_normal",	KSTAT_DATA_UINT64 },
 	{ "tasks_executed_priority",	KSTAT_DATA_UINT64 },
 	{ "tasks_executed",		KSTAT_DATA_UINT64 },
 	{ "tasks_delayed_requeued",	KSTAT_DATA_UINT64 },
 	{ "tasks_cancelled",		KSTAT_DATA_UINT64 },
 	{ "thread_wakeups",		KSTAT_DATA_UINT64 },
 	{ "thread_wakeups_nowork",	KSTAT_DATA_UINT64 },
 	{ "thread_sleeps",		KSTAT_DATA_UINT64 },
 };
 
 /*
  * Statistics helpers.  TQSTAT_INC/TQSTAT_DEC add +1/-1 to the wmsum named
  * tqs_<stat> inside tq->tq_sums.
  */
 #define	TQSTAT_INC(tq, stat)	wmsum_add(&(tq)->tq_sums.tqs_##stat, 1)
 #define	TQSTAT_DEC(tq, stat)	wmsum_add(&(tq)->tq_sums.tqs_##stat, -1)
 
 /*
  * Apply 'mod' (TQSTAT_INC or TQSTAT_DEC) to the gauge matching the list
  * recorded in the TQENT_LIST_MASK bits of t->tqent_flags.  An entry marked
  * TQENT_LIST_NONE must not be linked on any list.
  */
 #define	_TQSTAT_MOD_LIST(mod, tq, t) do { \
 	switch ((t)->tqent_flags & TQENT_LIST_MASK) {			\
 	case TQENT_LIST_NONE: ASSERT(list_empty(&(t)->tqent_list)); break;\
 	case TQENT_LIST_PENDING: mod(tq, tasks_pending); break;		\
 	case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break;	\
 	case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break;		\
 	}								\
 } while (0)
 #define	TQSTAT_INC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
 #define	TQSTAT_DEC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)
 
 /*
  * Replace the TQENT_LIST_MASK bits of t->tqent_flags with 'l', leaving all
  * other flag bits untouched.  Wrapped in do/while (0) with parenthesized
  * arguments so it behaves as a single statement (safe in an unbraced
  * if/else) and so an expression passed as 'l' keeps its own precedence.
  */
 #define	TQENT_SET_LIST(t, l)	do {					\
 	(t)->tqent_flags =						\
 	    ((t)->tqent_flags & ~TQENT_LIST_MASK) | (l);		\
 } while (0)
 
 /*
  * Tunables exposed as module parameters.
  */
 
 /* When non-zero, taskq_thread_create() binds each new thread to a CPU. */
 static int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
 
 /*
  * Milliseconds that must pass since tq->lastspawnstop before
  * taskq_thread_should_stop() lets another dynamic thread exit.
  */
 static uint_t spl_taskq_thread_timeout_ms = 5000;
-/* BEGIN CSTYLED */
 module_param(spl_taskq_thread_timeout_ms, uint, 0644);
-/* END CSTYLED */
 MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
 	"Minimum idle threads exit interval for dynamic taskqs");
 
 /* When zero, taskq_thread_should_stop() never retires idle threads. */
 static int spl_taskq_thread_dynamic = 1;
 module_param(spl_taskq_thread_dynamic, int, 0444);
 MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
 
 /* When non-zero, new threads get a nice value derived from tq->tq_pri. */
 static int spl_taskq_thread_priority = 1;
 module_param(spl_taskq_thread_priority, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_priority,
 	"Allow non-default priority for taskq threads");
 
 /*
  * Number of back-to-back tasks a thread may run before taskq_thread()
  * asks for an additional thread to be spawned.
  */
 static uint_t spl_taskq_thread_sequential = 4;
-/* BEGIN CSTYLED */
 module_param(spl_taskq_thread_sequential, uint, 0644);
-/* END CSTYLED */
 MODULE_PARM_DESC(spl_taskq_thread_sequential,
 	"Create new taskq threads after N sequential tasks");
 
 /*
  * Global system-wide dynamic task queue available for all consumers. This
  * taskq is not intended for long-running tasks; instead, a dedicated taskq
  * should be created.
  */
 taskq_t *system_taskq;
 EXPORT_SYMBOL(system_taskq);
 /* Global dynamic task queue for long delay */
 taskq_t *system_delay_taskq;
 EXPORT_SYMBOL(system_delay_taskq);
 
 /* Private dedicated taskq for creating new taskq threads on demand. */
 static taskq_t *dynamic_taskq;
 static taskq_thread_t *taskq_thread_create(taskq_t *);
 
 /* Multi-callback id for cpu hotplugging. */
 static int spl_taskq_cpuhp_state;
 
 /* List of all taskqs */
 LIST_HEAD(tq_list);
 /* NOTE(review): presumably serializes access to tq_list - confirm. */
 struct rw_semaphore tq_list_sem;
 /*
  * Thread-specific-data key: each taskq thread stores a pointer to its owning
  * taskq under this key (see taskq_thread() and taskq_member()).
  */
 static uint_t taskq_tsd;
 
 /*
  * Translate taskq dispatch flags into a kmem allocation flag.  TQ_NOSLEEP
  * takes precedence over TQ_PUSHPAGE; with neither set the result is
  * KM_SLEEP.
  */
 static int
 task_km_flags(uint_t flags)
 {
 	if (flags & TQ_NOSLEEP)
 		return (KM_NOSLEEP);
 
 	return ((flags & TQ_PUSHPAGE) ? KM_PUSHPAGE : KM_SLEEP);
 }
 
 /*
  * taskq_find_by_name - Walk tq_list from tail to head and return the
  * tq_instance of the first taskq whose name equals 'name', or -1 when no
  * taskq carries that name.  The tail-first walk is what makes the result
  * the largest instance number, assuming instances are appended in order.
  */
 static int
 taskq_find_by_name(const char *name)
 {
 	struct list_head *pos = NULL;
 
 	list_for_each_prev(pos, &tq_list) {
 		taskq_t *cand = list_entry(pos, taskq_t, tq_taskqs);
 
 		if (strcmp(name, cand->tq_name) != 0)
 			continue;
 
 		return (cand->tq_instance);
 	}
 
 	return (-1);
 }
 
 /*
  * Obtain a taskq_ent_t for a new dispatch, either by reusing the first
  * entry on tq->tq_free_list or by allocating a fresh one.
  *
  * NOTE: Must be called with tq->tq_lock held.  The lock is dropped and
  * re-acquired (through *irqflags) while sleeping and while allocating, so
  * taskq state may change across this call.  The returned taskq_ent_t is
  * not attached to the free, work, or pending taskq lists.
  *
  * Returns NULL when no entry can be provided: TQ_NOALLOC was passed and no
  * free entry was taken, TQ_NOSLEEP was passed with the pool already at
  * tq_maxalloc, or kmem_alloc() failed.
  */
 static taskq_ent_t *
 task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
 {
 	taskq_ent_t *t;
 	int count = 0;
 
 	ASSERT(tq);
 retry:
 	/* Acquire taskq_ent_t's from free list if available */
 	if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
 		t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
 
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
 		ASSERT(!timer_pending(&t->tqent_timer));
 
 		list_del_init(&t->tqent_list);
 		TQSTAT_DEC(tq, entries_free);
 		return (t);
 	}
 
 	/* Free list is empty and memory allocations are prohibited */
 	if (flags & TQ_NOALLOC)
 		return (NULL);
 
 	/* Hit maximum taskq_ent_t pool size */
 	if (tq->tq_nalloc >= tq->tq_maxalloc) {
 		if (flags & TQ_NOSLEEP)
 			return (NULL);
 
 		/*
 		 * Sleep periodically polling the free list for an available
 		 * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
 		 * but we cannot block forever waiting for an taskq_ent_t to
 		 * show up in the free list, otherwise a deadlock can happen.
 		 *
 		 * Therefore, we need to allocate a new task even if the number
 		 * of allocated tasks is above tq->tq_maxalloc, but we still
 		 * end up delaying the task allocation by one second (up to
 		 * 100 polls of HZ / 100 jiffies each), thereby throttling
 		 * the task dispatch rate.
 		 */
 		spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
 		schedule_timeout_interruptible(HZ / 100);
 		spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
 		    tq->tq_lock_class);
 		if (count < 100) {
 			count++;
 			goto retry;
 		}
 	}
 
 	/* The allocation is made with tq_lock dropped. */
 	spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
 	t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
 	spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
 
 	if (t) {
 		taskq_init_ent(t);
 		tq->tq_nalloc++;
 	}
 
 	return (t);
 }
 
 /*
  * Release a taskq_ent_t back to kmem and drop it from the taskq's
  * allocation count.
  *
  * NOTE: Must be called with tq->tq_lock held.  The entry must already be
  * unlinked from every taskq list and must not have a pending timer.
  */
 static void
 task_free(taskq_t *tq, taskq_ent_t *t)
 {
 	ASSERT(tq);
 	ASSERT(t);
 	ASSERT(list_empty(&t->tqent_list));
 	ASSERT(!timer_pending(&t->tqent_timer));
 
 	tq->tq_nalloc--;
 	kmem_free(t, sizeof (*t));
 }
 
 /*
  * Retire a finished or cancelled taskq_ent_t.  While the pool holds more
  * than tq_minalloc entries the entry is freed; otherwise it is scrubbed and
  * appended to the free list for reuse.
  *
  * NOTE: Must be called with tq->tq_lock held and with the entry already
  * unlinked from its list.
  */
 static void
 task_done(taskq_t *tq, taskq_ent_t *t)
 {
 	ASSERT(tq);
 	ASSERT(t);
 	ASSERT(list_empty(&t->tqent_list));
 
 	/* Wake anyone sleeping on this entry's wait queue. */
 	wake_up_all(&t->tqent_waitq);
 
 	/* Pool is above its minimum size: give the memory back. */
 	if (tq->tq_nalloc > tq->tq_minalloc) {
 		task_free(tq, t);
 		return;
 	}
 
 	/* Reset the entry before parking it on the free list. */
 	t->tqent_flags = 0;
 	t->tqent_arg = NULL;
 	t->tqent_func = NULL;
 	t->tqent_id = TASKQID_INVALID;
 
 	list_add_tail(&t->tqent_list, &tq->tq_free_list);
 	TQSTAT_INC(tq, entries_free);
 }
 
 /*
  * When a delayed task timer expires remove it from the delay list and
  * add it to the priority list in order for immediate processing.
  *
  * Takes tq->tq_lock itself.  A task already flagged TQENT_FLAG_CANCEL is
  * left alone.  The TQENT_LIST_MASK bits in tqent_flags are not changed
  * here, so the entry keeps the list marking it was dispatched with.
  */
 static void
 task_expire_impl(taskq_ent_t *t)
 {
 	taskq_ent_t *w;
 	taskq_t *tq = t->tqent_taskq;
 	struct list_head *l = NULL;
 	unsigned long flags;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 
 	/* taskq_cancel_id() already unlinked the entry; nothing to requeue. */
 	if (t->tqent_flags & TQENT_FLAG_CANCEL) {
 		ASSERT(list_empty(&t->tqent_list));
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		return;
 	}
 
 	t->tqent_birth = jiffies;
 	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
 
 	/*
 	 * The priority list must be maintained in strict task id order
 	 * from lowest to highest for lowest_id to be easily calculable.
 	 */
 	list_del(&t->tqent_list);
 	/* Scan from the tail for the last entry with a smaller id. */
 	list_for_each_prev(l, &tq->tq_prio_list) {
 		w = list_entry(l, taskq_ent_t, tqent_list);
 		if (w->tqent_id < t->tqent_id) {
 			list_add(&t->tqent_list, l);
 			break;
 		}
 	}
 	/* No smaller id found (or empty list): insert at the head. */
 	if (l == &tq->tq_prio_list)
 		list_add(&t->tqent_list, &tq->tq_prio_list);
 
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	wake_up(&tq->tq_work_waitq);
 
 	TQSTAT_INC(tq, tasks_delayed_requeued);
 }
 
 /*
  * Timer callback for delayed tasks: recover the taskq_ent_t that embeds
  * the expired timer and hand it to task_expire_impl().
  */
 static void
 task_expire(struct timer_list *tl)
 {
 	taskq_ent_t *ent = from_timer(ent, tl, tqent_timer);
 
 	task_expire_impl(ent);
 }
 
 /*
  * Compute the lowest taskqid_t that has not completed yet.  Candidates are
  * the head of the pending, priority and delay lists plus the task being
  * run by the first thread on the active list; if none of those exist the
  * result is tq->tq_next_id.
  */
 static taskqid_t
 taskq_lowest_id(taskq_t *tq)
 {
 	taskqid_t lowest = tq->tq_next_id;
 	taskq_ent_t *head;
 	taskq_thread_t *worker;
 
 	if (!list_empty(&tq->tq_pend_list)) {
 		head = list_entry(tq->tq_pend_list.next, taskq_ent_t,
 		    tqent_list);
 		if (head->tqent_id < lowest)
 			lowest = head->tqent_id;
 	}
 
 	if (!list_empty(&tq->tq_prio_list)) {
 		head = list_entry(tq->tq_prio_list.next, taskq_ent_t,
 		    tqent_list);
 		if (head->tqent_id < lowest)
 			lowest = head->tqent_id;
 	}
 
 	if (!list_empty(&tq->tq_delay_list)) {
 		head = list_entry(tq->tq_delay_list.next, taskq_ent_t,
 		    tqent_list);
 		if (head->tqent_id < lowest)
 			lowest = head->tqent_id;
 	}
 
 	if (!list_empty(&tq->tq_active_list)) {
 		worker = list_entry(tq->tq_active_list.next, taskq_thread_t,
 		    tqt_active_list);
 		ASSERT(worker->tqt_id != TASKQID_INVALID);
 		if (worker->tqt_id < lowest)
 			lowest = worker->tqt_id;
 	}
 
 	return (lowest);
 }
 
 /*
  * Link a worker thread into tq->tq_active_list, keeping that list sorted
  * by increasing tqt_id.  The list is scanned from the tail for the last
  * thread with a smaller id; the new thread goes right after it, or at the
  * head when no such thread exists.
  */
 static void
 taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
 {
 	struct list_head *after = &tq->tq_active_list;
 	struct list_head *pos = NULL;
 
 	ASSERT(tq);
 	ASSERT(tqt);
 
 	list_for_each_prev(pos, &tq->tq_active_list) {
 		taskq_thread_t *other =
 		    list_entry(pos, taskq_thread_t, tqt_active_list);
 
 		if (other->tqt_id < tqt->tqt_id) {
 			after = pos;
 			break;
 		}
 	}
 
 	list_add(&tqt->tqt_active_list, after);
 }
 
 /*
  * Look up the task with the given id on one list.  The list must be sorted
  * from lowest to highest task id, which lets the scan stop at the first
  * entry whose id is larger than the one wanted.  Returns the entry, or
  * NULL when it is not on the list.
  */
 static taskq_ent_t *
 taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
 {
 	struct list_head *pos = NULL;
 
 	list_for_each(pos, lh) {
 		taskq_ent_t *ent = list_entry(pos, taskq_ent_t, tqent_list);
 
 		/* Sorted list: everything from here on is larger. */
 		if (ent->tqent_id > id)
 			return (NULL);
 
 		if (ent->tqent_id == id)
 			return (ent);
 	}
 
 	return (NULL);
 }
 
 /*
  * Locate a dispatched task by id whatever state it is in.  A task still
  * queued on the delay, priority or pending list is returned directly.  A
  * task currently being executed yields ERR_PTR(-EBUSY).  NULL means the
  * id was not found on any queue or active thread.
  */
 static taskq_ent_t *
 taskq_find(taskq_t *tq, taskqid_t id)
 {
 	struct list_head *queues[] = {
 		&tq->tq_delay_list,
 		&tq->tq_prio_list,
 		&tq->tq_pend_list,
 	};
 	struct list_head *pos = NULL;
 	taskq_thread_t *worker;
 	taskq_ent_t *found;
 	size_t i;
 
 	for (i = 0; i < sizeof (queues) / sizeof (queues[0]); i++) {
 		found = taskq_find_list(tq, queues[i], id);
 		if (found != NULL)
 			return (found);
 	}
 
 	list_for_each(pos, &tq->tq_active_list) {
 		worker = list_entry(pos, taskq_thread_t, tqt_active_list);
 		if (worker->tqt_id != id)
 			continue;
 
 		/*
 		 * Hand back a non-NULL marker rather than tqt_task so
 		 * callers cannot misuse it; tqt_task only has two valid
 		 * fields.
 		 */
 		return (ERR_PTR(-EBUSY));
 	}
 
 	return (NULL);
 }
 
 /*
  * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
  * taskq_wait() functions below.
  *
  * Taskq waiting is accomplished by tracking the lowest outstanding task
  * id and the next available task id.  As tasks are dispatched they are
  * added to the tail of the pending, priority, or delay lists.  As worker
  * threads become available the tasks are removed from the heads of these
  * lists and linked to the worker threads.  This ensures the lists are
  * kept sorted by lowest to highest task id.
  *
  * Therefore the lowest outstanding task id can be quickly determined by
  * checking the head item from all of these lists.  This value is stored
  * with the taskq as the lowest id.  It only needs to be recalculated when
  * either the task with the current lowest id completes or is canceled.
  *
  * By blocking until the lowest task id exceeds the passed task id the
  * taskq_wait_outstanding() function can be easily implemented.  Similarly,
  * by blocking until the lowest task id matches the next task id taskq_wait()
  * can be implemented.
  *
  * Callers should be aware that when there are multiple worker threads it
  * is possible for larger task ids to complete before smaller ones.  Also
  * when the taskq contains delay tasks with small task ids callers may
  * block for a considerable length of time waiting for them to expire and
  * execute.
  */
 /*
  * wait_event() condition for taskq_wait_id(): non-zero once taskq_find()
  * no longer sees the task id queued or executing.  Takes tq->tq_lock.
  */
 static int
 taskq_wait_id_check(taskq_t *tq, taskqid_t id)
 {
 	unsigned long irqflags;
 	int gone;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	gone = (taskq_find(tq, id) == NULL);
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	return (gone);
 }
 
 /*
  * The taskq_wait_id() function blocks until the passed task id completes.
  * This does not guarantee that all lower task ids have completed.
  *
  * The caller sleeps on tq->tq_wait_waitq and re-evaluates
  * taskq_wait_id_check() each time it is woken.
  */
 void
 taskq_wait_id(taskq_t *tq, taskqid_t id)
 {
 	wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
 }
 EXPORT_SYMBOL(taskq_wait_id);
 
 /*
  * wait_event() condition for taskq_wait_outstanding(): non-zero once the
  * lowest outstanding task id has moved past 'id'.  Takes tq->tq_lock.
  */
 static int
 taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
 {
 	unsigned long irqflags;
 	int passed;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	passed = (tq->tq_lowest_id > id);
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	return (passed);
 }
 
 /*
  * Block until the lowest outstanding task id of the taskq is greater than
  * 'id'.  Task ids are handed out monotonically at dispatch time.  Passing
  * zero selects tq->tq_next_id - 1 instead, i.e. everything dispatched up to
  * this point, but nothing dispatched afterwards, is waited for.
  */
 void
 taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
 {
 	if (id == 0)
 		id = tq->tq_next_id - 1;
 
 	wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
 }
 EXPORT_SYMBOL(taskq_wait_outstanding);
 
 /*
  * wait_event() condition for taskq_wait(): non-zero when the lowest
  * outstanding id equals the next id to be handed out, meaning nothing is
  * queued or running.  Takes tq->tq_lock.
  */
 static int
 taskq_wait_check(taskq_t *tq)
 {
 	unsigned long irqflags;
 	int drained;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	drained = (tq->tq_next_id == tq->tq_lowest_id);
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	return (drained);
 }
 
 /*
  * The taskq_wait() function will block until the taskq is empty.
  * This means that if a taskq re-dispatches work to itself taskq_wait()
  * callers will block indefinitely.
  *
  * "Empty" is the condition tested by taskq_wait_check(): the lowest
  * outstanding task id equals the next id to be assigned.
  */
 void
 taskq_wait(taskq_t *tq)
 {
 	wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
 }
 EXPORT_SYMBOL(taskq_wait);
 
 /*
  * Return non-zero when thread 't' is a worker of taskq 'tq', judged by the
  * taskq pointer stored under taskq_tsd for that thread.
  */
 int
 taskq_member(taskq_t *tq, kthread_t *t)
 {
 	taskq_t *owner = (taskq_t *)tsd_get_by_thread(taskq_tsd, t);
 
 	return (owner == tq);
 }
 EXPORT_SYMBOL(taskq_member);
 
 /*
  * Return the value stored under taskq_tsd for the calling thread.  Taskq
  * worker threads store their owning taskq there while running and reset it
  * to NULL on exit (see taskq_thread()).
  */
 taskq_t *
 taskq_of_curthread(void)
 {
 	return (tsd_get(taskq_tsd));
 }
 EXPORT_SYMBOL(taskq_of_curthread);
 
 /*
  * Cancel an already dispatched task given the task id.  Still pending tasks
  * will be immediately canceled, and if the task is active the function will
  * block until it completes.  Preallocated tasks which are canceled must be
  * freed by the caller.
  *
  * Returns 0 when a queued task was canceled, EBUSY when the task was
  * already executing (after waiting for it to finish), and ENOENT when no
  * task with the given id was found.
  */
 int
 taskq_cancel_id(taskq_t *tq, taskqid_t id)
 {
 	taskq_ent_t *t;
 	int rc = ENOENT;
 	unsigned long flags;
 
 	ASSERT(tq);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	t = taskq_find(tq, id);
 	if (t && t != ERR_PTR(-EBUSY)) {
 		list_del_init(&t->tqent_list);
 		TQSTAT_DEC_LIST(tq, t);
 		TQSTAT_DEC(tq, tasks_total);
 
 		/* task_expire_impl() checks this flag and backs off. */
 		t->tqent_flags |= TQENT_FLAG_CANCEL;
 		TQSTAT_INC(tq, tasks_cancelled);
 
 		/*
 		 * When canceling the lowest outstanding task id we
 		 * must recalculate the new lowest outstanding id.
 		 */
 		if (tq->tq_lowest_id == t->tqent_id) {
 			tq->tq_lowest_id = taskq_lowest_id(tq);
 			ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
 		}
 
 		/*
 		 * The task_expire() function takes the tq->tq_lock so
 		 * drop the lock before synchronously cancelling the timer.
 		 */
 		if (timer_pending(&t->tqent_timer)) {
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 			del_timer_sync(&t->tqent_timer);
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
 		}
 
 		/* Preallocated entries belong to the caller; leave them. */
 		if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
 			task_done(tq, t);
 
 		rc = 0;
 	}
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	/* The task is running: wait, without the lock, for it to finish. */
 	if (t == ERR_PTR(-EBUSY)) {
 		taskq_wait_id(tq, id);
 		rc = EBUSY;
 	}
 
 	return (rc);
 }
 EXPORT_SYMBOL(taskq_cancel_id);
 
 static int taskq_thread_spawn(taskq_t *tq);
 
 /*
  * Queue func(arg) for asynchronous execution on tq using an entry obtained
  * from task_alloc().
  *
  * flags: TQ_NOQUEUE places the task at the head of the priority list and
  * fails when every thread is busy and taskq_thread_spawn() returns 0;
  * TQ_FRONT appends to the priority list; otherwise the task is appended to
  * the pending list.  The flags are also passed on to task_alloc().
  *
  * Returns the id assigned to the task, or TASKQID_INVALID if the taskq is
  * no longer active, the TQ_NOQUEUE condition could not be met, or no entry
  * could be allocated.
  */
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 {
 	taskq_ent_t *t;
 	taskqid_t rc = TASKQID_INVALID;
 	unsigned long irqflags;
 
 	ASSERT(tq);
 	ASSERT(func);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		goto out;
 
 	/* Do not queue the task unless there is idle thread for it */
 	ASSERT(tq->tq_nactive <= tq->tq_nthreads);
 	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
 		/* Dynamic taskq may be able to spawn another thread */
 		if (taskq_thread_spawn(tq) == 0)
 			goto out;
 	}
 
 	/* task_alloc() may drop and re-take tq_lock via irqflags. */
 	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
 		goto out;
 
 	spin_lock(&t->tqent_lock);
 
 	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
 	if (flags & TQ_NOQUEUE) {
 		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add(&t->tqent_list, &tq->tq_prio_list);
 	/* Queue to the priority list instead of the pending list */
 	} else if (flags & TQ_FRONT) {
 		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
 	} else {
 		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
 		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
 	}
 	TQSTAT_INC_LIST(tq, t);
 	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
 	/* Not a delayed task: no timer callback and no expiry. */
 	t->tqent_timer.function = NULL;
 	t->tqent_timer.expires = 0;
 
 	t->tqent_birth = jiffies;
 	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
 
 	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 	spin_unlock(&t->tqent_lock);
 
 	wake_up(&tq->tq_work_waitq);
 
 	TQSTAT_INC(tq, tasks_dispatched);
 
 	/* Spawn additional taskq threads if required. */
 	if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	return (rc);
 }
 EXPORT_SYMBOL(taskq_dispatch);
 
 /*
  * Queue func(arg) on tq's delay list and arm the entry's timer.  When the
  * timer fires, task_expire() moves the task to the priority list so a
  * worker thread can run it.
  *
  * expire_time is copied verbatim into the timer's 'expires' field.
  * NOTE(review): that makes it an absolute time in jiffies rather than a
  * relative delay - confirm against callers.
  *
  * Returns the id assigned to the task, or TASKQID_INVALID if the taskq is
  * no longer active or no entry could be allocated.
  */
 taskqid_t
 taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
     uint_t flags, clock_t expire_time)
 {
 	taskqid_t rc = TASKQID_INVALID;
 	taskq_ent_t *t;
 	unsigned long irqflags;
 
 	ASSERT(tq);
 	ASSERT(func);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		goto out;
 
 	/* task_alloc() may drop and re-take tq_lock via irqflags. */
 	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
 		goto out;
 
 	spin_lock(&t->tqent_lock);
 
 	/* Queue to the delay list for subsequent execution */
 	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
 	TQENT_SET_LIST(t, TQENT_LIST_DELAY);
 	TQSTAT_INC_LIST(tq, t);
 	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
 	t->tqent_timer.function = task_expire;
 	t->tqent_timer.expires = (unsigned long)expire_time;
 	add_timer(&t->tqent_timer);
 
 	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 	spin_unlock(&t->tqent_lock);
 
 	TQSTAT_INC(tq, tasks_dispatched_delayed);
 
 	/* Spawn additional taskq threads if required. */
 	if (tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	return (rc);
 }
 EXPORT_SYMBOL(taskq_dispatch_delay);
 
 /*
  * Dispatch func(arg) on tq using the caller-provided entry 't' instead of
  * allocating one.  The entry is marked TQENT_FLAG_PREALLOC so the taskq
  * never frees it.
  *
  * There is no return value.  If the taskq is no longer active, tqent_id is
  * set to TASKQID_INVALID and the entry is not queued.  With TQ_NOQUEUE,
  * when all threads are busy and taskq_thread_spawn() returns 0, the entry
  * is left untouched and not queued; if a spawn was requested the task is
  * queued on the priority list.
  */
 void
 taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
     taskq_ent_t *t)
 {
 	unsigned long irqflags;
 	ASSERT(tq);
 	ASSERT(func);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
 	    tq->tq_lock_class);
 
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE)) {
 		t->tqent_id = TASKQID_INVALID;
 		goto out;
 	}
 
 	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
 		/* Dynamic taskq may be able to spawn another thread */
 		if (taskq_thread_spawn(tq) == 0)
 			goto out;
 		flags |= TQ_FRONT;
 	}
 
 	spin_lock(&t->tqent_lock);
 
 	/*
 	 * Make sure the entry is not on some other taskq; it is important to
 	 * ASSERT() under lock
 	 */
 	ASSERT(taskq_empty_ent(t));
 
 	/*
 	 * Mark it as a prealloc'd task.  This is important
 	 * to ensure that we don't free it later.
 	 */
 	t->tqent_flags |= TQENT_FLAG_PREALLOC;
 
 	/* Queue to the priority list instead of the pending list */
 	if (flags & TQ_FRONT) {
 		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
 	} else {
 		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
 		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
 	}
 	TQSTAT_INC_LIST(tq, t);
 	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = tq->tq_next_id;
 	tq->tq_next_id++;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
 
 	t->tqent_birth = jiffies;
 	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
 
 	spin_unlock(&t->tqent_lock);
 
 	wake_up(&tq->tq_work_waitq);
 
 	TQSTAT_INC(tq, tasks_dispatched);
 
 	/* Spawn additional taskq threads if required. */
 	if (tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 }
 EXPORT_SYMBOL(taskq_dispatch_ent);
 
 /*
  * Return non-zero when the entry is not linked on any list, i.e. its
  * tqent_list node is empty.
  */
 int
 taskq_empty_ent(taskq_ent_t *t)
 {
 	return (list_empty(&t->tqent_list));
 }
 EXPORT_SYMBOL(taskq_empty_ent);
 
 /*
  * Put a taskq_ent_t into its initial state: no function, argument, flags
  * or owning taskq, id 0, an unlinked list node, and freshly initialized
  * lock, wait queue and timer (the timer has no callback yet).
  */
 void
 taskq_init_ent(taskq_ent_t *t)
 {
 	/* Plain fields. */
 	t->tqent_taskq = NULL;
 	t->tqent_func = NULL;
 	t->tqent_arg = NULL;
 	t->tqent_flags = 0;
 	t->tqent_id = 0;
 
 	/* Embedded kernel objects. */
 	INIT_LIST_HEAD(&t->tqent_list);
 	spin_lock_init(&t->tqent_lock);
 	init_waitqueue_head(&t->tqent_waitq);
 	timer_setup(&t->tqent_timer, NULL, 0);
 }
 EXPORT_SYMBOL(taskq_init_ent);
 
 /*
  * Peek at the task a worker should run next without unlinking it.  The
  * head of the priority list wins over the head of the pending list; NULL
  * is returned when both lists are empty.
  */
 static taskq_ent_t *
 taskq_next_ent(taskq_t *tq)
 {
 	if (!list_empty(&tq->tq_prio_list)) {
 		return (list_entry(tq->tq_prio_list.next, taskq_ent_t,
 		    tqent_list));
 	}
 
 	if (!list_empty(&tq->tq_pend_list)) {
 		return (list_entry(tq->tq_pend_list.next, taskq_ent_t,
 		    tqent_list));
 	}
 
 	return (NULL);
 }
 
 /*
  * Task body used to create one new thread for the taskq passed in 'arg'.
  * If thread creation fails, the tq_nspawn count is decremented again under
  * tq->tq_lock.
  */
 static void
 taskq_thread_spawn_task(void *arg)
 {
 	taskq_t *tq = arg;
 	unsigned long irqflags;
 
 	if (taskq_thread_create(tq) != NULL)
 		return;
 
 	/* restore spawning count if failed */
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	tq->tq_nspawn--;
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 }
 
 /*
  * Spawn an additional thread for a dynamic taskq (TASKQ_DYNAMIC) when the
  * current number of threads is insufficient to handle the pending tasks.
  * These new threads must be created by the dedicated dynamic_taskq to avoid
  * deadlocks between thread creation and memory reclaim.  The system_taskq
  * which is also a dynamic taskq cannot be safely used for this.
  *
  * Callers in this file invoke it with tq->tq_lock held.
  *
  * Returns the updated tq_nspawn (non-zero) when a spawn request was queued
  * on dynamic_taskq.  Returns 0 when the taskq is not dynamic, is inactive,
  * is already at tq_maxthreads counting in-flight spawns, or when the spawn
  * request could not be dispatched.
  */
 static int
 taskq_thread_spawn(taskq_t *tq)
 {
 	int spawning = 0;
 
 	if (!(tq->tq_flags & TASKQ_DYNAMIC))
 		return (0);
 
 	tq->lastspawnstop = jiffies;
 	if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
 	    (tq->tq_flags & TASKQ_ACTIVE)) {
 		spawning = (++tq->tq_nspawn);
 		if (taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
 		    tq, TQ_NOSLEEP) == TASKQID_INVALID) {
 			/*
 			 * The request never reached dynamic_taskq, so
 			 * neither taskq_thread_spawn_task() nor a new
 			 * thread will ever undo the increment.  Undo it
 			 * here; a stale tq_nspawn would keep idle threads
 			 * from exiting and eat into the thread limit.
 			 */
 			tq->tq_nspawn--;
 			spawning = 0;
 		}
 	}
 
 	return (spawning);
 }
 
 /*
  * Threads in a dynamic taskq may exit once there is no more work to do.
  * To prevent threads from being created and destroyed too often limit
  * the exit rate to one per spl_taskq_thread_timeout_ms.
  *
  * The first thread in the thread list is treated as the primary thread.
  * There is nothing special about the primary thread but in order to avoid
  * all the taskq pids from changing we opt to make it long running.
  *
  * Returns 1 if the calling thread 'tqt' should exit, 0 if it should keep
  * running.  Must only be called when no task is queued.
  */
 static int
 taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
 {
 	ASSERT(!taskq_next_ent(tq));
 	/* Non-dynamic taskqs, or dynamic threads disabled: never stop. */
 	if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic)
 		return (0);
 	/* The taskq is being torn down: always stop. */
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		return (1);
 	/* The primary thread stays. */
 	if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
 	    tqt_thread_list) == tqt)
 		return (0);
 	ASSERT3U(tq->tq_nthreads, >, 1);
 	/* Do not retire threads while new ones are still being spawned. */
 	if (tq->tq_nspawn != 0)
 		return (0);
 	/* Too soon after the last spawn or stop. */
 	if (time_before(jiffies, tq->lastspawnstop +
 	    msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
 		return (0);
 	tq->lastspawnstop = jiffies;
 	return (1);
 }
 
 /*
  * Main function of a taskq worker thread.  'args' is the taskq_thread_t
  * allocated by taskq_thread_create(); this function owns it and frees it
  * on every exit path.
  *
  * The thread registers itself with its taskq, then loops: sleep on
  * tq_work_waitq while both the priority and pending lists are empty
  * (possibly exiting via taskq_thread_should_stop()), otherwise take the
  * next task, run it with tq_lock dropped, and update bookkeeping.  The loop
  * ends when kthread_should_stop() becomes true or the thread decides to
  * retire.
  */
 static int
 taskq_thread(void *args)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	sigset_t blocked;
 	taskq_thread_t *tqt = args;
 	taskq_t *tq;
 	taskq_ent_t *t;
 	int seq_tasks = 0;
 	unsigned long flags;
 	taskq_ent_t dup_task = {};
 
 	ASSERT(tqt);
 	ASSERT(tqt->tqt_tq);
 	tq = tqt->tqt_tq;
 	current->flags |= PF_NOFREEZE;
 
 	(void) spl_fstrans_mark();
 
 	/* Block every signal and discard any that are already pending. */
 	sigfillset(&blocked);
 	sigprocmask(SIG_BLOCK, &blocked, NULL);
 	flush_signals(current);
 
 	/* Record the owning taskq for taskq_member()/taskq_of_curthread(). */
 	tsd_set(taskq_tsd, tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	/*
 	 * If we are dynamically spawned, decrease spawning count. Note that
 	 * we could be created during taskq_create, in which case we shouldn't
 	 * do the decrement. But it's fine because taskq_create will reset
 	 * tq_nspawn later.
 	 */
 	if (tq->tq_flags & TASKQ_DYNAMIC)
 		tq->tq_nspawn--;
 
 	/* Immediately exit if more threads than allowed were created. */
 	if (tq->tq_nthreads >= tq->tq_maxthreads)
 		goto error;
 
 	tq->tq_nthreads++;
 	list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
 	wake_up(&tq->tq_wait_waitq);
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	TQSTAT_INC(tq, threads_total);
 
 	while (!kthread_should_stop()) {
 
 		if (list_empty(&tq->tq_pend_list) &&
 		    list_empty(&tq->tq_prio_list)) {
 
 			if (taskq_thread_should_stop(tq, tqt))
 				break;
 
 			/* Nothing to do: sleep until a dispatcher wakes us. */
 			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 			TQSTAT_INC(tq, thread_sleeps);
 			TQSTAT_INC(tq, threads_idle);
 
 			schedule();
 			seq_tasks = 0;
 
 			TQSTAT_DEC(tq, threads_idle);
 			TQSTAT_INC(tq, thread_wakeups);
 
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
 			remove_wait_queue(&tq->tq_work_waitq, &wait);
 		} else {
 			__set_current_state(TASK_RUNNING);
 		}
 
 		if ((t = taskq_next_ent(tq)) != NULL) {
 			list_del_init(&t->tqent_list);
 			TQSTAT_DEC_LIST(tq, t);
 			TQSTAT_DEC(tq, tasks_total);
 
 			/*
 			 * A TQENT_FLAG_PREALLOC task may be reused or freed
 			 * during the task function call. Store tqent_id and
 			 * tqent_flags here.
 			 *
 			 * Also use an on stack taskq_ent_t for tqt_task
 			 * assignment in this case; we want to make sure
 			 * to duplicate all fields, so the values are
 			 * correct when it's accessed via DTRACE_PROBE*.
 			 */
 			tqt->tqt_id = t->tqent_id;
 			tqt->tqt_flags = t->tqent_flags;
 
 			if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
 				dup_task = *t;
 				t = &dup_task;
 			}
 			tqt->tqt_task = t;
 
 			taskq_insert_in_order(tq, tqt);
 			tq->tq_nactive++;
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 			TQSTAT_INC(tq, threads_active);
 			DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
 
 			/* Perform the requested task */
 			t->tqent_func(t->tqent_arg);
 
 			DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
 
 			TQSTAT_DEC(tq, threads_active);
 			/* Anything not marked pending counts as priority. */
 			if ((t->tqent_flags & TQENT_LIST_MASK) ==
 			    TQENT_LIST_PENDING)
 				TQSTAT_INC(tq, tasks_executed_normal);
 			else
 				TQSTAT_INC(tq, tasks_executed_priority);
 			TQSTAT_INC(tq, tasks_executed);
 
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
 
 			tq->tq_nactive--;
 			list_del_init(&tqt->tqt_active_list);
 			tqt->tqt_task = NULL;
 
 			/* For prealloc'd tasks, we don't free anything. */
 			if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
 				task_done(tq, t);
 
 			/*
 			 * When the current lowest outstanding taskqid is
 			 * done calculate the new lowest outstanding id
 			 */
 			if (tq->tq_lowest_id == tqt->tqt_id) {
 				tq->tq_lowest_id = taskq_lowest_id(tq);
 				ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
 			}
 
 			/* Spawn additional taskq threads if required. */
 			if ((++seq_tasks) > spl_taskq_thread_sequential &&
 			    taskq_thread_spawn(tq))
 				seq_tasks = 0;
 
 			tqt->tqt_id = TASKQID_INVALID;
 			tqt->tqt_flags = 0;
 			/* Let taskq_wait*() callers re-check their condition. */
 			wake_up_all(&tq->tq_wait_waitq);
 		} else
 			TQSTAT_INC(tq, thread_wakeups_nowork);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
 	}
 
 	/* Leaving the loop with tq_lock held: unregister this thread. */
 	__set_current_state(TASK_RUNNING);
 	tq->tq_nthreads--;
 	list_del_init(&tqt->tqt_thread_list);
 
 	TQSTAT_DEC(tq, threads_total);
 	TQSTAT_INC(tq, threads_destroyed);
 
 error:
 	/* Reached with tq_lock held, both normally and via 'goto error'. */
 	kmem_free(tqt, sizeof (taskq_thread_t));
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	tsd_set(taskq_tsd, NULL);
 	thread_exit();
 
 	return (0);
 }
 
 /*
  * Allocate a taskq_thread_t for 'tq', create the kernel thread that runs
  * taskq_thread() on it, optionally bind it to a CPU and adjust its nice
  * value, then start it.
  *
  * Returns the new taskq_thread_t, or NULL if the kernel thread could not
  * be created.  Once the thread has been woken it owns the structure and
  * frees it when it exits, so callers should treat the return value only as
  * a success indicator unless they know the thread is still alive.
  */
 static taskq_thread_t *
 taskq_thread_create(taskq_t *tq)
 {
 	/* Last CPU a thread was bound to; advanced round-robin below. */
 	static int last_used_cpu = 0;
 	taskq_thread_t *tqt;
 
 	/*
 	 * NOTE(review): the result is used without a NULL check, which
 	 * presumably relies on KM_PUSHPAGE allocations not failing - confirm.
 	 */
 	tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
 	INIT_LIST_HEAD(&tqt->tqt_thread_list);
 	INIT_LIST_HEAD(&tqt->tqt_active_list);
 	tqt->tqt_tq = tq;
 	tqt->tqt_id = TASKQID_INVALID;
 
 	tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
 	    "%s", tq->tq_name);
 	if (tqt->tqt_thread == NULL) {
 		kmem_free(tqt, sizeof (taskq_thread_t));
 		return (NULL);
 	}
 
 	/* Binding and priority are applied before the thread first runs. */
 	if (spl_taskq_thread_bind) {
 		last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
 		kthread_bind(tqt->tqt_thread, last_used_cpu);
 	}
 
 	if (spl_taskq_thread_priority)
 		set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
 
 	wake_up_process(tqt->tqt_thread);
 
 	TQSTAT_INC(tq, threads_created);
 
 	return (tqt);
 }
 
 /*
  * Initialise every wmsum counter in tq->tq_sums with a starting value
  * of zero.
  */
 static void
 taskq_stats_init(taskq_t *tq)
 {
 	taskq_sums_t *sums = &tq->tq_sums;
 
 	/* One expansion (and so one call site) per counter. */
 #define	TQS_INIT(stat)	wmsum_init(&sums->tqs_##stat, 0)
 	TQS_INIT(threads_active);
 	TQS_INIT(threads_idle);
 	TQS_INIT(threads_total);
 	TQS_INIT(tasks_pending);
 	TQS_INIT(tasks_priority);
 	TQS_INIT(tasks_total);
 	TQS_INIT(tasks_delayed);
 	TQS_INIT(entries_free);
 	TQS_INIT(threads_created);
 	TQS_INIT(threads_destroyed);
 	TQS_INIT(tasks_dispatched);
 	TQS_INIT(tasks_dispatched_delayed);
 	TQS_INIT(tasks_executed_normal);
 	TQS_INIT(tasks_executed_priority);
 	TQS_INIT(tasks_executed);
 	TQS_INIT(tasks_delayed_requeued);
 	TQS_INIT(tasks_cancelled);
 	TQS_INIT(thread_wakeups);
 	TQS_INIT(thread_wakeups_nowork);
 	TQS_INIT(thread_sleeps);
 #undef	TQS_INIT
 }
 
 /*
  * Tear down every wmsum counter in tq->tq_sums, in the same order in
  * which taskq_stats_init() set them up.
  */
 static void
 taskq_stats_fini(taskq_t *tq)
 {
 	taskq_sums_t *sums = &tq->tq_sums;
 
 #define	TQS_FINI(stat)	wmsum_fini(&sums->tqs_##stat)
 	TQS_FINI(threads_active);
 	TQS_FINI(threads_idle);
 	TQS_FINI(threads_total);
 	TQS_FINI(tasks_pending);
 	TQS_FINI(tasks_priority);
 	TQS_FINI(tasks_total);
 	TQS_FINI(tasks_delayed);
 	TQS_FINI(entries_free);
 	TQS_FINI(threads_created);
 	TQS_FINI(threads_destroyed);
 	TQS_FINI(tasks_dispatched);
 	TQS_FINI(tasks_dispatched_delayed);
 	TQS_FINI(tasks_executed_normal);
 	TQS_FINI(tasks_executed_priority);
 	TQS_FINI(tasks_executed);
 	TQS_FINI(tasks_delayed_requeued);
 	TQS_FINI(tasks_cancelled);
 	TQS_FINI(thread_wakeups);
 	TQS_FINI(thread_wakeups_nowork);
 	TQS_FINI(thread_sleeps);
 #undef	TQS_FINI
 }
 
 /*
  * kstat update callback for a per-taskq named kstat.  Refreshes every
  * value in ksp->ks_data from the taskq stored in ksp->ks_private.
  *
  * Returns EACCES for KSTAT_WRITE requests and 0 otherwise.
  */
 static int
 taskq_kstats_update(kstat_t *ksp, int rw)
 {
 	taskq_kstats_t *tqks;
 	taskq_sums_t *tqs;
 	taskq_t *tq;
 
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 	tq = ksp->ks_private;
 	tqks = ksp->ks_data;
 	tqs = &tq->tq_sums;
 
 	/* Configured limits are read straight from the taskq. */
 	tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
 	tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
 	tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;
 
 	/* Everything else is the current value of the matching wmsum. */
 #define	TQKS_COPY(stat)	\
 	tqks->tqks_##stat.value.ui64 = wmsum_value(&tqs->tqs_##stat)
 	TQKS_COPY(threads_active);
 	TQKS_COPY(threads_idle);
 	TQKS_COPY(threads_total);
 	TQKS_COPY(tasks_pending);
 	TQKS_COPY(tasks_priority);
 	TQKS_COPY(tasks_total);
 	TQKS_COPY(tasks_delayed);
 	TQKS_COPY(entries_free);
 	TQKS_COPY(threads_created);
 	TQKS_COPY(threads_destroyed);
 	TQKS_COPY(tasks_dispatched);
 	TQKS_COPY(tasks_dispatched_delayed);
 	TQKS_COPY(tasks_executed_normal);
 	TQKS_COPY(tasks_executed_priority);
 	TQKS_COPY(tasks_executed);
 	TQKS_COPY(tasks_delayed_requeued);
 	TQKS_COPY(tasks_cancelled);
 	TQKS_COPY(thread_wakeups);
 	TQKS_COPY(thread_wakeups_nowork);
 	TQKS_COPY(thread_sleeps);
 #undef	TQKS_COPY
 
 	return (0);
 }
 
 /*
  * Create and install the per-taskq named kstat "taskq/<name>.<instance>".
  *
  * The kstat is virtual: its data buffer is allocated here, seeded from
  * taskq_kstats_template, and refreshed on demand by taskq_kstats_update().
  *
  * On return tq->tq_ksp is either the installed kstat or NULL.  It is set
  * to NULL explicitly when kstat_create() fails, so that
  * taskq_kstats_fini() and other readers of tq_ksp never see an
  * indeterminate pointer.
  */
 static void
 taskq_kstats_init(taskq_t *tq)
 {
 	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
 	snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);
 
 	kstat_t *ksp = kstat_create("taskq", 0, name, "misc",
 	    KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	if (ksp == NULL) {
 		tq->tq_ksp = NULL;
 		return;
 	}
 
 	ksp->ks_private = tq;
 	ksp->ks_update = taskq_kstats_update;
 	ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
 	memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
 	kstat_install(ksp);
 
 	/* Publish only once the kstat is fully set up and installed. */
 	tq->tq_ksp = ksp;
 }
 
 /*
  * Release the per-taskq kstat, if one was installed: free its data
  * buffer, delete the kstat and clear tq->tq_ksp.
  */
 static void
 taskq_kstats_fini(taskq_t *tq)
 {
 	if (tq->tq_ksp != NULL) {
 		kmem_free(tq->tq_ksp->ks_data, sizeof (taskq_kstats_t));
 		kstat_delete(tq->tq_ksp);
 
 		tq->tq_ksp = NULL;
 	}
 }
 
 /*
  * Create a taskq called 'name' and start its worker threads.
  *
  *   threads_arg  number of threads, or a percentage (0-100) of the online
  *                CPUs when TASKQ_THREADS_CPU_PCT is set in flags
  *   pri          priority recorded in tq_pri for the worker threads
  *   minalloc     number of entries pre-allocated under TASKQ_PREPOPULATE
  *   maxalloc     recorded in tq_maxalloc
  *   flags        TASKQ_* flags; TASKQ_CPR_SAFE is not supported
  *
  * When TASKQ_DYNAMIC is set and spl_taskq_thread_dynamic is enabled only a
  * single thread is started here.
  *
  * Returns the new taskq, or NULL if the allocation, the CPU hotplug
  * registration or the creation of any worker thread fails.
  */
 taskq_t *
 taskq_create(const char *name, int threads_arg, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
 {
 	taskq_t *tq;
 	taskq_thread_t *tqt;
 	int count = 0, rc = 0, i;
 	unsigned long irqflags;
 	int nthreads = threads_arg;
 
 	ASSERT(name != NULL);
 	ASSERT(minalloc >= 0);
 	ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
 
 	/* Scale the number of threads using nthreads as a percentage */
 	if (flags & TASKQ_THREADS_CPU_PCT) {
 		ASSERT(nthreads <= 100);
 		ASSERT(nthreads >= 0);
 		nthreads = MIN(threads_arg, 100);
 		nthreads = MAX(nthreads, 0);
 		nthreads = MAX((num_online_cpus() * nthreads) /100, 1);
 	}
 
 	tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
 	if (tq == NULL)
 		return (NULL);
 
 	tq->tq_hp_support = B_FALSE;
 
 	if (flags & TASKQ_THREADS_CPU_PCT) {
 		tq->tq_hp_support = B_TRUE;
 		if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state,
 		    &tq->tq_hp_cb_node) != 0) {
 			kmem_free(tq, sizeof (*tq));
 			return (NULL);
 		}
 	}
 
 	spin_lock_init(&tq->tq_lock);
 	INIT_LIST_HEAD(&tq->tq_thread_list);
 	INIT_LIST_HEAD(&tq->tq_active_list);
 	tq->tq_name = kmem_strdup(name);
 	tq->tq_nactive = 0;
 	tq->tq_nthreads = 0;
 	tq->tq_nspawn = 0;
 	tq->tq_maxthreads = nthreads;
 	tq->tq_cpu_pct = threads_arg;
 	tq->tq_pri = pri;
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
 	tq->tq_nalloc = 0;
 	tq->tq_flags = (flags | TASKQ_ACTIVE);
 	tq->tq_next_id = TASKQID_INITIAL;
 	tq->tq_lowest_id = TASKQID_INITIAL;
 	tq->lastspawnstop = jiffies;
 	INIT_LIST_HEAD(&tq->tq_free_list);
 	INIT_LIST_HEAD(&tq->tq_pend_list);
 	INIT_LIST_HEAD(&tq->tq_prio_list);
 	INIT_LIST_HEAD(&tq->tq_delay_list);
 	init_waitqueue_head(&tq->tq_work_waitq);
 	init_waitqueue_head(&tq->tq_wait_waitq);
 	tq->tq_lock_class = TQ_LOCK_GENERAL;
 	INIT_LIST_HEAD(&tq->tq_taskqs);
 	/*
 	 * The taskq was not zero-allocated, and kstats are only installed
 	 * at the very end.  Start with no kstat so that the taskq_destroy()
 	 * call on the failure path below, and anything walking tq_list
 	 * before taskq_kstats_init() has run, sees NULL rather than an
 	 * indeterminate pointer.
 	 */
 	tq->tq_ksp = NULL;
 	taskq_stats_init(tq);
 
 	if (flags & TASKQ_PREPOPULATE) {
 		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
 		    tq->tq_lock_class);
 
 		for (i = 0; i < minalloc; i++)
 			task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
 			    &irqflags));
 
 		spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	}
 
 	if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
 		nthreads = 1;
 
 	for (i = 0; i < nthreads; i++) {
 		tqt = taskq_thread_create(tq);
 		if (tqt == NULL)
 			rc = 1;
 		else
 			count++;
 	}
 
 	/* Wait for all threads to be started before potential destroy */
 	wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
 	/*
 	 * taskq_thread might have touched nspawn, but we don't want them to
 	 * because they're not dynamically spawned. So we reset it to 0
 	 */
 	tq->tq_nspawn = 0;
 
 	if (rc) {
 		taskq_destroy(tq);
 		return (NULL);
 	}
 
 	down_write(&tq_list_sem);
 	tq->tq_instance = taskq_find_by_name(name) + 1;
 	list_add_tail(&tq->tq_taskqs, &tq_list);
 	up_write(&tq_list_sem);
 
 	/* Install kstats late, because the name includes tq_instance */
 	taskq_kstats_init(tq);
 
 	return (tq);
 }
 EXPORT_SYMBOL(taskq_create);
 
 /*
  * Destroy a taskq: stop accepting work, wait for outstanding tasks, stop
  * every worker thread, free the cached entries and release the taskq.
  *
  * The caller must not use 'tq' after this returns.
  */
 void
 taskq_destroy(taskq_t *tq)
 {
 	struct task_struct *thread;
 	taskq_thread_t *tqt;
 	taskq_ent_t *t;
 	unsigned long flags;
 
 	ASSERT(tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	tq->tq_flags &= ~TASKQ_ACTIVE;
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	if (tq->tq_hp_support) {
 		VERIFY0(cpuhp_state_remove_instance_nocalls(
 		    spl_taskq_cpuhp_state, &tq->tq_hp_cb_node));
 	}
 
 	/*
 	 * When TASKQ_ACTIVE is clear new tasks may not be added nor may
 	 * new worker threads be spawned for dynamic taskq.
 	 */
 	if (dynamic_taskq != NULL)
 		taskq_wait_outstanding(dynamic_taskq, 0);
 
 	taskq_wait(tq);
 
 	/*
 	 * Remove the taskq from the global list used by the summary kstat
 	 * before tearing down its own kstat.  The summary reader walks
 	 * tq_list under tq_list_sem and dereferences tq->tq_ksp, so the
 	 * taskq must be unreachable from the list before tq_ksp is freed
 	 * and cleared.
 	 */
 	down_write(&tq_list_sem);
 	list_del(&tq->tq_taskqs);
 	up_write(&tq_list_sem);
 
 	taskq_kstats_fini(tq);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	/* wait for spawning threads to insert themselves to the list */
 	while (tq->tq_nspawn) {
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		schedule_timeout_interruptible(1);
 		spin_lock_irqsave_nested(&tq->tq_lock, flags,
 		    tq->tq_lock_class);
 	}
 
 	/*
 	 * Signal each thread to exit and block until it does.  Each thread
 	 * is responsible for removing itself from the list and freeing its
 	 * taskq_thread_t.  This allows for idle threads to opt to remove
 	 * themselves from the taskq.  They can be recreated as needed.
 	 */
 	while (!list_empty(&tq->tq_thread_list)) {
 		tqt = list_entry(tq->tq_thread_list.next,
 		    taskq_thread_t, tqt_thread_list);
 		thread = tqt->tqt_thread;
 		/* kthread_stop() blocks, so tq_lock is dropped around it. */
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 		kthread_stop(thread);
 
 		spin_lock_irqsave_nested(&tq->tq_lock, flags,
 		    tq->tq_lock_class);
 	}
 
 	/* Free every cached entry left on the free list. */
 	while (!list_empty(&tq->tq_free_list)) {
 		t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
 
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 		list_del_init(&t->tqent_list);
 		task_free(tq, t);
 	}
 
 	ASSERT0(tq->tq_nthreads);
 	ASSERT0(tq->tq_nalloc);
 	ASSERT0(tq->tq_nspawn);
 	ASSERT(list_empty(&tq->tq_thread_list));
 	ASSERT(list_empty(&tq->tq_active_list));
 	ASSERT(list_empty(&tq->tq_free_list));
 	ASSERT(list_empty(&tq->tq_pend_list));
 	ASSERT(list_empty(&tq->tq_prio_list));
 	ASSERT(list_empty(&tq->tq_delay_list));
 
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	taskq_stats_fini(tq);
 	kmem_strfree(tq->tq_name);
 	kmem_free(tq, sizeof (taskq_t));
 }
 EXPORT_SYMBOL(taskq_destroy);
 
 /*
  * Create a taskq with exactly nthreads pool threads and report the kernel
  * thread behind each of them.  On return *ktpp points to a newly allocated
  * array of nthreads kthread_t pointers, one per pool thread, in no
  * particular order.  The caller must free that array.
  *
  * TASKQ_DYNAMIC, TASKQ_THREADS_CPU_PCT and TASKQ_DC_BATCH are stripped
  * from flags and TASKQ_PREPOPULATE is always added.
  *
  * NOTE(review): pri, minalloc and maxalloc are not forwarded; the taskq is
  * always created with minclsyspri, nthreads and INT_MAX.  Confirm that
  * this is intended.
  */
 taskq_t *
 taskq_create_synced(const char *name, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
 {
 	kthread_t **kthreads;
 	taskq_thread_t *tqt;
 	taskq_t *tq;
 	int found = 0;
 
 	kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads, KM_SLEEP);
 
 	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
 	flags |= TASKQ_PREPOPULATE;
 
 	/* taskq_create spawns all the threads before returning */
 	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
 	    flags);
 	VERIFY(tq != NULL);
 	VERIFY(tq->tq_nthreads == nthreads);
 
 	/* Record the kernel thread of every worker on the thread list. */
 	list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list)
 		kthreads[found++] = tqt->tqt_thread;
 
 	ASSERT3S(found, ==, nthreads);
 	*ktpp = kthreads;
 
 	return (tq);
 }
 EXPORT_SYMBOL(taskq_create_synced);
 
 /* The "taskq/summary" raw kstat; NULL while it is not installed. */
 static kstat_t *taskq_summary_ksp = NULL;
 
 /*
  * Raw kstat header callback: write the three-line column header of the
  * taskq summary table into buf.
  *
  * Returns 0 on success, or ENOMEM when the header does not fit in size
  * bytes (including the terminating NUL).
  */
 static int
 spl_taskq_kstat_headers(char *buf, size_t size)
 {
 	size_t written;
 
 	written = snprintf(buf, size,
 	    "%-20s | %-17s | %-23s\n"
 	    "%-20s | %-17s | %-23s\n"
 	    "%-20s | %-17s | %-23s\n",
 	    "", "threads", "tasks on queue",
 	    "taskq name", "tot [act idl] max", " pend [ norm  high] dly",
 	    "--------------------", "-----------------",
 	    "-----------------------");
 
 	if (written < size)
 		return (0);
 
 	return (ENOMEM);
 }
 
 /*
  * Raw kstat data callback for the summary kstat.  Walks tq_list from tail
  * to head under tq_list_sem and appends one formatted row per taskq to
  * buf: name.instance, thread counts and queued task counts.
  *
  * Taskqs without an installed kstat (tq_ksp == NULL) are skipped.  That
  * happens for a taskq which is already on tq_list but whose kstat has not
  * been installed yet, or whose kstat_create() failed.
  *
  * Returns 0 on success, or ENOMEM when buf is too small to hold all rows.
  */
 static int
 spl_taskq_kstat_data(char *buf, size_t size, void *data)
 {
 	struct list_head *tql = NULL;
 	taskq_t *tq;
 	kstat_t *ksp;
 	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
 	char threads[25];
 	char tasks[30];
 	size_t n;
 	int err = 0;
 
 	down_read(&tq_list_sem);
 	list_for_each_prev(tql, &tq_list) {
 		tq = list_entry(tql, taskq_t, tq_taskqs);
 
 		/* No per-taskq kstat to read from; nothing to report. */
 		ksp = tq->tq_ksp;
 		if (ksp == NULL)
 			continue;
 
 		/* Refresh and read the values under the kstat's own lock. */
 		mutex_enter(ksp->ks_lock);
 		taskq_kstats_update(ksp, KSTAT_READ);
 		taskq_kstats_t *tqks = ksp->ks_data;
 
 		snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
 		    tq->tq_instance);
 		snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
 		    tqks->tqks_threads_total.value.ui64,
 		    tqks->tqks_threads_active.value.ui64,
 		    tqks->tqks_threads_idle.value.ui64,
 		    tqks->tqks_threads_max.value.ui64);
 		snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
 		    tqks->tqks_tasks_total.value.ui64,
 		    tqks->tqks_tasks_pending.value.ui64,
 		    tqks->tqks_tasks_priority.value.ui64,
 		    tqks->tqks_tasks_delayed.value.ui64);
 
 		mutex_exit(ksp->ks_lock);
 
 		n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
 		    name, threads, tasks);
 		if (n >= size) {
 			err = ENOMEM;
 			break;
 		}
 
 		/* Advance past the row just written. */
 		buf = &buf[n];
 		size -= n;
 	}
 
 	up_read(&tq_list_sem);
 
 	return (err);
 }
 
 /*
  * Create and install the raw "taskq/summary" kstat, served by
  * spl_taskq_kstat_headers() and spl_taskq_kstat_data().  If the kstat
  * cannot be created, taskq_summary_ksp is left untouched.
  */
 static void
 spl_taskq_kstat_init(void)
 {
 	kstat_t *summary;
 
 	summary = kstat_create("taskq", 0, "summary", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 
 	if (summary != NULL) {
 		summary->ks_data = (void *)(uintptr_t)1;
 		summary->ks_ndata = 1;
 		kstat_set_raw_ops(summary, spl_taskq_kstat_headers,
 		    spl_taskq_kstat_data, NULL);
 		kstat_install(summary);
 
 		taskq_summary_ksp = summary;
 	}
 }
 
 /*
  * Delete the summary kstat, if it was installed, and forget it.
  */
 static void
 spl_taskq_kstat_fini(void)
 {
 	if (taskq_summary_ksp != NULL) {
 		kstat_delete(taskq_summary_ksp);
 		taskq_summary_ksp = NULL;
 	}
 }
 
 /*
  * Backing store for the spl_taskq_kick module parameter.  The setter
  * resets it to 0 after acting on a nonzero write.
  */
 static unsigned int spl_taskq_kick = 0;
 
 /*
  * Setter for the spl_taskq_kick module parameter.  A nonzero write walks
  * every taskq on tq_list and, for each one whose next entry has been
  * waiting for more than five seconds, tries to spawn another worker
  * thread and logs the kick.  The parameter is reset to 0 afterwards.
  *
  * Returns the result of param_set_uint().
  *
  * 2.6.36 API Change
  * module_param_cb is introduced to take kernel_param_ops and
  * module_param_call is marked as obsolete. Also set and get operations
  * were changed to take a 'const struct kernel_param *'.
  */
 static int
 #ifdef module_param_cb
 param_set_taskq_kick(const char *val, const struct kernel_param *kp)
 #else
 param_set_taskq_kick(const char *val, struct kernel_param *kp)
 #endif
 {
 	unsigned long irqflags;
 	taskq_ent_t *oldest;
 	taskq_t *tq = NULL;
 	int rc = param_set_uint(val, kp);
 
 	if (rc < 0)
 		return (rc);
 	if (spl_taskq_kick == 0)
 		return (rc);
 
 	/* reset value */
 	spl_taskq_kick = 0;
 
 	down_read(&tq_list_sem);
 	list_for_each_entry(tq, &tq_list, tq_taskqs) {
 		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
 		    tq->tq_lock_class);
 
 		/* Check if the first pending is older than 5 seconds */
 		oldest = taskq_next_ent(tq);
 		if (oldest && time_after(jiffies,
 		    oldest->tqent_birth + 5*HZ)) {
 			(void) taskq_thread_spawn(tq);
 			printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
 			    tq->tq_name, tq->tq_instance);
 		}
 
 		spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	}
 	up_read(&tq_list_sem);
 
 	return (rc);
 }
 
 /*
  * Register spl_taskq_kick (mode 0644) with param_set_taskq_kick() as its
  * setter and param_get_uint() as its getter, using whichever registration
  * interface is available.
  */
 #ifdef module_param_cb
 static const struct kernel_param_ops param_ops_taskq_kick = {
 	.set = param_set_taskq_kick,
 	.get = param_get_uint,
 };
 module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
 #else
 module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
 	&spl_taskq_kick, 0644);
 #endif
 MODULE_PARM_DESC(spl_taskq_kick,
 	"Write nonzero to kick stuck taskqs to spawn more threads");
 
 /*
  * CPU hotplug "online" callback, invoked once per registered taskq for
  * each CPU that comes online.  Only taskqs created with
  * TASKQ_THREADS_CPU_PCT register themselves.  The thread limit is
  * recomputed from the stored percentage every time, counting the
  * incoming CPU as num_online_cpus() + 1, to decide whether a thread has
  * to be added.
  *
  * Taskqs that are being destroyed are left alone.  A thread is created
  * here only when the taskq is not managed dynamically.
  *
  * Returns 0 on success, or -1 if the extra thread could not be created.
  */
 static int
 spl_taskq_expand(unsigned int cpu, struct hlist_node *node)
 {
 	taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
 	unsigned long flags;
 	int need_thread;
 	int nthreads;
 
 	ASSERT(tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 
 	if (!(tq->tq_flags & TASKQ_ACTIVE)) {
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		return (0);
 	}
 
 	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
 	nthreads = MIN(tq->tq_cpu_pct, 100);
 	nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1);
 	tq->tq_maxthreads = nthreads;
 
 	/* Decide under the lock, create the thread after dropping it. */
 	need_thread =
 	    !((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
 	    tq->tq_maxthreads > tq->tq_nthreads;
 
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	if (need_thread && taskq_thread_create(tq) == NULL)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * While we don't support offlining CPUs, it is possible that CPUs will fail
  * to online successfully. We do need to be able to handle this case
  * gracefully.
  */
 static int
 spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node)
 {
 	taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
 	unsigned long flags;
 
 	ASSERT(tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		goto out;
 
 	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
 	int nthreads = MIN(tq->tq_cpu_pct, 100);
 	nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1);
 	tq->tq_maxthreads = nthreads;
 
 	if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
 	    tq->tq_maxthreads < tq->tq_nthreads) {
 		ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1);
 		taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next,
 		    taskq_thread_t, tqt_thread_list);
 		struct task_struct *thread = tqt->tqt_thread;
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 		kthread_stop(thread);
 
 		return (0);
 	}
 
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 	return (0);
 }
 
 /*
  * Module-level taskq setup: initialise the global taskq list lock and the
  * taskq TSD key, register the CPU hotplug callbacks, create the system,
  * delay and dynamic taskqs, and install the summary kstat.
  *
  * Returns 0 on success or -ENOMEM if any of the three taskqs cannot be
  * created.  On failure everything set up so far is undone in the same
  * order spl_taskq_fini() uses, and the global taskq pointers are left
  * NULL.
  */
 int
 spl_taskq_init(void)
 {
 	init_rwsem(&tq_list_sem);
 	tsd_create(&taskq_tsd, NULL);
 
 	spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
 	    "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down);
 
 	system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 	if (system_taskq == NULL)
 		goto fail;
 
 	system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 	if (system_delay_taskq == NULL)
 		goto fail_system;
 
 	dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
 	if (dynamic_taskq == NULL)
 		goto fail_delay;
 
 	/*
 	 * This is used to annotate tq_lock, so
 	 *   taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
 	 * does not trigger a lockdep warning re: possible recursive locking
 	 */
 	dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
 
 	spl_taskq_kstat_init();
 
 	return (0);
 
 fail_delay:
 	taskq_destroy(system_delay_taskq);
 	system_delay_taskq = NULL;
 fail_system:
 	taskq_destroy(system_taskq);
 	system_taskq = NULL;
 fail:
 	/* Release the TSD key only after the taskq threads have exited. */
 	tsd_destroy(&taskq_tsd);
 	cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
 	spl_taskq_cpuhp_state = 0;
 	return (-ENOMEM);
 }
 
 /*
  * Module-level taskq teardown: remove the summary kstat, destroy the
  * dynamic, delay and system taskqs (clearing each global pointer), then
  * release the taskq TSD key and the CPU hotplug state.
  */
 void
 spl_taskq_fini(void)
 {
 	spl_taskq_kstat_fini();
 
 	/*
 	 * dynamic_taskq is destroyed first and set to NULL, so the
 	 * taskq_destroy() calls below skip their wait on it.
 	 */
 	taskq_destroy(dynamic_taskq);
 	dynamic_taskq = NULL;
 
 	taskq_destroy(system_delay_taskq);
 	system_delay_taskq = NULL;
 
 	taskq_destroy(system_taskq);
 	system_taskq = NULL;
 
 	tsd_destroy(&taskq_tsd);
 
 	cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
 	spl_taskq_cpuhp_state = 0;
 }
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 04ab8bbca352..39ea3e62dba0 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -1,1352 +1,1351 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 by Delphix. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 /*
  * See abd.c for a general overview of the arc buffered data (ABD).
  *
  * Linear buffers act exactly like normal buffers and are always mapped into the
  * kernel's virtual memory space, while scattered ABD data chunks are allocated
  * as physical pages and then mapped in only while they are actually being
  * accessed through one of the abd_* library functions. Using scattered ABDs
  * provides several benefits:
  *
  *  (1) They avoid use of kmem_*, preventing performance problems where running
  *      kmem_reap on very large memory systems never finishes and causes
  *      constant TLB shootdowns.
  *
  *  (2) Fragmentation is less of an issue since when we are at the limit of
  *      allocatable space, we won't have to search around for a long free
  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
  *      individually, so even if we are using HIGHMEM (see next point) we
  *      wouldn't need to worry about finding a contiguous address range.
  *
  *  (3) If we are not using HIGHMEM, then all physical memory is always
  *      mapped into the kernel's address space, so we also avoid the map /
  *      unmap costs on each ABD access.
  *
  * If we are not using HIGHMEM, scattered buffers which have only one chunk
  * can be treated as linear buffers, because they are contiguous in the
  * kernel's virtual address space.  See abd_alloc_chunks() for details.
  */
 
 #include <sys/abd_impl.h>
 #include <sys/param.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_znode.h>
 #include <linux/kmap_compat.h>
 #include <linux/mm_compat.h>
 #include <linux/scatterlist.h>
 #include <linux/version.h>
 
 #if defined(MAX_ORDER)
 #define	ABD_MAX_ORDER	(MAX_ORDER)
 #elif defined(MAX_PAGE_ORDER)
 #define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
 #endif
 
 /*
  * Layout of the named kstat entries exported for ABDs.  The meaning of
  * each entry is documented on the abd_stats initializer.
  */
 typedef struct abd_stats {
 	kstat_named_t abdstat_struct_size;
 	kstat_named_t abdstat_linear_cnt;
 	kstat_named_t abdstat_linear_data_size;
 	kstat_named_t abdstat_scatter_cnt;
 	kstat_named_t abdstat_scatter_data_size;
 	kstat_named_t abdstat_scatter_chunk_waste;
 	/* One entry per allocation order, ABD_MAX_ORDER entries in total. */
 	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
 	kstat_named_t abdstat_scatter_page_multi_chunk;
 	kstat_named_t abdstat_scatter_page_multi_zone;
 	kstat_named_t abdstat_scatter_page_alloc_retry;
 	kstat_named_t abdstat_scatter_sg_table_retry;
 } abd_stats_t;
 
 /*
  * The ABD named kstat entries: the name and data type of each statistic,
  * all of them 64-bit unsigned.
  */
 static abd_stats_t abd_stats = {
 	/* Amount of memory occupied by all of the abd_t struct allocations */
 	{ "struct_size",			KSTAT_DATA_UINT64 },
 	/*
 	 * The number of linear ABDs which are currently allocated, excluding
 	 * ABDs which don't own their data (for instance the ones which were
 	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
 	 * ABD takes ownership of its buf then it will become tracked.
 	 */
 	{ "linear_cnt",				KSTAT_DATA_UINT64 },
 	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
 	{ "linear_data_size",			KSTAT_DATA_UINT64 },
 	/*
 	 * The number of scatter ABDs which are currently allocated, excluding
 	 * ABDs which don't own their data (for instance the ones which were
 	 * allocated through abd_get_offset()).
 	 */
 	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
 	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
 	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
 	/*
 	 * The amount of space wasted at the end of the last chunk across all
 	 * scatter ABDs tracked by scatter_cnt.
 	 */
 	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
 	/*
 	 * The number of compound allocations of a given order.  These
 	 * allocations are spread over all currently allocated ABDs, and
 	 * act as a measure of memory fragmentation.
 	 *
 	 * NOTE(review): only the first array element is initialized here,
 	 * with a placeholder name; presumably the per-order names are
 	 * filled in elsewhere -- confirm.
 	 */
 	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
 	/*
 	 * The number of scatter ABDs which contain multiple chunks.
 	 * ABDs are preferentially allocated from the minimum number of
 	 * contiguous multi-page chunks, a single chunk is optimal.
 	 */
 	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
 	/*
 	 * The number of scatter ABDs which are split across memory zones.
 	 * ABDs are preferentially allocated using pages from a single zone.
 	 */
 	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
 	/*
 	 * The total number of retries encountered when attempting to
 	 * allocate the pages to populate the scatter ABD.
 	 */
 	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
 	/*
 	 * The total number of retries encountered when attempting to
 	 * allocate the sg table for an ABD.
 	 */
 	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
 };
 
 /*
  * wmsum counters with the same member names as abd_stats_t, one per
  * exported statistic.
  */
 static struct {
 	wmsum_t abdstat_struct_size;
 	wmsum_t abdstat_linear_cnt;
 	wmsum_t abdstat_linear_data_size;
 	wmsum_t abdstat_scatter_cnt;
 	wmsum_t abdstat_scatter_data_size;
 	wmsum_t abdstat_scatter_chunk_waste;
 	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
 	wmsum_t abdstat_scatter_page_multi_chunk;
 	wmsum_t abdstat_scatter_page_multi_zone;
 	wmsum_t abdstat_scatter_page_alloc_retry;
 	wmsum_t abdstat_scatter_sg_table_retry;
 } abd_sums;
 
 /*
  * Iterate over the first n scatterlist entries of a scatter ABD; a thin
  * wrapper around for_each_sg() starting at the ABD's abd_sgl.
  */
 #define	abd_for_each_sg(abd, sg, n, i)	\
 	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
 
 /*
  * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
  * ABD's.  Smaller allocations will use linear ABD's which uses
  * zio_[data_]buf_alloc().
  *
  * Scatter ABD's use at least one page each, so sub-page allocations waste
  * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
  * half of each page).  Using linear ABD's for small allocations means that
  * they will be put on slabs which contain many allocations.  This can
  * improve memory efficiency, but it also makes it much harder for ARC
  * evictions to actually free pages, because all the buffers on one slab need
  * to be freed in order for the slab (and underlying pages) to be freed.
  * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
  * possible for them to actually waste more memory than scatter (one page per
  * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
  *
  * Spill blocks are typically 512B and are heavily used on systems running
  * selinux with the default dnode size and the `xattr=sa` property set.
  *
  * By default we use linear allocations for 512B and 1KB, and scatter
  * allocations for larger (1.5KB and up).
  */
 static int zfs_abd_scatter_min_size = 512 * 3;
 
 /*
  * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
  * just a single zero'd page. This allows us to conserve memory by
  * only using a single zero page for the scatterlist.
  */
 abd_t *abd_zero_scatter = NULL;
 
 struct page;
 
 /*
  * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
  * point to ZERO_PAGE if it is available or it will be an allocated zero'd
  * PAGESIZE buffer.
  */
 static struct page *abd_zero_page = NULL;
 
 static kmem_cache_t *abd_cache = NULL;
 static kstat_t *abd_ksp;
 
 /*
  * Return the number of PAGESIZE chunks needed to hold size bytes, i.e.
  * size rounded up to a whole number of pages, expressed in pages.
  */
 static uint_t
 abd_chunkcnt_for_bytes(size_t size)
 {
 	uint_t chunks = P2ROUNDUP(size, PAGESIZE) / PAGESIZE;
 
 	return (chunks);
 }
 
 /*
  * Allocate an abd_t from abd_cache and account for it in the
  * struct_size statistic.
  *
  * The size argument is accepted for interface compatibility only; on
  * Linux the abd_t allocation does not depend on it.
  */
 abd_t *
 abd_alloc_struct_impl(size_t size)
 {
 	abd_t *abd;
 
 	/* Unused on Linux, see above. */
 	(void) size;
 
 	abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
 	ASSERT3P(abd, !=, NULL);
 	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
 
 	return (abd);
 }
 
 /*
  * Return an abd_t to abd_cache and subtract its size from the
  * struct_size statistic.
  */
 void
 abd_free_struct_impl(abd_t *abd)
 {
 	kmem_cache_free(abd_cache, abd);
 	/* Negative increment: undoes the accounting done at allocation. */
 	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
 }
 
 /*
  * Upper bound on the page allocation order tried by abd_alloc_chunks();
  * it is clamped there to ABD_MAX_ORDER - 1.
  */
 static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
 
 /*
  * Mark zfs data pages so they can be excluded from kernel crash dumps
  */
 #ifdef _LP64
 #define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E
 
 /*
  * Tag a page as ZFS data: take a reference on it, set its Private flag
  * and store ABD_FILE_CACHE_PAGE as its private value.
  */
 static inline void
 abd_mark_zfs_page(struct page *page)
 {
 	get_page(page);
 	SetPagePrivate(page);
 	set_page_private(page, ABD_FILE_CACHE_PAGE);
 }
 
 /*
  * Undo abd_mark_zfs_page(): clear the private value and the Private
  * flag, then drop the page reference taken when the page was marked.
  */
 static inline void
 abd_unmark_zfs_page(struct page *page)
 {
 	set_page_private(page, 0UL);
 	ClearPagePrivate(page);
 	put_page(page);
 }
 #else
 #define	abd_mark_zfs_page(page)
 #define	abd_unmark_zfs_page(page)
 #endif /* _LP64 */
 
 #ifndef CONFIG_HIGHMEM
 
 #ifndef __GFP_RECLAIM
 #define	__GFP_RECLAIM		__GFP_WAIT
 #endif
 
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone.  Allocation size is
  * progressively decreased until it can be satisfied without performing
  * reclaim or compaction.  When necessary this function will degenerate to
  * allocating individual pages and allowing reclaim to satisfy allocations.
  */
 /*
  * Populate 'abd' with enough pages to hold 'size' bytes (non-HIGHMEM
  * variant).  Pages are gathered on a local list, preferring the largest
  * order that fits, then described by a freshly allocated sg table.
  *
  * If everything fits in one chunk the ABD is flagged linear and points
  * directly at the chunk's kernel address.  Otherwise it is flagged as a
  * multi-chunk scatter ABD.  Allocation failures are retried rather than
  * reported, so this function has no failure return.
  */
 void
 abd_alloc_chunks(abd_t *abd, size_t size)
 {
 	struct list_head pages;
 	struct sg_table table;
 	struct scatterlist *sg;
 	struct page *page, *tmp_page = NULL;
 	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
 	/* Used for order > 0 attempts: no retry, and reclaim is masked out. */
 	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
 	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
 	    ABD_MAX_ORDER - 1);
 	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
 	unsigned int chunks = 0, zones = 0;
 	size_t remaining_size;
 	int nid = NUMA_NO_NODE;
 	unsigned int alloc_pages = 0;
 
 	INIT_LIST_HEAD(&pages);
 
 	/* alloc_pages is 0 here, so this asserts that nr_pages is non-zero. */
 	ASSERT3U(alloc_pages, <, nr_pages);
 
 	while (alloc_pages < nr_pages) {
 		unsigned int chunk_pages;
 		unsigned int order;
 
 		/*
 		 * Order derived from the number of pages still needed,
 		 * capped at max_order.
 		 * NOTE(review): assumes highbit64() returns the 1-based
 		 * index of the highest set bit -- confirm.
 		 */
 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 		chunk_pages = (1U << order);
 
 		/* Node of the previous chunk, or no preference at first. */
 		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
 		if (page == NULL) {
 			if (order == 0) {
 				/* Single page failed: count, sleep, retry. */
 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 				schedule_timeout_interruptible(1);
 			} else {
 				/* Lower the cap and try a smaller chunk. */
 				max_order = MAX(0, order - 1);
 			}
 			continue;
 		}
 
 		list_add_tail(&page->lru, &pages);
 
 		/* Count each change of node between consecutive chunks. */
 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
 			zones++;
 
 		nid = page_to_nid(page);
 		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
 		chunks++;
 		alloc_pages += chunk_pages;
 	}
 
 	ASSERT3S(alloc_pages, ==, nr_pages);
 
 	/* Keep retrying until the sg table can be allocated. */
 	while (sg_alloc_table(&table, chunks, gfp)) {
 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 		schedule_timeout_interruptible(1);
 	}
 
 	/*
 	 * Move each chunk from the local list into its sg entry.  An entry
 	 * covers the whole chunk unless fewer bytes remain to be described.
 	 */
 	sg = table.sgl;
 	remaining_size = size;
 	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
 		size_t sg_size = MIN(PAGESIZE << compound_order(page),
 		    remaining_size);
 		sg_set_page(sg, page, sg_size, 0);
 		abd_mark_zfs_page(page);
 		remaining_size -= sg_size;
 
 		sg = sg_next(sg);
 		list_del(&page->lru);
 	}
 
 	/*
 	 * These conditions ensure that a possible transformation to a linear
 	 * ABD would be valid.
 	 */
 	ASSERT(!PageHighMem(sg_page(table.sgl)));
 	ASSERT0(ABD_SCATTER(abd).abd_offset);
 
 	if (table.nents == 1) {
 		/*
 		 * Since there is only one entry, this ABD can be represented
 		 * as a linear buffer.  All single-page (4K) ABD's can be
 		 * represented this way.  Some multi-page ABD's can also be
 		 * represented this way, if we were able to allocate a single
 		 * "chunk" (higher-order "page" which represents a power-of-2
 		 * series of physically-contiguous pages).  This is often the
 		 * case for 2-page (8K) ABD's.
 		 *
 		 * Representing a single-entry scatter ABD as a linear ABD
 		 * has the performance advantage of avoiding the copy (and
 		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
 		 * A performance increase of around 5% has been observed for
 		 * ARC-cached reads (of small blocks which can take advantage
 		 * of this).
 		 *
 		 * Note that this optimization is only possible because the
 		 * pages are always mapped into the kernel's address space.
 		 * This is not the case for highmem pages, so the
 		 * optimization can not be made there.
 		 */
 		abd->abd_flags |= ABD_FLAG_LINEAR;
 		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
 		abd->abd_u.abd_linear.abd_sgl = table.sgl;
 		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
 	} else if (table.nents > 1) {
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
 		/* At least one chunk came from a different node. */
 		if (zones) {
 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 		}
 
 		ABD_SCATTER(abd).abd_sgl = table.sgl;
 		ABD_SCATTER(abd).abd_nents = table.nents;
 	}
 }
 #else
 
 /*
  * Allocate N individual pages to construct a scatter ABD.  This function
  * makes no attempt to request contiguous pages and requires the minimal
  * number of kernel interfaces.  It's designed for maximum compatibility.
  */
 void
 abd_alloc_chunks(abd_t *abd, size_t size)
 {
 	const gfp_t flags = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
 	int npages = abd_chunkcnt_for_bytes(size);
 	struct sg_table tbl;
 	struct scatterlist *ent = NULL;
 	int idx = 0;
 
 	/* Retry, one tick apart, until the scatterlist table exists. */
 	while (sg_alloc_table(&tbl, npages, flags) != 0) {
 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 		schedule_timeout_interruptible(1);
 	}
 	ASSERT3U(tbl.nents, ==, npages);
 
 	/* The table must be attached before abd_for_each_sg() walks it. */
 	ABD_SCATTER(abd).abd_nents = npages;
 	ABD_SCATTER(abd).abd_sgl = tbl.sgl;
 
 	/* Back every entry with its own order-0 page. */
 	abd_for_each_sg(abd, ent, npages, idx) {
 		struct page *pg = __page_cache_alloc(flags);
 
 		while (pg == NULL) {
 			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 			schedule_timeout_interruptible(1);
 			pg = __page_cache_alloc(flags);
 		}
 
 		sg_set_page(ent, pg, PAGESIZE, 0);
 		abd_mark_zfs_page(pg);
 		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
 	}
 
 	/* A single page needs no multi-chunk accounting. */
 	if (npages <= 1)
 		return;
 
 	abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 }
 #endif /* !CONFIG_HIGHMEM */
 
 /*
  * This must be called if any of the sg_table allocation functions
  * are called.
  */
 static void
 abd_free_sg_table(abd_t *abd)
 {
 	struct sg_table tbl;
 
 	/*
 	 * Rebuild a table descriptor from what the ABD stores; both entry
 	 * counts are set to the ABD's abd_nents.
 	 */
 	tbl.orig_nents = ABD_SCATTER(abd).abd_nents;
 	tbl.nents = tbl.orig_nents;
 	tbl.sgl = ABD_SCATTER(abd).abd_sgl;
 
 	sg_free_table(&tbl);
 }
 
 /*
  * Undo the multi-zone / multi-chunk accounting for a scatter ABD, free its
  * pages unless they were supplied through abd_alloc_from_pages(), and then
  * release its scatterlist table.
  */
 void
 abd_free_chunks(abd_t *abd)
 {
 	struct scatterlist *ent = NULL;
 	int nents = ABD_SCATTER(abd).abd_nents;
 	int idx = 0;
 
 	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
 		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
 
 	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
 		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
 
 	/*
 	 * Pages handed in through abd_alloc_from_pages() are not ours to
 	 * free; only the table is released for those.
 	 */
 	if (abd_is_from_pages(abd)) {
 		abd_free_sg_table(abd);
 		return;
 	}
 
 	abd_for_each_sg(abd, ent, nents, idx) {
 		struct page *pg = sg_page(ent);
 		int pg_order;
 
 		abd_unmark_zfs_page(pg);
 		pg_order = compound_order(pg);
 		__free_pages(pg, pg_order);
 		ASSERT3U(ent->length, <=, PAGE_SIZE << pg_order);
 		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[pg_order]);
 	}
 
 	abd_free_sg_table(abd);
 }
 
 /*
  * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
  * the scatterlist will be set to the zero'd out buffer abd_zero_page.
  */
 static void
 abd_alloc_zero_scatter(void)
 {
 	struct scatterlist *sg = NULL;
 	struct sg_table table;
 	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
 	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
 	int i = 0;
 
 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
 	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
 		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 		schedule_timeout_interruptible(1);
 	}
 	abd_mark_zfs_page(abd_zero_page);
 #else
 	abd_zero_page = ZERO_PAGE(0);
 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
 
 	while (sg_alloc_table(&table, nr_pages, gfp)) {
 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 		schedule_timeout_interruptible(1);
 	}
 	ASSERT3U(table.nents, ==, nr_pages);
 
 	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
 	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
 	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
 	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
 	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
 	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
 	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
 		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
 	}
 
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
 	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 }
 
 /*
  * Report whether an allocation of the given size should be linear: true
  * when scatter ABDs are disabled or size is below zfs_abd_scatter_min_size.
  */
 boolean_t
 abd_size_alloc_linear(size_t size)
 {
 	if (!zfs_abd_scatter_enabled)
 		return (B_TRUE);
 
 	return (size < zfs_abd_scatter_min_size);
 }
 
 /*
  * Add (ABDSTAT_INCR) or remove (ABDSTAT_DECR) one scatter ABD from the
  * scatter counters.  The bytes between abd_size and the next PAGESIZE
  * boundary are tracked as chunk waste and charged to / returned from the
  * ARC under ARC_SPACE_ABD_CHUNK_WASTE.
  */
 void
 abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
 {
 	int slack = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
 
 	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
 
 	if (op != ABDSTAT_INCR) {
 		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
 		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
 		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -slack);
 		arc_space_return(slack, ARC_SPACE_ABD_CHUNK_WASTE);
 		return;
 	}
 
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
 	ABDSTAT_INCR(abdstat_scatter_chunk_waste, slack);
 	arc_space_consume(slack, ARC_SPACE_ABD_CHUNK_WASTE);
 }
 
 /*
  * Add (ABDSTAT_INCR) or remove (ABDSTAT_DECR) one linear ABD of abd_size
  * bytes from the linear counters.
  */
 void
 abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
 {
 	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
 
 	if (op != ABDSTAT_INCR) {
 		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
 		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
 		return;
 	}
 
 	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
 	ABDSTAT_BUMP(abdstat_linear_cnt);
 }
 
 /*
  * Sanity-check a scatter ABD: it must have at least one entry and its
  * offset must fall inside the first entry.  ZFS_DEBUG builds also check
  * that every entry has a page attached.
  */
 void
 abd_verify_scatter(abd_t *abd)
 {
 	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
 	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
 	    ABD_SCATTER(abd).abd_sgl->length);
 
 #ifdef ZFS_DEBUG
 	size_t count = ABD_SCATTER(abd).abd_nents;
 	struct scatterlist *ent = NULL;
 	int idx = 0;
 
 	abd_for_each_sg(abd, ent, count, idx) {
 		ASSERT3P(sg_page(ent), !=, NULL);
 	}
 #endif
 }
 
 /*
  * Tear down abd_zero_scatter: reverse the counters bumped when it was set
  * up, free its scatterlist table and ABD structure, and free the zero page
  * when it was privately allocated (HAVE_ZERO_PAGE_GPL_ONLY).
  */
 static void
 abd_free_zero_scatter(void)
 {
 	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
 	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
 	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
 
 	abd_free_sg_table(abd_zero_scatter);
 	abd_free_struct(abd_zero_scatter);
 	abd_zero_scatter = NULL;
 
 	ASSERT3P(abd_zero_page, !=, NULL);
 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 	/* The page was allocated by us, so it is unmarked and freed here. */
 	abd_unmark_zfs_page(abd_zero_page);
 	__free_page(abd_zero_page);
 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
 }
 
 /*
  * kstat update callback: copy the current wmsum counter values from
  * abd_sums into the kstat data.  Writes are refused with EACCES.
  */
 static int
 abd_kstats_update(kstat_t *ksp, int rw)
 {
 	abd_stats_t *stats = ksp->ks_data;
 	int order;
 
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 	/* ABD structure and linear buffer counters. */
 	stats->abdstat_struct_size.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_struct_size);
 	stats->abdstat_linear_cnt.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_linear_cnt);
 	stats->abdstat_linear_data_size.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_linear_data_size);
 
 	/* Scatter ABD counters. */
 	stats->abdstat_scatter_cnt.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
 	stats->abdstat_scatter_data_size.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
 	stats->abdstat_scatter_chunk_waste.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
 	stats->abdstat_scatter_page_multi_chunk.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
 	stats->abdstat_scatter_page_multi_zone.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
 
 	/* Allocation retry counters. */
 	stats->abdstat_scatter_page_alloc_retry.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
 	stats->abdstat_scatter_sg_table_retry.value.ui64 =
 	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
 
 	/* One counter per allocation order. */
 	for (order = 0; order < ABD_MAX_ORDER; order++) {
 		stats->abdstat_scatter_orders[order].value.ui64 =
 		    wmsum_value(&abd_sums.abdstat_scatter_orders[order]);
 	}
 
 	return (0);
 }
 
 /*
  * Set up the ABD subsystem: create the abd_t cache, initialise every wmsum
  * counter, publish the "abdstats" kstat when it can be created, and build
  * the shared zero scatter ABD.
  */
 void
 abd_init(void)
 {
 	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
 	    0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE);
 
 	wmsum_init(&abd_sums.abdstat_struct_size, 0);
 	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
 	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
 	for (int order = 0; order < ABD_MAX_ORDER; order++)
 		wmsum_init(&abd_sums.abdstat_scatter_orders[order], 0);
 	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);
 
 	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (abd_ksp != NULL) {
 		/* The per-order entries are named and typed at run time. */
 		for (int order = 0; order < ABD_MAX_ORDER; order++) {
 			abd_stats.abdstat_scatter_orders[order].data_type =
 			    KSTAT_DATA_UINT64;
 			snprintf(abd_stats.abdstat_scatter_orders[order].name,
 			    KSTAT_STRLEN, "scatter_order_%d", order);
 		}
 		abd_ksp->ks_update = abd_kstats_update;
 		abd_ksp->ks_data = &abd_stats;
 		kstat_install(abd_ksp);
 	}
 
 	abd_alloc_zero_scatter();
 }
 
 /*
  * Tear down what abd_init() set up: the zero scatter ABD, the kstat (if it
  * was created), every wmsum counter and finally the abd_t cache.
  */
 void
 abd_fini(void)
 {
 	abd_free_zero_scatter();
 
 	if (abd_ksp != NULL) {
 		kstat_delete(abd_ksp);
 		abd_ksp = NULL;
 	}
 
 	wmsum_fini(&abd_sums.abdstat_struct_size);
 	wmsum_fini(&abd_sums.abdstat_linear_cnt);
 	wmsum_fini(&abd_sums.abdstat_linear_data_size);
 	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
 	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
 	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
 	for (int order = 0; order < ABD_MAX_ORDER; order++)
 		wmsum_fini(&abd_sums.abdstat_scatter_orders[order]);
 	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
 	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
 	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
 	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);
 
 	if (abd_cache != NULL) {
 		kmem_cache_destroy(abd_cache);
 		abd_cache = NULL;
 	}
 }
 
 /*
  * Free a linear ABD that is backed by a single scatterlist entry
  * (ABD_FLAG_LINEAR_PAGE) by converting it back to a one-entry scatter ABD
  * and handing it to abd_free_chunks().
  */
 void
 abd_free_linear_page(abd_t *abd)
 {
 	/* Save the entry first; the scatter fields are rewritten below. */
 	struct scatterlist *sgl = abd->abd_u.abd_linear.abd_sgl;
 
 	/*
 	 * A user page only has to be unmapped; pages allocated by us are
 	 * removed from the scatter statistics instead.
 	 */
 	if (!abd_is_from_pages(abd))
 		abd_update_scatter_stats(abd, ABDSTAT_DECR);
 	else
 		zfs_kunmap(sg_page(sgl));
 
 	abd->abd_flags &= ~(ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE);
 	ABD_SCATTER(abd).abd_sgl = sgl;
 	ABD_SCATTER(abd).abd_offset = 0;
 	ABD_SCATTER(abd).abd_nents = 1;
 
 	abd_free_chunks(abd);
 }
 
 /*
  * Allocate a scatter ABD structure from user pages. The pages must be
  * pinned with get_user_pages, or similar, but need not be mapped via
  * the kmap interfaces.
  */
 abd_t *
 abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
 {
 	uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
 	struct sg_table table;
 
 	VERIFY3U(size, <=, DMU_MAX_ACCESS);
 	ASSERT3U(offset, <, PAGE_SIZE);
 	ASSERT3P(pages, !=, NULL);
 
 	/*
 	 * ABD_FLAG_META is only tracked for buffers whose underlying data we
 	 * own.  That is not the case for caller-supplied pages, so the flag
 	 * is never set here, even for filesystem metadata.
 	 */
 	abd_t *abd = abd_alloc_struct(0);
 	abd->abd_size = size;
 	abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
 
 	/* Retry, one tick apart, until the scatterlist table exists. */
 	while (sg_alloc_table_from_pages(&table, pages, npages, offset,
 	    size, __GFP_NOWARN | GFP_NOIO) != 0) {
 		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 		schedule_timeout_interruptible(1);
 	}
 
 	if ((offset + size) > PAGE_SIZE) {
 		/* The data spans more than one page: keep it as scatter. */
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 
 		ABD_SCATTER(abd).abd_offset = offset;
 		ABD_SCATTER(abd).abd_sgl = table.sgl;
 		ABD_SCATTER(abd).abd_nents = table.nents;
 
 		ASSERT0(ABD_SCATTER(abd).abd_offset);
 
 		return (abd);
 	}
 
 	/*
 	 * Since there is only one entry, this ABD can be represented as a
 	 * linear buffer.  All single-page (4K) ABD's constructed from a user
 	 * page can be represented this way as long as the page is mapped to
 	 * a virtual address.  This allows us to apply an offset into the
 	 * mapped page.
 	 *
 	 * Note that kmap() must be used, not kmap_atomic(), because the
 	 * mapping needs to be set up on all CPUs.  Using kmap() also enables
 	 * the use of highmem pages when required.
 	 */
 	abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
 	abd->abd_u.abd_linear.abd_sgl = table.sgl;
 	zfs_kmap(sg_page(table.sgl));
 	ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
 
 	return (abd);
 }
 
 /*
  * If we're going to use this ABD for doing I/O using the block layer, the
  * consumer of the ABD data doesn't care if it's scattered or not, and we don't
  * plan to store this ABD in memory for a long period of time, we should
  * allocate the ABD type that requires the least data copying to do the I/O.
  *
  * On Linux the optimal thing to do would be to use abd_get_offset() and
  * construct a new ABD which shares the original pages thereby eliminating
  * the copy.  But for the moment a new linear ABD is allocated until this
  * performance optimization can be implemented.
  */
 abd_t *
 abd_alloc_for_io(size_t size, boolean_t is_metadata)
 {
 	/* For now this simply forwards to abd_alloc(). */
 	abd_t *abd = abd_alloc(size, is_metadata);
 
 	return (abd);
 }
 
 /*
  * Make abd (allocated here when NULL) refer to the scatterlist of sabd,
  * starting off bytes into sabd's data.  The size argument is not used.
  * Returns the ABD that was filled in.
  */
 abd_t *
 abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
     size_t size)
 {
 	(void) size;
 	struct scatterlist *ent = NULL;
 	int skipped = 0;
 
 	abd_verify(sabd);
 	ASSERT3U(off, <=, sabd->abd_size);
 
 	/* Byte offset counted from the start of sabd's first entry. */
 	size_t skip = ABD_SCATTER(sabd).abd_offset + off;
 
 	if (abd == NULL)
 		abd = abd_alloc_struct(0);
 
 	/*
 	 * ABD_FLAG_META is only tracked when we own the underlying data
 	 * buffer, which is not the case here, so it is never set.
 	 */
 
 	/* Drop whole entries that lie entirely before the offset. */
 	abd_for_each_sg(sabd, ent, ABD_SCATTER(sabd).abd_nents, skipped) {
 		if (skip >= ent->length) {
 			skip -= ent->length;
 			continue;
 		}
 		break;
 	}
 
 	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - skipped;
 	ABD_SCATTER(abd).abd_offset = skip;
 	ABD_SCATTER(abd).abd_sgl = ent;
 
 	/* The new ABD inherits the from-pages property of its source. */
 	if (abd_is_from_pages(sabd))
 		abd->abd_flags |= ABD_FLAG_FROM_PAGES;
 
 	return (abd);
 }
 
 /*
  * Initialize the abd_iter.
  */
 void
 abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 {
 	ASSERT(!abd_is_gang(abd));
 	abd_verify(abd);
 
 	/* Start from an all-zero iterator that points at abd. */
 	memset(aiter, 0, sizeof (*aiter));
 	aiter->iter_abd = abd;
 
 	/* A linear ABD needs no scatterlist cursor. */
 	if (abd_is_linear(abd))
 		return;
 
 	aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
 	aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
 }
 
 /*
  * This is just a helper function to see if we have exhausted the
  * abd_iter and reached the end.
  */
 boolean_t
 abd_iter_at_end(struct abd_iter *aiter)
 {
 	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
 	return (aiter->iter_pos == aiter->iter_abd->abd_size);
 }
 
 /*
  * Advance the iterator by a certain amount. Cannot be called when a chunk is
  * in use. This can be safely called when the aiter has already exhausted, in
  * which case this does nothing.
  */
 void
 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 {
 	/*
 	 * No chunk may be mapped or yielded while advancing; the
 	 * abd_iterate_*() callers have to clear that state first (directly
 	 * or through abd_iter_unmap()).
 	 */
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
 	ASSERT3P(aiter->iter_page, ==, NULL);
 	ASSERT0(aiter->iter_page_doff);
 	ASSERT0(aiter->iter_page_dsize);
 
 	/* Already exhausted: there is nowhere to advance to. */
 	if (abd_iter_at_end(aiter))
 		return;
 
 	aiter->iter_offset += amount;
 	aiter->iter_pos += amount;
 
 	/* Linear ABDs have no scatterlist cursor to move. */
 	if (abd_is_linear(aiter->iter_abd))
 		return;
 
 	/* Step over every entry the new offset has moved past. */
 	for (;;) {
 		struct scatterlist *cur = aiter->iter_sg;
 
 		if (aiter->iter_offset < cur->length)
 			break;
 
 		aiter->iter_offset -= cur->length;
 		aiter->iter_sg = sg_next(cur);
 		if (aiter->iter_sg == NULL) {
 			/* Ran off the end; nothing may be left over. */
 			ASSERT0(aiter->iter_offset);
 			break;
 		}
 	}
 }
 
 /*
  * Map the current chunk into aiter. This can be safely called when the aiter
  * has already exhausted, in which case this does nothing.
  */
 void
 abd_iter_map(struct abd_iter *aiter)
 {
 	void *base;
 	size_t off;
 
 	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 	ASSERT0(aiter->iter_mapsize);
 
 	/* Already exhausted: there is nothing to map. */
 	if (abd_iter_at_end(aiter))
 		return;
 
 	off = aiter->iter_offset;
 
 	if (!abd_is_linear(aiter->iter_abd)) {
 		/*
 		 * Limited by what is left of the current entry and by what
 		 * is left of the ABD.
 		 */
 		aiter->iter_mapsize = MIN(aiter->iter_sg->length - off,
 		    aiter->iter_abd->abd_size - aiter->iter_pos);
 
 		base = zfs_kmap_local(sg_page(aiter->iter_sg));
 	} else {
 		/* Everything from the current offset to the end. */
 		ASSERT3U(aiter->iter_pos, ==, off);
 		aiter->iter_mapsize = aiter->iter_abd->abd_size - off;
 		base = ABD_LINEAR_BUF(aiter->iter_abd);
 	}
 
 	aiter->iter_mapaddr = (char *)base + off;
 }
 
 /*
  * Unmap the current chunk from aiter. This can be safely called when the aiter
  * has already exhausted, in which case this does nothing.
  */
 void
 abd_iter_unmap(struct abd_iter *aiter)
 {
 	/* There's nothing left to unmap, so do nothing */
 	if (abd_iter_at_end(aiter))
 		return;
 
 	if (!abd_is_linear(aiter->iter_abd)) {
 		/*
 		 * abd_iter_map() stored the mapped address plus iter_offset
 		 * in iter_mapaddr, so subtracting iter_offset gives back the
 		 * address returned by zfs_kmap_local().
 		 */
 		/* LINTED E_FUNC_SET_NOT_USED */
 		zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
 	}
 
 	/* A chunk must actually have been mapped before unmapping it. */
 	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
 	ASSERT3U(aiter->iter_mapsize, >, 0);
 
 	/* Clear the mapping state so the iterator can be advanced. */
 	aiter->iter_mapaddr = NULL;
 	aiter->iter_mapsize = 0;
 }
 
 /*
  * Intentionally a no-op here: the function body is empty.
  */
 void
 abd_cache_reap_now(void)
 {
 }
 
 /*
  * Borrow a raw buffer from an ABD without copying the contents of the ABD
  * into the buffer. If the ABD is scattered, this will allocate a raw buffer
  * whose contents are undefined. To copy over the existing data in the ABD, use
  * abd_borrow_buf_copy() instead.
  */
 void *
 abd_borrow_buf(abd_t *abd, size_t n)
 {
 	void *buf;
 
 	abd_verify(abd);
 	/*
 	 * The borrowed length must fit inside the ABD.  abd_return_buf()
 	 * asserts the same bound for the matching return.
 	 */
 	ASSERT3U(abd->abd_size, >=, n);
 
 	/*
 	 * A linear ABD can hand out its own buffer directly.  Everything
 	 * else gets a freshly allocated buffer of n bytes, whose contents
 	 * are undefined.
 	 *
 	 * In the event the ABD is composed of a single user page from Direct
 	 * I/O we can not directly return the raw buffer.  This is a
 	 * consequence of not being able to write protect the page and the
 	 * contents of the page can be changed at any time by the user.
 	 */
 	if (abd_is_linear(abd) && !abd_is_from_pages(abd))
 		buf = abd_to_buf(abd);
 	else
 		buf = zio_buf_alloc(n);
 
 #ifdef ZFS_DEBUG
 	/* Track the outstanding borrow, keyed by the buffer address. */
 	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
 #endif
 	return (buf);
 }
 
 /*
  * Like abd_borrow_buf(), but when the borrowed buffer is not the ABD's own
  * linear buffer the first n bytes of the ABD are copied into it.
  */
 void *
 abd_borrow_buf_copy(abd_t *abd, size_t n)
 {
 	void *buf = abd_borrow_buf(abd, n);
 
 	/* A plain linear ABD lends its own buffer: nothing to copy. */
 	if (abd_is_linear(abd) && !abd_is_from_pages(abd))
 		return (buf);
 
 	/*
 	 * This also covers an ABD made of a single user page from Direct
 	 * I/O.  The page can not be write protected and the user may change
 	 * its contents at any moment, so the data has to be copied into the
 	 * newly allocated buffer.
 	 */
 	abd_copy_to_buf(buf, abd, n);
 
 	return (buf);
 }
 
 /*
  * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
  * not change the contents of the ABD. If you want any changes you made to
  * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
  * ABD is not constructed from user pages for Direct I/O then an ASSERT
  * checks to make sure the contents of buffer have not changed since it was
  * borrowed. We can not ASSERT that the contents of the buffer have not changed
  * if it is composed of user pages because the pages can not be placed under
  * write protection and the user could have possibly changed the contents in
  * the pages at any time. This is also an issue for Direct I/O reads. Checksum
  * verifications in the ZIO pipeline check for this issue and handle it by
  * returning an error on checksum verification failure.
  */
 void
 abd_return_buf(abd_t *abd, void *buf, size_t n)
 {
 	abd_verify(abd);
 	ASSERT3U(abd->abd_size, >=, n);
 #ifdef ZFS_DEBUG
 	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
 #endif
 
 	/* User pages may change at any time: free without comparing. */
 	if (abd_is_from_pages(abd)) {
 		zio_buf_free(buf, n);
 		return;
 	}
 
 	/* A linear ABD lent out its own buffer, so nothing is freed. */
 	if (abd_is_linear(abd)) {
 		ASSERT3P(buf, ==, abd_to_buf(abd));
 		return;
 	}
 
 	/* Plain scatter ABD: the buffer must still match the ABD. */
 	if (!abd_is_gang(abd)) {
 		ASSERT0(abd_cmp_buf(abd, buf, n));
 		zio_buf_free(buf, n);
 		return;
 	}
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Gang ABD: children built from Direct I/O user pages must not be
 	 * compared, so walk the chain and verify only the other children,
 	 * each against its own slice of the buffer.
 	 */
 	void *slice = buf;
 	abd_t *child = list_head(&ABD_GANG(abd).abd_gang_chain);
 
 	while (child != NULL) {
 		if (!abd_is_from_pages(child)) {
 			ASSERT0(abd_cmp_buf(child, slice,
 			    child->abd_size));
 		}
 		slice = (char *)slice + child->abd_size;
 		child = list_next(&ABD_GANG(abd).abd_gang_chain, child);
 	}
 #endif
 	zio_buf_free(buf, n);
 }
 
 /*
  * Like abd_return_buf(), but the first n bytes of buf are copied back into
  * the ABD first unless the ABD is a plain linear ABD (linear and not built
  * from user pages).
  */
 void
 abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
 {
 	if (abd_is_linear(abd) && !abd_is_from_pages(abd)) {
 		abd_return_buf(abd, buf, n);
 		return;
 	}
 
 	abd_copy_from_buf(abd, buf, n);
 	abd_return_buf(abd, buf, n);
 }
 
 /*
  * This is abd_iter_page(), the function underneath abd_iterate_page_func().
  * It yields the next page struct and data offset and size within it, without
  * mapping it into the address space.
  */
 
 /*
  * "Compound pages" are a group of pages that can be referenced from a single
  * struct page *. It's organised as a "head" page, followed by a series of
  * "tail" pages.
  *
  * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
  * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
  * great many of the IO buffers we get are going to be of this type.
  *
  * The tail pages are just regular PAGESIZE pages, and can be safely used
  * as-is. However, the head page has length covering itself and all the tail
  * pages. If the ABD chunk spans multiple pages, then we can use the head page
  * and a >PAGESIZE length, which is far more efficient.
  *
  * Before kernel 4.5 however, compound page heads were refcounted separately
  * from tail pages, such that moving back to the head page would require us to
  * take a reference to it and releasing it once we're completely finished with
  * it. In practice, that meant when our caller is done with the ABD, which we
  * have no insight into from here. Rather than contort this API to track head
  * page references on such ancient kernels, we disabled this special compound
  * page handling on kernels before 4.5, instead just treating each page
  * within it as a regular PAGESIZE page (which it is). This is slightly less
  * efficient, but makes everything far simpler.
  *
  * We no longer support kernels before 4.5, so in theory none of this is
  * necessary. However, this code is still relatively new in the grand scheme of
  * things, so I'm leaving the ability to compile this out for the moment.
  *
  * Setting/clearing ABD_ITER_COMPOUND_PAGES below enables/disables the special
  * handling, by defining the ABD_ITER_PAGE_SIZE(page) macro to understand
  * compound pages, or not, and compiling in/out the support to detect compound
  * tail pages and move back to the start.
  */
 
 /* On by default */
 #define	ABD_ITER_COMPOUND_PAGES
 
 /*
  * ABD_ITER_PAGE_SIZE(page): number of bytes reachable through page.  With
  * compound page support compiled in this is page_size(page) for a compound
  * page and PAGESIZE otherwise; without it, it is always PAGESIZE.  Note
  * that the first form evaluates its argument twice.
  */
 #ifdef ABD_ITER_COMPOUND_PAGES
 #define	ABD_ITER_PAGE_SIZE(page)	\
 	(PageCompound(page) ? page_size(page) : PAGESIZE)
 #else
 #define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
 #endif
 
 void
 abd_iter_page(struct abd_iter *aiter)
 {
 	if (abd_iter_at_end(aiter)) {
 		aiter->iter_page = NULL;
 		aiter->iter_page_doff = 0;
 		aiter->iter_page_dsize = 0;
 		return;
 	}
 
 	struct page *page;
 	size_t doff, dsize;
 
 	/*
 	 * Find the page, and the start of the data within it. This is computed
 	 * differently for linear and scatter ABDs; linear is referenced by
 	 * virtual memory location, while scatter is referenced by page
 	 * pointer.
 	 */
 	if (abd_is_linear(aiter->iter_abd)) {
 		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
 
 		/* memory address at iter_pos */
 		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
 
 		/* struct page for address */
 		page = is_vmalloc_addr(paddr) ?
 		    vmalloc_to_page(paddr) : virt_to_page(paddr);
 
 		/* offset of address within the page */
 		doff = offset_in_page(paddr);
 	} else {
 		ASSERT(!abd_is_gang(aiter->iter_abd));
 
 		/* current scatter page */
 		page = nth_page(sg_page(aiter->iter_sg),
 		    aiter->iter_offset >> PAGE_SHIFT);
 
 		/* position within page */
 		doff = aiter->iter_offset & (PAGESIZE - 1);
 	}
 
 #ifdef ABD_ITER_COMPOUND_PAGES
 	if (PageTail(page)) {
 		/*
 		 * If this is a compound tail page, move back to the head, and
 		 * adjust the offset to match. This may let us yield a much
 		 * larger amount of data from a single logical page, and so
 		 * leave our caller with fewer pages to process.
 		 */
 		struct page *head = compound_head(page);
 		doff += ((page - head) * PAGESIZE);
 		page = head;
 	}
 #endif
 
 	ASSERT(page);
 
 	/*
 	 * Compute the maximum amount of data we can take from this page. This
 	 * is the smaller of:
 	 * - the remaining space in the page
 	 * - the remaining space in this scatterlist entry (which may not cover
 	 *   the entire page)
 	 * - the remaining space in the abd (which may not cover the entire
 	 *   scatterlist entry)
 	 */
 	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
 	    aiter->iter_abd->abd_size - aiter->iter_pos);
 	if (!abd_is_linear(aiter->iter_abd))
 		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
 	ASSERT3U(dsize, >, 0);
 
 	/* final iterator outputs */
 	aiter->iter_page = page;
 	aiter->iter_page_doff = doff;
 	aiter->iter_page_dsize = dsize;
 }
 
 /*
  * Note: ABD BIO functions only needed to support vdev_classic. See comments in
  * vdev_disk.c.
  */
 
 /*
  * bio_nr_pages for ABD.
  * @off is the offset in @abd
  */
 unsigned long
 abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
 {
 	if (!abd_is_gang(abd)) {
 		unsigned long start;
 
 		/*
 		 * Linear ABDs are measured from the buffer's virtual
 		 * address, scatter ABDs from the offset into their
 		 * scatterlist.
 		 */
 		if (abd_is_linear(abd))
 			start = (unsigned long)abd_to_buf(abd) + off;
 		else
 			start = ABD_SCATTER(abd).abd_offset + off;
 
 		/* Pages touched by the byte range [start, start + size). */
 		return (((start + size + PAGESIZE - 1) >> PAGE_SHIFT) -
 		    (start >> PAGE_SHIFT));
 	}
 
 	/* Gang ABD: add up the counts of the children the range covers. */
 	unsigned long total = 0;
 	abd_t *child = abd_gang_get_offset(abd, &off);
 
 	while (child != NULL && size != 0) {
 		ASSERT3U(off, <, child->abd_size);
 		int span = MIN(size, child->abd_size - off);
 		total += abd_nr_pages_off(child, span, off);
 		size -= span;
 		/* Only the first child is entered part-way through. */
 		off = 0;
 		child = list_next(&ABD_GANG(abd).abd_gang_chain, child);
 	}
 
 	return (total);
 }
 
 /*
  * Add the pages behind the virtual buffer buf_ptr to bio, at most one page
  * per bio vector, until bio_size bytes are mapped, the bio has no vectors
  * left, or bio_add_page() accepts less than requested.  Returns the number
  * of bytes that were not mapped.
  */
 static unsigned int
 bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
 {
 	unsigned int pgoff = offset_in_page(buf_ptr);
 	unsigned int vec;
 
 	for (vec = 0; vec < bio->bi_max_vecs && bio_size != 0; vec++) {
 		struct page *pg;
 		unsigned int len = PAGE_SIZE - pgoff;
 
 		/* Never map beyond the bytes that are still wanted. */
 		if (len > bio_size)
 			len = bio_size;
 
 		pg = is_vmalloc_addr(buf_ptr) ?
 		    vmalloc_to_page(buf_ptr) : virt_to_page(buf_ptr);
 
 		/*
 		 * Some network related block devices use tcp_sendpage,
 		 * which doesn't behave well with a 0-count page; this is a
 		 * safety net to catch them.
 		 */
 		ASSERT3S(page_count(pg), >, 0);
 
 		if (bio_add_page(bio, pg, len, pgoff) != len)
 			break;
 
 		buf_ptr = (char *)buf_ptr + len;
 		bio_size -= len;
 		/* Every page after the first starts at its beginning. */
 		pgoff = 0;
 	}
 
 	return (bio_size);
 }
 
 /*
  * bio_map for gang ABD.
  */
 static unsigned int
 abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
     unsigned int io_size, size_t off)
 {
 	abd_t *child;
 
 	ASSERT(abd_is_gang(abd));
 
 	child = abd_gang_get_offset(abd, &off);
 	while (child != NULL) {
 		ASSERT3U(off, <, child->abd_size);
 		int want = MIN(io_size, child->abd_size - off);
 		int left = abd_bio_map_off(bio, child, want, off);
 
 		/* Deduct only what was actually mapped from this child. */
 		io_size -= (want - left);
 
 		/* Stop when done or when the bio took less than asked. */
 		if (io_size == 0 || left > 0)
 			return (io_size);
 
 		/* Only the first child is entered part-way through. */
 		off = 0;
 		child = list_next(&ABD_GANG(abd).abd_gang_chain, child);
 	}
 
 	ASSERT0(io_size);
 	return (io_size);
 }
 
 /*
  * bio_map for ABD.
  * @off is the offset in @abd
  * Remaining IO size is returned
  */
 unsigned int
 abd_bio_map_off(struct bio *bio, abd_t *abd,
     unsigned int io_size, size_t off)
 {
 	struct abd_iter aiter;
 
 	ASSERT3U(io_size, <=, abd->abd_size - off);
 
 	/* Linear and gang ABDs each have their own mapping routine. */
 	if (abd_is_linear(abd))
 		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
 
 	ASSERT(!abd_is_linear(abd));
 	if (abd_is_gang(abd))
 		return (abd_gang_bio_map_off(bio, abd, io_size, off));
 
 	/* Scatter ABD: walk it from off, adding one page per bio vector. */
 	abd_iter_init(&aiter, abd);
 	abd_iter_advance(&aiter, off);
 
 	for (int vec = 0; vec < bio->bi_max_vecs && io_size != 0; vec++) {
 		struct scatterlist *ent = aiter.iter_sg;
 		size_t ent_off = aiter.iter_offset;
 		size_t pg_off = ent_off & (PAGESIZE - 1);
 		size_t nbytes = MIN(io_size, PAGESIZE - pg_off);
 		struct page *pg;
 
 		ASSERT(nbytes > 0);
 
 		/* The page of this entry that holds the current offset. */
 		pg = nth_page(sg_page(ent), ent_off >> PAGE_SHIFT);
 		if (bio_add_page(bio, pg, nbytes, pg_off) != nbytes)
 			break;
 
 		io_size -= nbytes;
 		abd_iter_advance(&aiter, nbytes);
 	}
 
 	return (io_size);
 }
 
 /* Tunable Parameters */
 module_param(zfs_abd_scatter_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_abd_scatter_enabled,
 	"Toggle whether ABD allocations must be linear.");
 module_param(zfs_abd_scatter_min_size, int, 0644);
 MODULE_PARM_DESC(zfs_abd_scatter_min_size,
 	"Minimum size of scatter allocations.");
-/* CSTYLED */
 module_param(zfs_abd_scatter_max_order, uint, 0644);
 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
 	"Maximum order allocation used for a scatter ABD.");
diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c
index a017900d5538..7d01f8f373b2 100644
--- a/module/os/linux/zfs/zfs_debug.c
+++ b/module/os/linux/zfs/zfs_debug.c
@@ -1,220 +1,218 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/trace_zfs.h>
 
 /*
  * One entry of the ZFS debug message log.  Entries are allocated by
  * __zfs_dbgmsg() with room for the message text after the fixed fields.
  */
 typedef struct zfs_dbgmsg {
 	/* linkage used by the zfs_dbgmsgs procfs list */
 	procfs_list_node_t	zdm_node;
 	/* gethrestime_sec() value taken when the entry was created */
 	uint64_t		zdm_timestamp;
 	/* total allocation size in bytes; passed back to kmem_free() */
 	uint_t			zdm_size;
 	char			zdm_msg[]; /* variable length allocation */
 } zfs_dbgmsg_t;
 
 /* The list of queued debug messages; its pl_lock guards additions. */
 static procfs_list_t zfs_dbgmsgs;
 /* Sum of zdm_size over all entries currently queued. */
 static uint_t zfs_dbgmsg_size = 0;
 /* Limit that __zfs_dbgmsg() purges the log down to after each add. */
 static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
 
 /*
  * Internal ZFS debug messages are enabled by default.
  *
  * # Print debug messages
  * cat /proc/spl/kstat/zfs/dbgmsg
  *
  * # Disable the kernel debug message log.
  * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
  *
  * # Clear the kernel debug message log.
  * echo 0 >/proc/spl/kstat/zfs/dbgmsg
  */
 int zfs_dbgmsg_enable = B_TRUE;
 
 /*
  * Header callback handed to procfs_list_install(): prints the two
  * column titles for the debug message listing.  Always returns 0.
  */
 static int
 zfs_dbgmsg_show_header(struct seq_file *f)
 {
 	seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
 	return (0);
 }
 
 static int
 zfs_dbgmsg_show(struct seq_file *f, void *p)
 {
 	zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
 	seq_printf(f, "%-12llu %-s\n",
 	    (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
 	return (0);
 }
 
 /*
  * Remove and free entries from the head of the message list until the
  * accounted log size is no larger than max_size, or the list is empty.
  */
 static void
 zfs_dbgmsg_purge(uint_t max_size)
 {
 	for (;;) {
 		zfs_dbgmsg_t *head;
 		uint_t freed;
 
 		if (zfs_dbgmsg_size <= max_size)
 			break;
 
 		head = list_remove_head(&zfs_dbgmsgs.pl_list);
 		if (head == NULL)
 			break;
 
 		/* Read the size first; the entry is gone after kmem_free(). */
 		freed = head->zdm_size;
 		kmem_free(head, freed);
 		zfs_dbgmsg_size -= freed;
 	}
 }
 
 /*
  * Clear callback handed to procfs_list_install(): frees every queued
  * message while holding the list lock.  The procfs_list argument is
  * unused because the log is the file-scope zfs_dbgmsgs.  Always
  * returns 0.
  */
 static int
 zfs_dbgmsg_clear(procfs_list_t *procfs_list)
 {
 	(void) procfs_list;
 	mutex_enter(&zfs_dbgmsgs.pl_lock);
 	zfs_dbgmsg_purge(0);
 	mutex_exit(&zfs_dbgmsgs.pl_lock);
 	return (0);
 }
 
 /*
  * Install the debug message log as a procfs list, wiring up the show,
  * header and clear callbacks defined above.
  *
  * NOTE(review): the leading arguments look like module ("zfs"),
  * submodule (NULL), name ("dbgmsg") and mode (0600) - confirm against
  * the procfs_list_install() prototype.
  */
 void
 zfs_dbgmsg_init(void)
 {
 	procfs_list_install("zfs",
 	    NULL,
 	    "dbgmsg",
 	    0600,
 	    &zfs_dbgmsgs,
 	    zfs_dbgmsg_show,
 	    zfs_dbgmsg_show_header,
 	    zfs_dbgmsg_clear,
 	    offsetof(zfs_dbgmsg_t, zdm_node));
 }
 
 /*
  * Tear down the debug message log: uninstall the procfs list, free all
  * queued messages, then destroy the list.
  */
 void
 zfs_dbgmsg_fini(void)
 {
 	procfs_list_uninstall(&zfs_dbgmsgs);
 	/*
 	 * Unlike zfs_dbgmsg_clear(), pl_lock is not taken around this purge.
 	 * NOTE(review): presumably safe because the list was just
 	 * uninstalled - confirm no writer can still be in __zfs_dbgmsg().
 	 */
 	zfs_dbgmsg_purge(0);
 
 	procfs_list_destroy(&zfs_dbgmsgs);
 }
 
 void
 __set_error(const char *file, const char *func, int line, int err)
 {
 	/*
 	 * To enable this:
 	 *
 	 * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
 	 */
 	if (zfs_flags & ZFS_DEBUG_SET_ERROR)
 		__dprintf(B_FALSE, file, func, line, "error %lu",
 		    (ulong_t)err);
 }
 
 void
 __zfs_dbgmsg(char *buf)
 {
 	uint_t size = sizeof (zfs_dbgmsg_t) + strlen(buf) + 1;
 	zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
 	zdm->zdm_size = size;
 	zdm->zdm_timestamp = gethrestime_sec();
 	strcpy(zdm->zdm_msg, buf);
 
 	mutex_enter(&zfs_dbgmsgs.pl_lock);
 	procfs_list_add(&zfs_dbgmsgs, zdm);
 	zfs_dbgmsg_size += size;
 	zfs_dbgmsg_purge(zfs_dbgmsg_maxsize);
 	mutex_exit(&zfs_dbgmsgs.pl_lock);
 }
 
 /*
  * Format a debug message and publish it twice: through the
  * zfs__dprintf trace point and through the debug message log.
  *
  * The text is "<curthread> <prefix><file>:<line>:<func>(): " followed by
  * the caller's formatted message, built in a 1024 byte scratch buffer.
  * When dprint is set the prefix is "dprintf: " and one trailing newline
  * is stripped.
  */
 void
 __dprintf(boolean_t dprint, const char *file, const char *func,
     int line, const char *fmt, ...)
 {
 	size_t size = 1024;
 	char *buf = kmem_alloc(size, KM_SLEEP);
 	char *prefix = (dprint) ? "dprintf: " : "";
 	const char *newfile;
 	const char *slash;
 	va_list adx;
 	int i;
 
 	/*
 	 * Get rid of annoying prefix to filename: keep only what follows
 	 * the last '/', or the whole name when there is none.
 	 */
 	slash = strrchr(file, '/');
 	newfile = (slash != NULL) ? slash + 1 : file;
 
 	i = snprintf(buf, size, "%px %s%s:%d:%s(): ",
 	    curthread, prefix, newfile, line, func);
 
 	/* Append the caller's message only if the header left room. */
 	if (i < size) {
 		va_start(adx, fmt);
 		(void) vsnprintf(buf + i, size - i, fmt, adx);
 		va_end(adx);
 	}
 
 	/*
 	 * Get rid of trailing newline for dprintf logs.
 	 */
 	if (dprint && buf[0] != '\0') {
 		size_t last = strlen(buf) - 1;
 
 		if (buf[last] == '\n')
 			buf[last] = '\0';
 	}
 
 	/*
 	 * To get this data enable the zfs__dprintf trace point as shown:
 	 *
 	 * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
 	 * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
 	 * $ echo 0 > /sys/kernel/debug/tracing/trace
 	 *
 	 * # Dump the ring buffer.
 	 * $ cat /sys/kernel/debug/tracing/trace
 	 */
 	DTRACE_PROBE1(zfs__dprintf, char *, buf);
 
 	/*
 	 * To get this data:
 	 *
 	 * $ cat /proc/spl/kstat/zfs/dbgmsg
 	 *
 	 * To clear the buffer:
 	 * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
 	 */
 	__zfs_dbgmsg(buf);
 
 	kmem_free(buf, size);
 }
 
 module_param(zfs_dbgmsg_enable, int, 0644);
 MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
 
-/* BEGIN CSTYLED */
 module_param(zfs_dbgmsg_maxsize, uint, 0644);
-/* END CSTYLED */
 MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index dd9fd760b9c2..a882c88a7a72 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -1,4351 +1,4350 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/sysmacros.h>
 #include <sys/vfs.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/vmsystm.h>
 #include <sys/atomic.h>
 #include <sys/pathname.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_quota.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_rlock.h>
 #include <sys/cred.h>
 #include <sys/zpl.h>
 #include <sys/zil.h>
 #include <sys/sa_impl.h>
 #include <linux/mm_compat.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using zfs_enter(zfsvfs).
  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
  *      must be checked with zfs_verify_zp(zp).  Both of these macros
  *      can return EIO from the calling function.
  *
  *  (2) zrele() should always be the last thing except for zil_commit() (if
  *	necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
  *	last reference, the vnode/znode can be freed, so the zp may point to
  *	freed memory.  Second, the last reference will call zfs_zinactive(),
  *	which may induce a lot of work -- pushing cached pages (which acquires
  *	range locks) and syncing out cached atime changes.  Third,
  *	zfs_zinactive() may require a new tx, which could deadlock the system
  *	if you were already holding one. This deadlock occurs because the tx
  *	currently being operated on prevents a txg from syncing, which
  *	prevents the new tx from progressing, resulting in a deadlock.  If you
  *	must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
  *	is a synonym for zrele().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	zfs_enter(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		zrele(...);		// release held znodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		zfs_exit(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	zrele(...);			// release held znodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	zfs_exit(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 /*
  * Open hook for a ZFS inode.  Rejects a write open without O_APPEND on
  * a file flagged ZFS_APPENDONLY (EPERM) and counts O_SYNC opens on the
  * znode.  The cr argument is unused.
  */
 int
 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 {
 	(void) cr;
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	int error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 
 	if (error != 0)
 		return (error);
 
 	/* Honor ZFS_APPENDONLY file attribute */
 	if (blk_mode_is_open_write(mode) &&
 	    (zp->z_pflags & ZFS_APPENDONLY) && !(flag & O_APPEND)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * Keep a count of the synchronous opens in the znode.  On first
 	 * synchronous open we must convert all previous async transactions
 	 * into sync to keep correct ordering.
 	 */
 	if ((flag & O_SYNC) && atomic_inc_32_nv(&zp->z_sync_cnt) == 1)
 		zil_async_to_sync(zfsvfs->z_log, zp->z_id);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 /*
  * Close hook for a ZFS inode: undoes the O_SYNC open accounting done
  * by zfs_open().  The cr argument is unused.
  */
 int
 zfs_close(struct inode *ip, int flag, cred_t *cr)
 {
 	(void) cr;
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	int error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 
 	if (error == 0) {
 		/* Decrement the synchronous opens in the znode */
 		if (flag & O_SYNC)
 			atomic_dec_32(&zp->z_sync_cnt);
 
 		zfs_exit(zfsvfs, FTAG);
 	}
 
 	return (error);
 }
 
 #if defined(_KERNEL)
 
 static int zfs_fillpage(struct inode *ip, struct page *pp);
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  Update all mapped
  * pages with the contents of the corresponding dmu buffer.
  *
  *	IN:	zp	- znode of the file.
  *		start	- byte offset in the file where the range begins.
  *		len	- length of the range in bytes.
  *		os	- objset handed to dmu_read().
  */
 void
 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 {
 	struct address_space *mp = ZTOI(zp)->i_mapping;
 	/* offset of the range within its first page */
 	int64_t off = start & (PAGE_SIZE - 1);
 
 	/* Walk the range one page at a time, starting page aligned. */
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 		uint64_t nbytes = MIN(PAGE_SIZE - off, len);
 
 		/* Pages that are not in the mapping are simply skipped. */
 		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			/* Re-read this part of the page from the DMU. */
 			void *pb = kmap(pp);
 			int error = dmu_read(os, zp->z_id, start + off,
 			    nbytes, pb + off, DMU_READ_PREFETCH);
 			kunmap(pp);
 
 			/* A failed read leaves the page flagged, not up to date. */
 			if (error) {
 				SetPageError(pp);
 				ClearPageUptodate(pp);
 			} else {
 				ClearPageError(pp);
 				SetPageUptodate(pp);
 
 				if (mapping_writably_mapped(mp))
 					flush_dcache_page(pp);
 
 				mark_page_accessed(pp);
 			}
 
 			unlock_page(pp);
 			put_page(pp);
 		}
 
 		len -= nbytes;
 		/* Only the first page can begin mid-page. */
 		off = 0;
 	}
 }
 
 /*
  * When a file is memory mapped, we must keep the I/O data synchronized
  * between the DMU cache and the memory mapped pages.  Preferentially read
  * from memory mapped pages, otherwise fallback to reading through the dmu.
  *
  *	IN:	zp	- znode of the file to read.
  *		nbytes	- number of bytes to read.
  *		uio	- destination; uio_loffset gives the start offset.
  *
  *	RETURN:	0 on success, otherwise the first error returned by
  *		zfs_fillpage(), zfs_uiomove() or dmu_read_uio_dbuf().
  */
 int
 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 {
 	struct inode *ip = ZTOI(zp);
 	struct address_space *mp = ip->i_mapping;
 	int64_t start = uio->uio_loffset;
 	/* offset of the read within its first page */
 	int64_t off = start & (PAGE_SIZE - 1);
 	int len = nbytes;
 	int error = 0;
 
 	/* Walk the range one page at a time, starting page aligned. */
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 		uint64_t bytes = MIN(PAGE_SIZE - off, len);
 
 		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 
 			/*
 			 * If filemap_fault() retries there exists a window
 			 * where the page will be unlocked and not up to date.
 			 * In this case we must try and fill the page.
 			 */
 			if (unlikely(!PageUptodate(pp))) {
 				error = zfs_fillpage(ip, pp);
 				if (error) {
 					unlock_page(pp);
 					put_page(pp);
 					return (error);
 				}
 			}
 
 			ASSERT(PageUptodate(pp) || PageDirty(pp));
 
 			/*
 			 * The page lock is dropped before the copy; the
 			 * reference from find_lock_page() is held until
 			 * put_page() below.
 			 */
 			unlock_page(pp);
 
 			void *pb = kmap(pp);
 			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 			kunmap(pp);
 
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			mark_page_accessed(pp);
 			put_page(pp);
 		} else {
 			/* No page in the mapping: read through the DMU. */
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 		}
 
 		len -= bytes;
 		/* Only the first page can begin mid-page. */
 		off = 0;
 
 		if (error)
 			break;
 	}
 
 	return (error);
 }
 #endif /* _KERNEL */
 
 /*
  * Initialized to DMU_MAX_DELETEBLKCNT.
  * NOTE(review): no consumer is visible in this part of the file -
  * confirm where it is read before relying on its meaning.
  */
 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	zp	- znode of file to be written to
  *		data	- bytes to write
  *		len	- number of bytes to write
  *		pos	- offset to start writing at
  *
  *	OUT:	resid	- remaining bytes to write
  *
  *	RETURN:	0 if success
  *		positive error code if failure.  EIO is	returned
  *		for a short write when residp isn't provided.
  *
  * Timestamps:
  *	zp - ctime|mtime updated if byte count > 0
  */
 int
 zfs_write_simple(znode_t *zp, const void *data, size_t len,
     loff_t pos, size_t *residp)
 {
 	/* Describe the caller's buffer as a single-segment kernel uio. */
 	struct iovec iov = {
 		.iov_base = (void *)data,
 		.iov_len = len,
 	};
 	zfs_uio_t uio;
 	zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	int error = zfs_write(zp, &uio, 0, kcred);
 	spl_fstrans_unmark(cookie);
 
 	if (error != 0)
 		return (error);
 
 	/* Report the residual when asked for it. */
 	if (residp != NULL) {
 		*residp = zfs_uio_resid(&uio);
 		return (0);
 	}
 
 	/* Without residp a short write is an error. */
 	if (zfs_uio_resid(&uio) != 0)
 		return (SET_ERROR(EIO));
 
 	return (0);
 }
 
 /*
  * Taskq callback dispatched by zfs_zrele_async(); arg is the
  * struct inode * that was passed to taskq_dispatch(), and it is
  * released here with iput().
  */
 static void
 zfs_rele_async_task(void *arg)
 {
 	iput(arg);
 }
 
 /*
  * Release a znode reference without running the final iput() in the
  * caller's context.
  */
 void
 zfs_zrele_async(znode_t *zp)
 {
 	struct inode *ip = ZTOI(zp);
 	objset_t *os = ITOZSB(ip)->z_os;
 
 	ASSERT(atomic_read(&ip->i_count) > 0);
 	ASSERT(os != NULL);
 
 	/*
 	 * Drop the reference inline unless the count is currently 1, in
 	 * which case the decrement is not performed here.
 	 */
 	if (atomic_add_unless(&ip->i_count, -1, 1))
 		return;
 
 	/*
 	 * If decrementing the count would put us at 0, we can't do it inline
 	 * here, because that would be synchronous. Instead, dispatch an iput
 	 * to run later.
 	 *
 	 * For more information on the dangers of a synchronous iput, see the
 	 * header comment of this file.
 	 */
 	VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 	    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 }
 
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held inode reference for it.
  *
  *	IN:	zdp	- znode of directory to search.
  *		nm	- name of entry to lookup.
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *		cr	- credentials of caller.
  *		direntflags - directory lookup flags
  *		realpnp - returned pathname.
  *
  *	OUT:	zpp	- znode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	NA
  */
 int
 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
     int *direntflags, pathname_t *realpnp)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 	int error = 0;
 
 	/*
 	 * Fast path lookup, however we must skip DNLC lookup
 	 * for case folding or normalizing lookups because the
 	 * DNLC code only stores the passed in name.  This means
 	 * creating 'a' and removing 'A' on a case insensitive
 	 * file system would work, but DNLC still thinks 'a'
 	 * exists and won't let you create it again on the next
 	 * pass through fast path.
 	 */
 	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 
 		/* These checks run before zfs_enter_verify_zp() is taken. */
 		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 
 		/* An empty name or "." resolves to the directory itself. */
 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 			error = zfs_fastaccesschk_execute(zdp, cr);
 			if (!error) {
 				*zpp = zdp;
 				zhold(*zpp);
 				return (0);
 			}
 			return (error);
 		}
 	}
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 		return (error);
 
 	*zpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 		/*
 		 * We don't allow recursive attributes.
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		/* On denial the hold on the xattr directory is dropped. */
 		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 		    B_TRUE, cr, zfs_init_idmap))) {
 			zrele(*zpp);
 			*zpp = NULL;
 		}
 
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENOTDIR));
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 	    zfs_init_idmap))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/* On utf8 file systems a name that fails validation is EILSEQ. */
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 	if ((error == 0) && (*zpp))
 		zfs_znode_update_vfs(*zpp);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Perform a linear search in directory for the name of specific inode.
  * Note we don't pass in the buffer size of name because it's hardcoded to
  * NAME_MAX+1(256) in Linux.
  *
  *	IN:	dzp	- znode of directory to search.
  *		zp	- znode of the target
  *
  *	OUT:	name	- dentry name of the target
  *
  *	RETURN:	0 on success, error code on failure.  ENOENT is returned
  *		when either znode is a ctldir znode.
  */
 int
 zfs_get_name(znode_t *dzp, char *name, znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
 	int error = 0;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 
 	/* dzp was verified above; the target needs its own check. */
 	if ((error = zfs_verify_zp(zp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * ctldir should have got their name in zfs_vget.  Report the
 	 * failure through SET_ERROR() like every other locally generated
 	 * error in this file.
 	 */
 	if (dzp->z_is_ctldir || zp->z_is_ctldir) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* buffer len is hardcoded to 256 in Linux kernel */
 	error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id,
 	    ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the ip of the created or trunc'd file.
  *
  *	IN:	dzp	- znode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		flag	- file flag.
  *		vsecp	- ACL to be set
  *		mnt_ns	- user namespace of the mount
  *
  *	OUT:	zpp	- znode of created or trunc'd entry.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated if new entry created
  *	 zp - ctime|mtime always, atime if new
  */
 int
 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
     zidmap_t *mnt_ns)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	/* B_TRUE while acl_ids holds state that must be freed */
 	boolean_t	have_acl = B_FALSE;
 	/* B_TRUE once dmu_tx_wait() has been called for this operation */
 	boolean_t	waited = B_FALSE;
 	boolean_t	skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	gid = crgetgid(cr);
 	uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/* Retry point: re-entered after dmu_tx_assign() returns ERESTART. */
 top:
 	*zpp = NULL;
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		zhold(dzp);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible igrab(zp) */
 		int zflg = 0;
 
 		if (flag & FIGNORECASE)
 			zflg |= ZCILOOK;
 
 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 		    NULL, NULL);
 		if (error) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			if (strcmp(name, "..") == 0)
 				error = SET_ERROR(EISDIR);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/* zp == NULL: no entry with this name yet, so create one. */
 	if (zp == NULL) {
 		uint64_t txtype;
 		uint64_t projid = ZFS_DEFAULT_PROJID;
 
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
 		    mnt_ns))) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 
 		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/* acl_ids is built once and reused across ERESTART retries. */
 		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
 			goto out;
 		have_acl = B_TRUE;
 
 		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 			projid = zfs_inherit_projid(dzp);
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 			zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EDQUOT);
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		fuid_dirtied = zfsvfs->z_fuid_dirty;
 		if (fuid_dirtied)
 			zfs_fuid_txhold(zfsvfs, tx);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 
 		/* Non-blocking assign, because the dirent lock is held. */
 		error = dmu_tx_assign(tx,
 		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
 				/* acl_ids is deliberately kept for the retry. */
 				waited = B_TRUE;
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 		error = zfs_link_create(dl, zp, tx, ZNEW);
 		if (error != 0) {
 			/*
 			 * Since, we failed to add the directory entry for it,
 			 * delete the newly created dnode.
 			 */
 			zfs_znode_delete(zp, tx);
 			remove_inode_hash(ZTOI(zp));
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_commit(tx);
 			goto out;
 		}
 
 		if (fuid_dirtied)
 			zfs_fuid_sync(zfsvfs, tx);
 
 		/* Log the create before the tx commits and locks drop. */
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 		    vsecp, acl_ids.z_fuidp, vap);
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 	} else {
 		int aflags = (flag & O_APPEND) ? V_APPEND : 0;
 
 		/* acl_ids may be left over from an earlier pass. */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl) {
 			error = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if (S_ISDIR(ZTOI(zp)->i_mode)) {
 			error = SET_ERROR(EISDIR);
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
 		    mnt_ns))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if (S_ISREG(ZTOI(zp)->i_mode) &&
 		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
 			/* we can't hold any locks when calling zfs_freesp() */
 			if (dl) {
 				zfs_dirent_unlock(dl);
 				dl = NULL;
 			}
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 		}
 	}
 	/*
 	 * Common exit: drop the dirent lock if it is still held, then either
 	 * release zp (on error) or return it through zpp.
 	 */
 out:
 
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		if (zp)
 			zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 		*zpp = zp;
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Create a new znode with the IS_TMPFILE flag and place it on the
  * unlinked set instead of linking it into the directory.
  *
  *	IN:	dip	- inode of the directory the file is created under.
  *		vap	- attributes of new file.
  *		excl	- unused.
  *		mode	- unused.
  *		cr	- credentials of caller.
  *		flag	- unused.
  *		vsecp	- ACL to be set
  *		mnt_ns	- passed through to the access and ACL helpers.
  *
  *	OUT:	ipp	- inode of the created file, NULL on failure.
  *
  *	RETURN:	0 on success, error code on failure.
  */
 int
 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
     zidmap_t *mnt_ns)
 {
 	(void) excl, (void) mode, (void) flag;
 	znode_t		*zp = NULL, *dzp = ITOZ(dip);
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	objset_t	*os;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t   acl_ids;
 	uint64_t	projid = ZFS_DEFAULT_PROJID;
 	boolean_t	fuid_dirtied;
 	/* B_TRUE while acl_ids holds state that must be freed */
 	boolean_t	have_acl = B_FALSE;
 	/* B_TRUE once dmu_tx_wait() has been called for this operation */
 	boolean_t	waited = B_FALSE;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	gid = crgetgid(cr);
 	uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	os = zfsvfs->z_os;
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/* Retry point: re-entered after dmu_tx_assign() returns ERESTART. */
 top:
 	*ipp = NULL;
 
 	/*
 	 * Create a new file object and update the directory
 	 * to reference it.
 	 */
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		goto out;
 	}
 
 	/* acl_ids is built once and reused across ERESTART retries. */
 	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
 		goto out;
 	have_acl = B_TRUE;
 
 	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 		projid = zfs_inherit_projid(dzp);
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
 	}
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 	/* The new znode is added to the unlinked set below. */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa &&
 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (error == ERESTART) {
 			/* acl_ids is deliberately kept for the retry. */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/* Add to unlinked set */
 	zp->z_unlinked = B_TRUE;
 	zfs_unlinked_add(zp, tx);
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 	/*
 	 * Common exit: on error release zp if one was created, otherwise
 	 * return its inode through ipp.
 	 */
 out:
 
 	if (error) {
 		if (zp)
 			zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 		*ipp = ZTOI(zp);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dzp	- znode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *		flags	- case flags.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dzp - ctime|mtime
  *	 ip - ctime (if nlink > 0)
  */
 
 /*
  * Zero-valued xattr object number.  zfs_remove() writes it over the
  * SA_ZPL_XATTR attribute of a non-SA znode when it detaches that znode's
  * xattr directory (SA znodes have the attribute removed instead).
  */
 static uint64_t null_xattr = 0;
 
 /*
  * Unlink a non-directory entry from dzp.
  *
  * Flow: lock the directory entry, check delete permission, build a
  * transaction that covers both possible outcomes (immediate delete or a
  * move to the unlinked set), destroy the link, then log the removal.
  *
  * Returns EINVAL if name is NULL and EPERM if the entry is a directory.
  * Errors from zfs_dirent_lock(), zfs_zaccess_delete(), zfs_zget() on the
  * xattr directory, dmu_tx_assign() and zfs_link_destroy() are passed back
  * unchanged.
  */
 int
 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 {
 	znode_t		*zp;
 	znode_t		*xzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	uint64_t	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
 	uint64_t	links;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 	pathname_t	realnm;
 	int		error;
 	int		zflg = ZEXISTS;
 	boolean_t	waited = B_FALSE;
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Case-insensitive request: do a case-insensitive lookup and
 	 * allocate a buffer that zfs_dirent_lock() is handed as realnmp.
 	 */
 	if (flags & FIGNORECASE) {
 		zflg |= ZCILOOK;
 		pn_alloc(&realnm);
 		realnmp = &realnm;
 	}
 
 	/* Re-entered after dmu_tx_assign() returns ERESTART. */
 top:
 	xattr_obj = 0;
 	xzp = NULL;
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, realnmp))) {
 		if (realnmp)
 			pn_free(realnmp);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (S_ISDIR(ZTOI(zp)->i_mode)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	/*
 	 * First (optimistic) check: we hold the only inode reference and
 	 * there is no cached data.  Re-checked after the link is destroyed.
 	 */
 	mutex_enter(&zp->z_lock);
 	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
 	    !zn_has_cached_data(zp, 0, LLONG_MAX);
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the inode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	obj = zp->z_id;
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	if (may_delete_now) {
 		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
 		/* if the file is too big, only hold_free a token amount */
 		dmu_tx_hold_free(tx, zp->z_id, 0,
 		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
 	}
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		if (error != 0) {
 			/*
 			 * The xattr directory could not be instantiated.
 			 * An assertion alone is not error handling: where
 			 * assertions are disabled xzp would be dereferenced
 			 * below without being valid.  The tx has not been
 			 * assigned yet, so abort it and leave through the
 			 * common exit path, which drops the dirent lock
 			 * and the hold on zp.
 			 */
 			xzp = NULL;
 			dmu_tx_abort(tx);
 			goto out;
 		}
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	mutex_enter(&zp->z_lock);
 	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 	mutex_exit(&zp->z_lock);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/*
 			 * Drop everything taken since top: and retry; the
 			 * retry passes TXG_NOTHROTTLE because we waited.
 			 */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			zrele(zp);
 			if (xzp)
 				zrele(xzp);
 			goto top;
 		}
 		if (realnmp)
 			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		zrele(zp);
 		if (xzp)
 			zrele(xzp);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		/*
 		 * Hold z_lock so that we can make sure that the ACL obj
 		 * hasn't changed.  Could have been deleted due to
 		 * zfs_sa_upgrade().
 		 *
 		 * Note: z_lock stays held past this block; it is released
 		 * in whichever of the two branches below runs.
 		 */
 		mutex_enter(&zp->z_lock);
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
 		delete_now = may_delete_now && !toobig &&
 		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
 		    !zn_has_cached_data(zp, 0, LLONG_MAX) &&
 		    xattr_obj == xattr_obj_unlinked &&
 		    zfs_external_acl(zp) == acl_obj;
 		VERIFY_IMPLY(xattr_obj_unlinked, xzp);
 	}
 
 	if (delete_now) {
 		if (xattr_obj_unlinked) {
 			/*
 			 * Detach the xattr directory: zero its link count,
 			 * queue it on the unlinked set, and clear the
 			 * reference to it from zp.
 			 */
 			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = B_TRUE;
 			clear_nlink(ZTOI(xzp));
 			links = 0;
 			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 			    &links, sizeof (links), tx);
 			ASSERT3U(error,  ==,  0);
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 
 			if (zp->z_is_sa)
 				error = sa_remove(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), tx);
 			else
 				error = sa_update(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
 				    sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		/*
 		 * Add to the unlinked set because a new reference could be
 		 * taken concurrently resulting in a deferred destruction.
 		 */
 		zfs_unlinked_add(zp, tx);
 		mutex_exit(&zp->z_lock);
 	} else if (unlinked) {
 		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
 	}
 
 	txtype = TX_REMOVE;
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
 
 	dmu_tx_commit(tx);
 out:
 	/*
 	 * Common exit.  Reached with the dirent lock and a hold on zp, and
 	 * with no transaction left open.
 	 */
 	if (realnmp)
 		pn_free(realnmp);
 
 	zfs_dirent_unlock(dl);
 	zfs_znode_update_vfs(dzp);
 	zfs_znode_update_vfs(zp);
 
 	/* Release synchronously only when the znode is deleted right now. */
 	if (delete_now)
 		zrele(zp);
 	else
 		zfs_zrele_async(zp);
 
 	if (xzp) {
 		zfs_znode_update_vfs(xzp);
 		zfs_zrele_async(xzp);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dzp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dzp	- znode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		flags	- case flags.
  *		vsecp	- ACL to be set
  *		mnt_ns	- user namespace of the mount
  *
  *	OUT:	zpp	- znode of created directory.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dzp - ctime|mtime updated
  *	zpp - ctime|mtime|atime updated
  */
 int
 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zf = ZNEW;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(S_ISDIR(vap->va_mode));
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if (dirname == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* From here on every return path must call zfs_exit(). */
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/* Refuse to create a subdirectory under a ZFS_XATTR-flagged parent. */
 	if (dzp->z_pflags & ZFS_XATTR) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* On z_utf8 file systems the name must pass u8_validate(). */
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/*
 	 * The ACL ids are built once, before the retry label, and reused on
 	 * every ERESTART retry; each exit path below frees them.
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    vsecp, &acl_ids, mnt_ns)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 top:
 	*zpp = NULL;
 
 	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
 	    NULL, NULL))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
 	    mnt_ns))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	/* Without SA, an ACL larger than ZFS_ACE_SPACE needs a write hold. */
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/*
 			 * Wait, then retry from top: with the dirent lock
 			 * dropped; acl_ids is deliberately kept.
 			 */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	error = zfs_link_create(dl, zp, tx, ZNEW);
 	if (error != 0) {
 		/*
 		 * Undo the node creation inside the same tx; the out: path
 		 * then commits the tx and drops the hold on zp.  *zpp is
 		 * still NULL from top: on this path.
 		 */
 		zfs_znode_delete(zp, tx);
 		remove_inode_hash(ZTOI(zp));
 		goto out;
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	*zpp = zp;
 
 	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
 	    acl_ids.z_fuidp, vap);
 
 out:
 	/* Reached only with an assigned tx and the dirent lock held. */
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	if (error != 0) {
 		zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 	}
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dzp	- znode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- inode of current working directory.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated
  */
 int
 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
     int flags)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zflg = ZEXISTS;
 	boolean_t	waited = B_FALSE;
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 	/* Re-entered after dmu_tx_assign() returns ERESTART. */
 top:
 	zp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, NULL))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * The three checks below jump to out: before either rwlock is
 	 * taken; out: sits after the rw_exit() calls, so that is safe.
 	 */
 	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
 		goto out;
 	}
 
 	/* Only directories may be removed through this entry point. */
 	if (!S_ISDIR(ZTOI(zp)->i_mode)) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	/* Refuse to remove the caller's current working directory. */
 	if (zp == cwd) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * Grab a lock on the directory to make sure that no one is
 	 * trying to add (or lookup) entries while we are removing it.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 
 	/*
 	 * Grab a lock on the parent pointer to make sure we play well
 	 * with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		/* Release in reverse order of acquisition. */
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/* Wait with no locks held, then start over. */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			zrele(zp);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		zrele(zp);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
 
 	/* Log the removal only if the link was actually destroyed. */
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
 		    B_FALSE);
 	}
 
 	/* The tx is committed whether or not zfs_link_destroy() succeeded. */
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 out:
 	/* Common exit: the dirent lock and a hold on zp are still owned. */
 	zfs_dirent_unlock(dl);
 
 	zfs_znode_update_vfs(dzp);
 	zfs_znode_update_vfs(zp);
 	zrele(zp);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Read directory entries from the given directory cursor position and emit
  * name and position for each entry.
  *
  *	IN:	ip	- inode of directory to read.
  *		ctx	- directory entry context.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 int
 zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
 {
 	/* cr is accepted for interface compatibility but not used here. */
 	(void) cr;
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	objset_t	*os;
 	zap_cursor_t	zc;
 	zap_attribute_t	*zap;
 	int		error;
 	uint8_t		prefetch;
 	uint8_t		type;
 	int		done = 0;
 	uint64_t	parent;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	/*
 	 * The parent object number is needed for the ".." entry.  Both
 	 * early exits below jump to out:, which skips the cursor/attribute
 	 * cleanup because neither has been set up yet.
 	 */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0)
 		goto out;
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if (zp->z_unlinked)
 		goto out;
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = ctx->pos;
 	/* Snapshot the prefetch flag once; it is cleared after the loop. */
 	prefetch = zp->z_zn_prefetch;
 	zap = zap_attribute_long_alloc();
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	while (!done) {
 		uint64_t objnum;
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap->za_name, ".");
 			zap->za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap->za_name, "..");
 			zap->za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap->za_name, ZFS_CTLDIR_NAME);
 			zap->za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.  ENOENT from the cursor ends
 			 * the loop normally; any other error leaves through
 			 * update: and is returned to the caller.
 			 */
 			if ((error = zap_cursor_retrieve(&zc, zap))) {
 				if (error == ENOENT)
 					break;
 				else
 					goto update;
 			}
 
 			/*
 			 * Allow multiple entries provided the first entry is
 			 * the object id.  Non-zpl consumers may safely make
 			 * use of the additional space.
 			 *
 			 * XXX: This should be a feature flag for compatibility
 			 */
 			if (zap->za_integer_length != 8 ||
 			    zap->za_num_integers == 0) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld, "
 				    "length = %d, num = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset,
 				    zap->za_integer_length,
 				    (u_longlong_t)zap->za_num_integers);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			/* Object number and type both come from the first integer. */
 			objnum = ZFS_DIRENT_OBJ(zap->za_first_integer);
 			type = ZFS_DIRENT_TYPE(zap->za_first_integer);
 		}
 
 		/*
 		 * dir_emit() returning false stops the walk; ctx->pos is
 		 * left at the entry that was not accepted.
 		 */
 		done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name),
 		    objnum, type);
 		if (done)
 			break;
 
 		if (prefetch)
 			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 * Synthetic entries (`.', `..', `.zfs') only bump the
 		 * offset; real entries advance the ZAP cursor and use its
 		 * serialized form as the new offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 		ctx->pos = offset;
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 update:
 	zap_cursor_fini(&zc);
 	zap_attribute_free(zap);
 	/* Running off the end of the directory is not an error. */
 	if (error == ENOENT)
 		error = 0;
 out:
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Get the basic file attributes and place them in the provided kstat
  * structure.  The inode is assumed to be the authoritative source
  * for most of the attributes.  However, the znode currently has the
  * authoritative atime, blksize, and block count.
  *
  *	IN:	ip	- inode of file.
  *
  *	OUT:	sp	- kstat values.
  *
  *	RETURN:	0 (always succeeds)
  */
 /*
  * Fill *sp from the inode via zpl_generic_fillattr(), then overlay the
  * values taken from the znode: block size and block count from
  * sa_object_size(), the extra root link for a visible '.zfs', and the
  * snapshot-root inode number.  Returns the zfs_enter_verify_zp() error if
  * the file system cannot be entered, otherwise 0.
  */
 int
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
     struct kstat *sp)
 #else
 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
 #endif
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	u_longlong_t blk_count;
 	uint32_t blk_size;
 	int rc;
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	mutex_enter(&zp->z_lock);
 
 	/* Start from the generic inode attributes. */
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 	zpl_generic_fillattr(user_ns, request_mask, ip, sp);
 #else
 	zpl_generic_fillattr(user_ns, ip, sp);
 #endif
 
 	/*
 	 * The root inode gets one extra link while '.zfs' is visible,
 	 * as long as that stays below ZFS_LINK_MAX.
 	 */
 	if (zp->z_id == zfsvfs->z_root && zfs_show_ctldir(zp) &&
 	    sp->nlink < ZFS_LINK_MAX) {
 		sp->nlink++;
 	}
 
 	sa_object_size(zp->z_sa_hdl, &blk_size, &blk_count);
 	sp->blocks = blk_count;
 	if (unlikely(zp->z_blksz == 0)) {
 		/* No block size yet: advertise the largest one instead. */
 		sp->blksize = zfsvfs->z_max_blksz;
 	} else {
 		sp->blksize = blk_size;
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * Keep the snapshot root's inode number stable so an NFS client
 	 * sees the same value before and after the snapshot is mounted.
 	 */
 	if (zfsvfs->z_issnap && ip->i_sb->s_root->d_inode == ip) {
 		sp->ino = ZFSCTL_INO_SNAPDIRS -
 		    dmu_objset_id(zfsvfs->z_os);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (0);
 }
 
 /*
  * For the operation of changing file's user/group/project, we need to
  * handle not only the main object that is assigned to the file directly,
  * but also the ones that are used by the file via hidden xattr directory.
  *
  * Because the xattr directory may contains many EA entries, as to it may
  * be impossible to change all of them via the transaction of changing the
  * main object's user/group/project attributes. Then we have to change them
  * via other multiple independent transactions one by one. It may be not good
  * solution, but we have no better idea yet.
  */
 /*
  * Walk every entry of the directory dzp and, for each child whose uid, gid
  * or project id differs from dzp's, copy dzp's values onto the child in a
  * transaction of its own.
  *
  * Returns 0 once the ZAP cursor is exhausted (ENOENT), otherwise the first
  * error that stopped the walk.
  */
 static int
 zfs_setattr_dir(znode_t *dzp)
 {
 	struct inode	*dxip = ZTOI(dzp);
 	struct inode	*xip = NULL;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	objset_t	*os = zfsvfs->z_os;
 	zap_cursor_t	zc;
 	zap_attribute_t	*zap;
 	zfs_dirlock_t	*dl;
 	znode_t		*zp = NULL;
 	dmu_tx_t	*tx = NULL;
 	uint64_t	uid, gid;
 	sa_bulk_attr_t	bulk[4];
 	int		count;
 	int		err;
 
 	zap = zap_attribute_alloc();
 	zap_cursor_init(&zc, os, dzp->z_id);
 	while ((err = zap_cursor_retrieve(&zc, zap)) == 0) {
 		/* Number of bulk[] slots filled for this entry. */
 		count = 0;
 		/* Each entry must be exactly one 8-byte integer. */
 		if (zap->za_integer_length != 8 || zap->za_num_integers != 1) {
 			err = ENXIO;
 			break;
 		}
 
 		/* An entry that vanished since it was read is skipped. */
 		err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp,
 		    ZEXISTS, NULL, NULL);
 		if (err == ENOENT)
 			goto next;
 		if (err)
 			break;
 
 		/* Nothing to do when the child already matches the directory. */
 		xip = ZTOI(zp);
 		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
 		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
 		    zp->z_projid == dzp->z_projid)
 			goto next;
 
 		/*
 		 * A child without ZFS_PROJID may have the attribute added
 		 * below, so its SA hold is taken with B_TRUE.
 		 */
 		tx = dmu_tx_create(os);
 		if (!(zp->z_pflags & ZFS_PROJID))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 
 		/* On failure tx stays set and is aborted after the loop. */
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err)
 			break;
 
 		/*
 		 * NOTE(review): the lock held here is the directory's
 		 * (dzp->z_lock) while the child's in-core ids are modified
 		 * -- confirm that this is the intended lock.
 		 */
 		mutex_enter(&dzp->z_lock);
 
 		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
 			xip->i_uid = dxip->i_uid;
 			uid = zfs_uid_read(dxip);
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &uid, sizeof (uid));
 		}
 
 		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
 			xip->i_gid = dxip->i_gid;
 			gid = zfs_gid_read(dxip);
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 			    &gid, sizeof (gid));
 		}
 
 
 		/*
 		 * projid doubles as a state flag: it is reset to
 		 * ZFS_INVALID_PROJID once sa_add_projid() has stored the
 		 * value, so no bulk update is queued for it.  EEXIST is
 		 * treated as "already present" and falls through to the
 		 * bulk update.
 		 */
 		uint64_t projid = dzp->z_projid;
 		if (zp->z_projid != projid) {
 			if (!(zp->z_pflags & ZFS_PROJID)) {
 				err = sa_add_projid(zp->z_sa_hdl, tx, projid);
 				if (unlikely(err == EEXIST)) {
 					err = 0;
 				} else if (err != 0) {
 					goto sa_add_projid_err;
 				} else {
 					projid = ZFS_INVALID_PROJID;
 				}
 			}
 
 			if (projid != ZFS_INVALID_PROJID) {
 				zp->z_projid = projid;
 				SA_ADD_BULK_ATTR(bulk, count,
 				    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
 				    sizeof (zp->z_projid));
 			}
 		}
 
 sa_add_projid_err:
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Commit when attributes were queued, or when only
 		 * sa_add_projid() made a change (projid reset above);
 		 * otherwise nothing was modified and the tx is aborted.
 		 *
 		 * NOTE(review): when sa_add_projid() failed but uid/gid
 		 * updates were queued (count > 0), err is overwritten by
 		 * the sa_bulk_update() result -- confirm that dropping the
 		 * sa_add_projid() error is intended.
 		 */
 		if (likely(count > 0)) {
 			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 			dmu_tx_commit(tx);
 		} else if (projid == ZFS_INVALID_PROJID) {
 			dmu_tx_commit(tx);
 		} else {
 			dmu_tx_abort(tx);
 		}
 		tx = NULL;
 		if (err != 0 && err != ENOENT)
 			break;
 
 next:
 		/* Drop the per-entry hold and dirent lock before moving on. */
 		if (zp) {
 			zrele(zp);
 			zp = NULL;
 			zfs_dirent_unlock(dl);
 		}
 		zap_cursor_advance(&zc);
 	}
 
 	/* Clean up whatever a `break' left behind. */
 	if (tx)
 		dmu_tx_abort(tx);
 	if (zp) {
 		zrele(zp);
 		zfs_dirent_unlock(dl);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(zap);
 
 	return (err == ENOENT ? 0 : err);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	zp	- znode of file to be modified.
  *		vap	- new attribute values.
  *			  If ATTR_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		mnt_ns	- user namespace of the mount
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime updated, mtime updated if size changed.
  */
 int
 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 {
 	struct inode	*ip;
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	objset_t	*os;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	*tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2], atime[2];
 	uint64_t	projid = ZFS_INVALID_PROJID;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2 = 0;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	boolean_t	handle_eadir = B_FALSE;
 	sa_bulk_attr_t	*bulk, *xattr_bulk;
 	int		count = 0, xattr_count = 0, bulks = 8;
 
 	if (mask == 0)
 		return (0);
 
 	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (err);
 	ip = ZTOI(zp);
 	os = zfsvfs->z_os;
 
 	/*
 	 * If this is a xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 	if (xoap != NULL && (mask & ATTR_XVATTR)) {
 		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 			if (!dmu_objset_projectquota_enabled(os) ||
 			    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
 				zfs_exit(zfsvfs, FTAG);
 				return (SET_ERROR(ENOTSUP));
 			}
 
 			projid = xoap->xoa_projid;
 			if (unlikely(projid == ZFS_INVALID_PROJID)) {
 				zfs_exit(zfsvfs, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 
 			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
 				projid = ZFS_INVALID_PROJID;
 			else
 				need_policy = TRUE;
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
 		    (xoap->xoa_projinherit !=
 		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
 		    (!dmu_objset_projectquota_enabled(os) ||
 		    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & ATTR_XVATTR))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
 	xva_init(tmpxvattr);
 
 	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
 	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (ATTR_ATIME | ATTR_MTIME)) {
 		if (((mask & ATTR_ATIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & ATTR_MTIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			err = SET_ERROR(EOVERFLOW);
 			goto out3;
 		}
 	}
 
 top:
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfs_is_readonly(zfsvfs)) {
 		err = SET_ERROR(EROFS);
 		goto out3;
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & ATTR_SIZE) {
 		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
 		    mnt_ns);
 		if (err)
 			goto out3;
 
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err)
 			goto out3;
 	}
 
 	if (mask & (ATTR_ATIME|ATTR_MTIME) ||
 	    ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr, mnt_ns);
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
 		int	idmask = (mask & (ATTR_UID|ATTR_GID));
 		int	take_owner;
 		int	take_group;
 		uid_t	uid;
 		gid_t	gid;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & ATTR_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
 		    vap->va_uid);
 		gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
 		    vap->va_gid);
 		take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
 		take_group = (mask & ATTR_GID) &&
 		    zfs_groupmember(zfsvfs, gid, cr);
 
 		/*
 		 * If both ATTR_UID and ATTR_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (ATTR_UID|ATTR_GID)) &&
 		    take_owner && take_group) ||
 		    ((idmask == ATTR_UID) && take_owner) ||
 		    ((idmask == ATTR_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr, mnt_ns) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				(void) secpolicy_setid_clear(vap, cr);
 				trim_mask = (mask & (ATTR_UID|ATTR_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & ATTR_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
 			if (xoap->xoa_projinherit !=
 			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
 				XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((!S_ISREG(ip->i_mode) &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			mutex_exit(&zp->z_lock);
 			err = SET_ERROR(EPERM);
 			goto out3;
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	if (mask & ATTR_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
 		    mnt_ns) == 0) {
 			err = secpolicy_setid_setsticky_clear(ip, vap,
 			    &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
 			if (err)
 				goto out3;
 			trim_mask |= ATTR_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 		}
 		err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
 		    zfs_zaccess_unix, zp);
 		if (err)
 			goto out3;
 
 		if (trim_mask)
 			vap->va_mask |= saved_mask;
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
 		handle_eadir = B_TRUE;
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
 			if (err)
 				goto out2;
 		}
 		if (mask & ATTR_UID) {
 			new_kuid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
 			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
 			    new_kuid)) {
 				if (attrzp)
 					zrele(attrzp);
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & ATTR_GID) {
 			new_kgid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
 			if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
 			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 			    new_kgid)) {
 				if (attrzp)
 					zrele(attrzp);
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (projid != ZFS_INVALID_PROJID &&
 		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
 			if (attrzp)
 				zrele(attrzp);
 			err = EDQUOT;
 			goto out2;
 		}
 	}
 	tx = dmu_tx_create(os);
 
 	if (mask & ATTR_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = EPERM;
 			goto out;
 		}
 
 		if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
 			goto out;
 
 		mutex_enter(&zp->z_lock);
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		mutex_exit(&zp->z_lock);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if (((mask & ATTR_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
 		    (projid != ZFS_INVALID_PROJID &&
 		    !(zp->z_pflags & ZFS_PROJID)))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
 		/*
 		 * An existing object that was upgraded from an older system
 		 * has no slot for the project ID attribute in its on-disk
 		 * layout.  The quota accounting logic, however, accesses the
 		 * related slots directly by offset, so we need to adjust the
 		 * old object's layout to place the project ID at a unified,
 		 * fixed offset.
 		 */
 		if (attrzp)
 			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
 		if (err == 0)
 			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
 
 		if (unlikely(err == EEXIST))
 			err = 0;
 		else if (err != 0)
 			goto out;
 		else
 			projid = ZFS_INVALID_PROJID;
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_enter(&zp->z_acl_lock);
 	mutex_enter(&zp->z_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		mutex_enter(&attrzp->z_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 		if (projid != ZFS_INVALID_PROJID) {
 			attrzp->z_projid = projid;
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
 			    sizeof (attrzp->z_projid));
 		}
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
 
 		if (mask & ATTR_UID) {
 			ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
 			new_uid = zfs_uid_read(ZTOI(zp));
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
 			}
 		}
 
 		if (mask & ATTR_GID) {
 			ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
 			new_gid = zfs_gid_read(ZTOI(zp));
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
 			}
 		}
 		if (!(mask & ATTR_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & ATTR_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = ZTOI(zp)->i_mode = new_mode;
 		ASSERT3P(aclp, !=, NULL);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 	if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
 		zp->z_atime_dirty = B_FALSE;
 		inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
 		ZFS_TIME_ENCODE(&tmp_atime, atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &atime, sizeof (atime));
 	}
 
 	if (mask & (ATTR_MTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		zpl_inode_set_mtime_to_ts(ZTOI(zp),
 		    zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
 		zpl_inode_set_ctime_to_ts(ZTOI(zp),
 		    zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    ctime, sizeof (ctime));
 	}
 
 	if (projid != ZFS_INVALID_PROJID) {
 		zp->z_projid = projid;
 		SA_ADD_BULK_ATTR(bulk, count,
 		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
 		    sizeof (zp->z_projid));
 	}
 
 	if (attrzp && mask) {
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
 		    sizeof (ctime));
 	}
 
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & ATTR_XVATTR)) {
 
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
 			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(S_ISREG(ip->i_mode));
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	mutex_exit(&zp->z_lock);
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 		mutex_exit(&attrzp->z_lock);
 	}
 out:
 	if (err == 0 && xattr_count > 0) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 		if (attrzp)
 			zrele(attrzp);
 		if (err == ERESTART)
 			goto top;
 	} else {
 		if (count > 0)
 			err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 		if (attrzp) {
 			if (err2 == 0 && handle_eadir)
 				err = zfs_setattr_dir(attrzp);
 			zrele(attrzp);
 		}
 		zfs_znode_update_vfs(zp);
 	}
 
 out2:
 	if (os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 out3:
 	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
 	kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
 	kmem_free(tmpxvattr, sizeof (xvattr_t));
 	zfs_exit(zfsvfs, FTAG);
 	return (err);
 }
 
 /*
  * One entry in the singly linked list of z_parent_lock holds that is
  * built by zfs_rename_lock() and torn down by zfs_rename_unlock().
  */
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held, or NULL */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Tear down a lock list produced by zfs_rename_lock().
  *
  * For every entry, starting at the head: asynchronously release the znode
  * hold (if the entry carries one), drop the rwlock, unlink the entry from
  * the list and free it.  On return *zlpp is NULL.
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	for (;;) {
 		zfs_zlock_t *entry = *zlpp;
 
 		if (entry == NULL)
 			break;
 
 		if (entry->zl_znode != NULL)
 			zfs_zrele_async(entry->zl_znode);
 		rw_exit(entry->zl_rwlock);
 
 		/* Advance the head past this entry before freeing it. */
 		*zlpp = entry->zl_next;
 		kmem_free(entry, sizeof (zfs_zlock_t));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  *
  *	IN:	szp	- znode being renamed; its z_parent_lock is taken as
  *			  writer on the first pass.
  *		tdzp	- target directory; the upward walk starts here.
  *		sdzp	- source directory; the walk stops once it is reached.
  *
  *	IN/OUT:	zlpp	- head of the list of locks taken.  Each acquired lock
  *			  is pushed onto the front.  Entries still on the list
  *			  at return (including error returns) are not released
  *			  here; the caller does so with zfs_rename_unlock().
  *
  *	RETURN:	0 if the rename is permitted, EINVAL if tdzp is szp or one
  *		of its descendants, or the error returned by zfs_zget().
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = ZTOZSB(zp)->z_root;
 	uint64_t	oidp = zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 *
 				 * zl is the entry pushed by the previous pass,
 				 * i.e. the current list head, so this releases
 				 * every lock taken so far.  The state is then
 				 * reset to the first-pass values; "continue"
 				 * goes through the loop condition with
 				 * zp == tdzp.
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		/* Record the lock just taken at the head of the list. */
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (oidp == szp->z_id)		/* We're a descendant of szp */
 			return (SET_ERROR(EINVAL));
 
 		if (oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
 			if (error)
 				return (error);
 			/* The hold from zfs_zget() is dropped at unlock. */
 			zl->zl_znode = zp;
 		}
 		/*
 		 * Step up one level: fetch zp's parent object id and arrange
 		 * for the next pass to read-lock zp's z_parent_lock.  The
 		 * lookup's return value is deliberately ignored.
 		 */
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
 		    &oidp, sizeof (oidp));
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdzp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdzp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *		rflags  - RENAME_* flags
  *		wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
  *		mnt_ns	- user namespace of the mount
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	sdzp,tdzp - ctime|mtime updated
  *
  * Notes:
  *	- RENAME_EXCHANGE may not be combined with RENAME_NOREPLACE or
  *	  RENAME_WHITEOUT; unknown rflags bits yield EINVAL.
  *	- When the transaction cannot be assigned without waiting (ERESTART)
  *	  all locks and holds are dropped and the operation restarts at "top".
  *	  The ACL ids created for RENAME_WHITEOUT are kept across such a
  *	  restart (tracked by have_acl), so every return path taken while
  *	  have_acl is set must free them.
  */
 int
 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
 {
 	znode_t		*szp, *tzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(sdzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr;
 	int		error = 0;
 	int		zflg = 0;
 	boolean_t	waited = B_FALSE;
 	/* Needed for whiteout inode creation. */
 	boolean_t	fuid_dirtied;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	have_acl = B_FALSE;
 	znode_t		*wzp = NULL;
 
 
 	if (snm == NULL || tnm == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return (SET_ERROR(EINVAL));
 
 	/* Already checked by Linux VFS, but just to make sure. */
 	if (rflags & RENAME_EXCHANGE &&
 	    (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
 	 * right kind of vattr_t for the whiteout file. These are set
 	 * internally by ZFS so should never be incorrect.
 	 */
 	VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
 	VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
 	VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if ((error = zfs_verify_zp(tdzp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
 	 */
 	if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
 	    zfsctl_is_node(ZTOI(tdzp))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 top:
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		/*
 		 * First compare the two name arguments without
 		 * considering any case folding.
 		 */
 		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
 
 		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
 		ASSERT(error == 0 || !zfsvfs->z_utf8);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			zfs_exit(zfsvfs, FTAG);
 			return (0);
 		}
 		/*
 		 * If the file system is case-folding, then we may
 		 * have some more checking to do.  A case-folding file
 		 * system is either supporting mixed case sensitivity
 		 * access or is completely case-insensitive.  Note
 		 * that the file system is always case preserving.
 		 *
 		 * In mixed sensitivity mode case sensitive behavior
 		 * is the default.  FIGNORECASE must be used to
 		 * explicitly request case insensitive behavior.
 		 *
 		 * If the source and target names provided differ only
 		 * by case (e.g., a request to rename 'tim' to 'Tim'),
 		 * we will treat this as a special case in the
 		 * case-insensitive mode: as long as the source name
 		 * is an exact match, we will allow this to proceed as
 		 * a name-change request.
 		 */
 		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
 		    flags & FIGNORECASE)) &&
 		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
 		    &error) == 0) {
 			/*
 			 * case preserving rename request, require exact
 			 * name matches
 			 */
 			zflg |= ZCIEXACT;
 			zflg &= ~ZCILOOK;
 		}
 	}
 
 	/*
 	 * If the source and destination directories are the same, we should
 	 * grab the z_name_lock of that directory only once.
 	 */
 	if (sdzp == tdzp) {
 		zflg |= ZHAVELOCK;
 		rw_enter(&sdzp->z_name_lock, RW_READER);
 	}
 
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
 		    ZEXISTS | zflg, NULL, NULL);
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
 	} else {
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, zflg, NULL, NULL);
 		serr = zfs_dirent_lock(&sdl,
 		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
 		    NULL, NULL);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				zrele(tzp);
 		}
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(snm, "..") == 0)
 			serr = EINVAL;
 		/*
 		 * After a restart the whiteout ACL ids may already exist;
 		 * this return bypasses "out", so free them here.
 		 */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (serr);
 	}
 	if (terr) {
 		zfs_dirent_unlock(sdl);
 		zrele(szp);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(tnm, "..") == 0)
 			terr = EINVAL;
 		/*
 		 * After a restart the whiteout ACL ids may already exist;
 		 * this return bypasses "out", so free them here.
 		 */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (terr);
 	}
 
 	/*
 	 * If we are using project inheritance, means if the directory has
 	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
 	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 	 * such case, we only allow renames into our tree when the project
 	 * IDs are the same.
 	 */
 	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 	    tdzp->z_projid != szp->z_projid) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
 		goto out;
 
 	if (S_ISDIR(ZTOI(szp)->i_mode)) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		if (rflags & RENAME_NOREPLACE) {
 			error = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/*
 		 * Source and target must be the same type (unless exchanging).
 		 */
 		if (!(rflags & RENAME_EXCHANGE)) {
 			boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
 			boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
 
 			if (s_is_dir != t_is_dir) {
 				error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	} else if (rflags & RENAME_EXCHANGE) {
 		/* Target must exist for RENAME_EXCHANGE. */
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/* Set up inode creation for RENAME_WHITEOUT. */
 	if (rflags & RENAME_WHITEOUT) {
 		/*
 		 * Whiteout files are not regular files or directories, so to
 		 * match zfs_create() we do not inherit the project id.
 		 */
 		uint64_t wo_projid = ZFS_DEFAULT_PROJID;
 
 		error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
 		if (error)
 			goto out;
 
 		/* Created once and reused if we restart from "top". */
 		if (!have_acl) {
 			error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
 			    &acl_ids, mnt_ns);
 			if (error)
 				goto out;
 			have_acl = B_TRUE;
 		}
 
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
 			error = SET_ERROR(EDQUOT);
 			goto out;
 		}
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id,
 	    (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 	if (rflags & RENAME_WHITEOUT) {
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
 		dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 	}
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (error == ERESTART) {
 			/* acl_ids (if any) is kept for the retry. */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			zrele(szp);
 			if (tzp)
 				zrele(tzp);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		zrele(szp);
 		if (tzp)
 			zrele(tzp);
 		/*
 		 * Not retrying: release the whiteout ACL ids, which are
 		 * otherwise only freed on the "out" path.
 		 */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Unlink the source.
 	 */
 	szp->z_pflags |= ZFS_AV_MODIFIED;
 	if (tdzp->z_pflags & ZFS_PROJINHERIT)
 		szp->z_pflags |= ZFS_PROJINHERIT;
 
 	error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 	    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 	VERIFY0(error);
 
 	error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 	if (error)
 		goto commit;
 
 	/*
 	 * Unlink the target.
 	 */
 	if (tzp) {
 		int tzflg = zflg;
 
 		if (rflags & RENAME_EXCHANGE) {
 			/* This inode will be re-linked soon. */
 			tzflg |= ZRENAMING;
 
 			tzp->z_pflags |= ZFS_AV_MODIFIED;
 			if (sdzp->z_pflags & ZFS_PROJINHERIT)
 				tzp->z_pflags |= ZFS_PROJINHERIT;
 
 			error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
 		if (error)
 			goto commit_link_szp;
 	}
 
 	/*
 	 * Create the new target links:
 	 *   * We always link the target.
 	 *   * RENAME_EXCHANGE: Link the old target to the source.
 	 *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
 	 */
 	error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 	if (error) {
 		/*
 		 * If we have removed the existing target, a subsequent call to
 		 * zfs_link_create() to add back the same entry, but with a new
 		 * dnode (szp), should not fail.
 		 */
 		ASSERT3P(tzp, ==, NULL);
 		goto commit_link_tzp;
 	}
 
 	switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
 	case RENAME_EXCHANGE:
 		error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
 		/*
 		 * The same argument as zfs_link_create() failing for
 		 * szp applies here, since the source directory must
 		 * have had an entry we are replacing.
 		 */
 		ASSERT0(error);
 		if (error)
 			goto commit_unlink_td_szp;
 		break;
 	case RENAME_WHITEOUT:
 		zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
 		error = zfs_link_create(sdl, wzp, tx, ZNEW);
 		if (error) {
 			zfs_znode_delete(wzp, tx);
 			remove_inode_hash(ZTOI(wzp));
 			goto commit_unlink_td_szp;
 		}
 		break;
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
 	case RENAME_EXCHANGE:
 		zfs_log_rename_exchange(zilog, tx,
 		    (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
 		    tdzp, tdl->dl_name, szp);
 		break;
 	case RENAME_WHITEOUT:
 		zfs_log_rename_whiteout(zilog, tx,
 		    (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
 		    tdzp, tdl->dl_name, szp, wzp);
 		break;
 	default:
 		ASSERT0(rflags & ~RENAME_NOREPLACE);
 		zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
 		    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
 		break;
 	}
 
 commit:
 	dmu_tx_commit(tx);
 out:
 	if (have_acl)
 		zfs_acl_ids_free(&acl_ids);
 
 	zfs_znode_update_vfs(sdzp);
 	if (sdzp == tdzp)
 		rw_exit(&sdzp->z_name_lock);
 
 	if (sdzp != tdzp)
 		zfs_znode_update_vfs(tdzp);
 
 	zfs_znode_update_vfs(szp);
 	zrele(szp);
 	if (wzp) {
 		zfs_znode_update_vfs(wzp);
 		zrele(wzp);
 	}
 	if (tzp) {
 		zfs_znode_update_vfs(tzp);
 		zrele(tzp);
 	}
 
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 
 	/*
 	 * Clean-up path for broken link state.
 	 *
 	 * At this point we are in a (very) bad state, so we need to do our
 	 * best to correct the state. In particular, all of the nlinks are
 	 * wrong because we were destroying and creating links with ZRENAMING.
 	 *
 	 * In some form, all of these operations have to resolve the state:
 	 *
 	 *  * link_destroy() *must* succeed. Fortunately, this is very likely
 	 *    since we only just created it.
 	 *
 	 *  * link_create()s are allowed to fail (though they shouldn't because
 	 *    we only just unlinked them and are putting the entries back
 	 *    during clean-up). But if they fail, we can just forcefully drop
 	 *    the nlink value to (at the very least) avoid broken nlink values
 	 *    -- though in the case of non-empty directories we will have to
 	 *    panic (otherwise we'd have a leaked directory with a broken ..).
 	 */
 commit_unlink_td_szp:
 	VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
 commit_link_tzp:
 	if (tzp) {
 		if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
 			VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
 	}
 commit_link_szp:
 	if (zfs_link_create(sdl, szp, tx, ZRENAMING))
 		VERIFY0(zfs_drop_nlink(szp, tx, NULL));
 	goto commit;
 }
 
 /*
  * Create a symbolic link named "name" in directory dzp whose contents
  * are the string "link".
  *
  *	IN:	dzp	- Directory to contain new symbolic link.
  *		name	- Name of the new directory entry in dzp.
  *		vap	- Attributes of new entry (va_mode must be a symlink).
  *		link	- Target path stored in the new symlink.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *		mnt_ns	- user namespace of the mount
  *
  *	OUT:	zpp	- Znode for new symbolic link; NULL on failure.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated (presumably by zfs_link_create(); the
  *	      update is not performed directly in this function)
  */
 int
 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
     znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	uint64_t	link_len = strlen(link);
 	uint64_t	log_type = TX_SYMLINK;
 	int		lock_flags = ZNEW;
 	boolean_t	throttled = B_FALSE;
 	boolean_t	fuid_dirtied;
 	zfs_acl_ids_t	acl_ids;
 	zfs_dirlock_t	*dirlock;
 	znode_t		*new_zp;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ASSERT(S_ISLNK(vap->va_mode));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG);
 	if (error != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		lock_flags |= ZCILOOK;
 
 	if (link_len > MAXPATHLEN) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, mnt_ns);
 	if (error != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Lock the directory entry, check permissions and quota, and get a
 	 * transaction assigned.  The whole sequence is repeated whenever the
 	 * assignment reports ERESTART; acl_ids is kept across repeats.
 	 */
 	for (;;) {
 		*zpp = NULL;
 
 		/*
 		 * Attempt to lock directory; fail if entry already exists.
 		 */
 		error = zfs_dirent_lock(&dirlock, dzp, name, &new_zp,
 		    lock_flags, NULL, NULL);
 		if (error) {
 			zfs_acl_ids_free(&acl_ids);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 
 		error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
 		if (error) {
 			zfs_acl_ids_free(&acl_ids);
 			zfs_dirent_unlock(dirlock);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
 		    ZFS_DEFAULT_PROJID)) {
 			zfs_acl_ids_free(&acl_ids);
 			zfs_dirent_unlock(dirlock);
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EDQUOT));
 		}
 
 		tx = dmu_tx_create(zfsvfs->z_os);
 		fuid_dirtied = zfsvfs->z_fuid_dirty;
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, link_len));
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE + link_len);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 			    acl_ids.z_aclp->z_acl_bytes);
 		}
 		if (fuid_dirtied)
 			zfs_fuid_txhold(zfsvfs, tx);
 
 		error = dmu_tx_assign(tx,
 		    (throttled ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 		if (!error)
 			break;
 
 		zfs_dirent_unlock(dirlock);
 		if (error != ERESTART) {
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 
 		/* Wait for the transaction group to move on, then retry. */
 		throttled = B_TRUE;
 		dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datasets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &new_zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/* Store the link target, either as an SA attribute or legacy data. */
 	mutex_enter(&new_zp->z_lock);
 	if (!new_zp->z_is_sa) {
 		zfs_sa_symlink(new_zp, link, link_len, tx);
 	} else {
 		error = sa_update(new_zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, link_len, tx);
 	}
 	mutex_exit(&new_zp->z_lock);
 
 	new_zp->z_size = link_len;
 	(void) sa_update(new_zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &new_zp->z_size, sizeof (new_zp->z_size), tx);
 
 	/*
 	 * Insert the new object into the directory.
 	 */
 	error = zfs_link_create(dirlock, new_zp, tx, ZNEW);
 	if (error == 0) {
 		if (flags & FIGNORECASE)
 			log_type |= TX_CI;
 		zfs_log_symlink(zilog, tx, log_type, dzp, new_zp, name, link);
 
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(new_zp);
 	} else {
 		/* Linking failed: discard the freshly created znode. */
 		zfs_znode_delete(new_zp, tx);
 		remove_inode_hash(ZTOI(new_zp));
 	}
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dirlock);
 
 	if (error != 0) {
 		zrele(new_zp);
 	} else {
 		*zpp = new_zp;
 
 		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 			zil_commit(zilog, 0);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Return, in the buffer contained in the provided uio structure,
  * the symbolic path referred to by ip.
  *
  *	IN:	ip	- inode of symbolic link
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - atime updated
  */
 int
 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
 {
 	(void) cr;
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	znode_t		*zp = ITOZ(ip);
 	int		rc;
 
 	/* Enter the filesystem and validate the znode before touching it. */
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * z_lock is held across the lookup.  When z_is_sa is set the link
 	 * target is read via the SA_ZPL_SYMLINK attribute, otherwise via
 	 * zfs_sa_readlink().
 	 */
 	mutex_enter(&zp->z_lock);
 	if (!zp->z_is_sa) {
 		rc = zfs_sa_readlink(zp, uio);
 	} else {
 		rc = sa_lookup_uio(zp->z_sa_hdl,
 		    SA_ZPL_SYMLINK(zfsvfs), uio);
 	}
 	mutex_exit(&zp->z_lock);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Insert a new entry into directory tdzp referencing szp.
  *
  *	IN:	tdzp	- Directory to contain new entry.
  *		szp	- znode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		flags	- case flags.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	tdzp - ctime|mtime updated
  *	 szp - ctime updated
  */
 int
 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
     int flags)
 {
 	struct inode *sip = ZTOI(szp);
 	znode_t		*tzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zf = ZNEW;
 	uint64_t	parent;
 	uid_t		owner;
 	boolean_t	waited = B_FALSE;
 	boolean_t	is_tmpfile = 0;
 	uint64_t	txg;
 
 	/*
 	 * A source inode with a zero link count that is flagged I_LINKABLE
 	 * is handled as a tmpfile being given its first name.
 	 */
 	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
 
 	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (S_ISDIR(sip->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	if ((error = zfs_verify_zp(szp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * If we are using project inheritance, means if the directory has
 	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
 	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 	 * such case, we only allow hard link creation in our tree when the
 	 * project IDs are the same.
 	 */
 	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 	    tdzp->z_projid != szp->z_projid) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
 	 */
 	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t))) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	if (parent == zfsvfs->z_shares_dir) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* On utf8-only filesystems the new name must be valid UTF-8. */
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A caller that does not own the source file must pass
 	 * secpolicy_basic_link().
 	 */
 	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
 	    cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* The caller needs ACE_ADD_FILE access on the target directory. */
 	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
 	    zfs_init_idmap))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Retry point: re-entered after dmu_tx_wait() when dmu_tx_assign()
 	 * below returns ERESTART.
 	 */
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
 	if (error) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
 	/* A tmpfile is also removed from the unlinked set in this tx. */
 	if (is_tmpfile)
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, tdzp);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		/* The dirent lock is dropped on every failure path. */
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	/* unmark z_unlinked so zfs_link_create will not reject */
 	if (is_tmpfile)
 		szp->z_unlinked = B_FALSE;
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		/*
 		 * tmpfile is created to be in z_unlinkedobj, so remove it.
 		 * Also, we don't log in ZIL, because all previous file
 		 * operation on the tmpfile are ignored by ZIL. Instead we
 		 * always wait for txg to sync to make sure all previous
 		 * operation are sync safe.
 		 */
 		if (is_tmpfile) {
 			VERIFY(zap_remove_int(zfsvfs->z_os,
 			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
 		} else {
 			if (flags & FIGNORECASE)
 				txtype |= TX_CI;
 			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
 		}
 	} else if (is_tmpfile) {
 		/* restore z_unlinked since linking failed */
 		szp->z_unlinked = B_TRUE;
 	}
 	/*
 	 * Record the txg before committing; the tmpfile case waits for this
 	 * txg to sync below.
 	 */
 	txg = dmu_tx_get_txg(tx);
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
 
 	/* Both znodes are pushed to the VFS whether or not the link succeeded. */
 	zfs_znode_update_vfs(tdzp);
 	zfs_znode_update_vfs(szp);
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Commit callback handed to zfs_log_write() by zfs_putpage() when for_sync
  * is set: clears the error flag on the page passed as arg and ends
  * writeback on it.
  */
 static void
 zfs_putpage_sync_commit_cb(void *arg)
 {
 	struct page *page = (struct page *)arg;
 
 	ClearPageError(page);
 	end_page_writeback(page);
 }
 
 /*
  * Commit callback handed to zfs_log_write() by zfs_putpage() when for_sync
  * is not set.  Does the same page bookkeeping as the sync variant and then
  * drops the znode's z_async_writes_cnt, which zfs_putpage() incremented.
  */
 static void
 zfs_putpage_async_commit_cb(void *arg)
 {
 	struct page *page = (struct page *)arg;
 	/* Resolve the owning znode before writeback is ended on the page. */
 	znode_t *owner = ITOZ(page->mapping->host);
 
 	ClearPageError(page);
 	end_page_writeback(page);
 	atomic_dec_32(&owner->z_async_writes_cnt);
 }
 
 /*
  * Push a page out to disk, once the page is on stable storage the
  * registered commit callback will be run as notification of completion.
  *
  *	IN:	ip	 - page mapped for inode.
  *		pp	 - page to push (page is locked)
  *		wbc	 - writeback control data
  *		for_sync - does the caller intend to wait synchronously for the
  *			   page writeback to complete?
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime|mtime updated
  */
 int
 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
     boolean_t for_sync)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	loff_t		offset;
 	loff_t		pgoff;
 	unsigned int	pglen;
 	dmu_tx_t	*tx;
 	caddr_t		va;
 	int		err = 0;
 	uint64_t	mtime[2], ctime[2];
 	inode_timespec_t tmp_ts;
 	sa_bulk_attr_t	bulk[3];
 	int		cnt = 0;
 	struct address_space *mapping;
 
 	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (err);
 
 	ASSERT(PageLocked(pp));
 
 	pgoff = page_offset(pp);	/* Page byte-offset in file */
 	offset = i_size_read(ip);	/* File length in bytes */
 	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
 	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
 
 	/* Page is beyond end of file */
 	if (pgoff >= offset) {
 		unlock_page(pp);
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/* Truncate page length to end of file */
 	if (pgoff + pglen > offset)
 		pglen = offset - pgoff;
 
 #if 0
 	/*
 	 * FIXME: Allow mmap writes past its quota.  The correct fix
 	 * is to register a page_mkwrite() handler to count the page
 	 * against its quota when it is about to be dirtied.
 	 */
 	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
 	    KUID_TO_SUID(ip->i_uid)) ||
 	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 	    KGID_TO_SGID(ip->i_gid)) ||
 	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
 	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 	    zp->z_projid))) {
 		err = EDQUOT;
 	}
 #endif
 
 	/*
 	 * The ordering here is critical and must adhere to the following
 	 * rules in order to avoid deadlocking in either zfs_read() or
 	 * zfs_free_range() due to a lock inversion.
 	 *
 	 * 1) The page must be unlocked prior to acquiring the range lock.
 	 *    This is critical because zfs_read() calls find_lock_page()
 	 *    which may block on the page lock while holding the range lock.
 	 *
 	 * 2) Before setting or clearing write back on a page the range lock
 	 *    must be held in order to prevent a lock inversion with the
 	 *    zfs_free_range() function.
 	 *
 	 * This presents a problem because upon entering this function the
 	 * page lock is already held.  To safely acquire the range lock the
 	 * page lock must be dropped.  This creates a window where another
 	 * process could truncate, invalidate, dirty, or write out the page.
 	 *
 	 * Therefore, after successfully reacquiring the range and page locks
 	 * the current page state is checked.  In the common case everything
 	 * will be as is expected and it can be written out.  However, if
 	 * the page state has changed it must be handled accordingly.
 	 */
 	/*
 	 * Remember the mapping so a change can be detected after relocking,
 	 * and redirty the page before the page lock is dropped.
 	 */
 	mapping = pp->mapping;
 	redirty_page_for_writepage(wbc, pp);
 	unlock_page(pp);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 	    pgoff, pglen, RL_WRITER);
 	lock_page(pp);
 
 	/* Page mapping changed or it was no longer dirty, we're done */
 	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
 		unlock_page(pp);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/* Another process already started writeback; block on it if required */
 	if (PageWriteback(pp)) {
 		unlock_page(pp);
 		zfs_rangelock_exit(lr);
 
 		if (wbc->sync_mode != WB_SYNC_NONE) {
 			/*
 			 * Speed up any non-sync page writebacks since
 			 * they may take several seconds to complete.
 			 * Refer to the comment in zpl_fsync() for details.
 			 */
 			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
 				zil_commit(zfsvfs->z_log, zp->z_id);
 			}
 
 			if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 				folio_wait_bit(page_folio(pp), PG_writeback);
 #else
 				wait_on_page_bit(pp, PG_writeback);
 #endif
 		}
 
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/* Clear the dirty flag now that the required locks are held */
 	if (!clear_page_dirty_for_io(pp)) {
 		unlock_page(pp);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/*
 	 * Counterpart for redirty_page_for_writepage() above.  This page
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
 	/*
 	 * Async writebacks are counted; the count is dropped by the async
 	 * commit callback, or below if the tx cannot be assigned.
 	 */
 	if (!for_sync)
 		atomic_inc_32(&zp->z_async_writes_cnt);
 	set_page_writeback(pp);
 	unlock_page(pp);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		/*
 		 * The tx could not be assigned: mark the page dirty again,
 		 * end writeback, and undo the async counter.
 		 */
 		dmu_tx_abort(tx);
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
 #else
 		__set_page_dirty_nobuffers(pp);
 #endif
 		ClearPageError(pp);
 		end_page_writeback(pp);
 		if (!for_sync)
 			atomic_dec_32(&zp->z_async_writes_cnt);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (err);
 	}
 
 	/* Write the first pglen bytes of the mapped page at offset pgoff. */
 	va = kmap(pp);
 	ASSERT3U(pglen, <=, PAGE_SIZE);
 	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
 	kunmap(pp);
 
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/* Preserve the mtime and ctime provided by the inode */
 	tmp_ts = zpl_inode_get_mtime(ip);
 	ZFS_TIME_ENCODE(&tmp_ts, mtime);
 	tmp_ts = zpl_inode_get_ctime(ip);
 	ZFS_TIME_ENCODE(&tmp_ts, ctime);
 	zp->z_atime_dirty = B_FALSE;
 	zp->z_seq++;
 
 	/* The result of this update is what the function returns. */
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
 	boolean_t commit = B_FALSE;
 	if (wbc->sync_mode != WB_SYNC_NONE) {
 		/*
 		 * Note that this is rarely called under writepages(), because
 		 * writepages() normally handles the entire commit for
 		 * performance reasons.
 		 */
 		commit = B_TRUE;
 	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
 		/*
 		 * If the caller does not intend to wait synchronously
 		 * for this page writeback to complete and there are active
 		 * synchronous calls on this file, do a commit so that
 		 * the latter don't accidentally end up waiting for
 		 * our writeback to complete. Refer to the comment in
 		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
 		 */
 		commit = B_TRUE;
 	}
 
 	/*
 	 * Log the write, passing the commit callback that matches for_sync
 	 * with the page as its argument.
 	 */
 	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit,
 	    B_FALSE, for_sync ? zfs_putpage_sync_commit_cb :
 	    zfs_putpage_async_commit_cb, pp);
 
 	dmu_tx_commit(tx);
 
 	zfs_rangelock_exit(lr);
 
 	if (commit)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (err);
 }
 
 /*
  * Update the system attributes when the inode has been dirtied.  For the
  * moment we only update the mode, atime, mtime, and ctime.
  */
 int
 zfs_dirty_inode(struct inode *ip, int flags)
 {
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	znode_t		*zp = ITOZ(ip);
 	uint64_t	mode, atime[2], mtime[2], ctime[2];
 	sa_bulk_attr_t	bulk[4];
 	inode_timespec_t ts;
 	dmu_tx_t	*tx;
 	int		nattr = 0;
 	int		rc;
 
 	/* Nothing is written for a read-only filesystem or a snapshot. */
 	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
 		return (0);
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 #ifdef I_DIRTY_TIME
 	/*
 	 * Lazytime semantics (Linux 4.0 and later): update_time passes
 	 * exactly I_DIRTY_TIME only when lazytime is set (I_DIRTY_SYNC is
 	 * also set otherwise).  mtime and ctime are managed within ZFS
 	 * itself, so only atime needs to be flagged dirty here.
 	 */
 	if (flags == I_DIRTY_TIME) {
 		zp->z_atime_dirty = B_TRUE;
 		zfs_exit(zfsvfs, FTAG);
 		return (rc);
 	}
 #endif
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	rc = dmu_tx_assign(tx, TXG_WAIT);
 	if (rc != 0) {
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (rc);
 	}
 
 	mutex_enter(&zp->z_lock);
 	zp->z_atime_dirty = B_FALSE;
 
 	/* Take mode, atime, mtime and ctime from the inode. */
 	mode = ip->i_mode;
 	zp->z_mode = mode;
 	ts = zpl_inode_get_atime(ip);
 	ZFS_TIME_ENCODE(&ts, atime);
 	ts = zpl_inode_get_mtime(ip);
 	ZFS_TIME_ENCODE(&ts, mtime);
 	ts = zpl_inode_get_ctime(ip);
 	ZFS_TIME_ENCODE(&ts, ctime);
 
 	/* Queue the four attributes and write them in one bulk update. */
 	SA_ADD_BULK_ATTR(bulk, nattr, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, nattr, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
 	SA_ADD_BULK_ATTR(bulk, nattr, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, nattr, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 
 	rc = sa_bulk_update(zp->z_sa_hdl, bulk, nattr, tx);
 	mutex_exit(&zp->z_lock);
 
 	dmu_tx_commit(tx);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Flush a dirty atime for the inode's znode, if any, and hand the znode to
  * zfs_zinactive().  Returns early, doing nothing else, when the znode has
  * no SA handle.
  */
 void
 zfs_inactive(struct inode *ip)
 {
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	uint64_t atime[2];
 	int error;
 	int need_unlock = 0;
 
 	/* Only read lock if we haven't already write locked, e.g. rollback */
 	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
 		need_unlock = 1;
 		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	}
 	if (zp->z_sa_hdl == NULL) {
 		if (need_unlock)
 			rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		return;
 	}
 
 	/* Write back a pending atime unless the file has been unlinked. */
 	if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			/* Best effort: the atime update is dropped on failure. */
 			dmu_tx_abort(tx);
 		} else {
 			inode_timespec_t tmp_atime;
 			tmp_atime = zpl_inode_get_atime(ip);
 			ZFS_TIME_ENCODE(&tmp_atime, atime);
 			mutex_enter(&zp->z_lock);
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&atime, sizeof (atime), tx);
 			zp->z_atime_dirty = B_FALSE;
 			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
 
 	zfs_zinactive(zp);
 	/* Drop the teardown lock only if it was taken above. */
 	if (need_unlock)
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 /*
  * Fill pages with data from the disk.
  */
 static int
 zfs_fillpage(struct inode *ip, struct page *pp)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	loff_t i_size = i_size_read(ip);
 	u_offset_t io_off = page_offset(pp);
 	size_t io_len = PAGE_SIZE;
 	void *kaddr;
 	int rc;
 
 	ASSERT3U(io_off, <, i_size);
 
 	/* Clip the read at end of file. */
 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;
 
 	kaddr = kmap(pp);
 	rc = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, kaddr,
 	    DMU_READ_PREFETCH);
 	/* Zero whatever part of the page lies past the bytes read. */
 	if (io_len != PAGE_SIZE)
 		memset((char *)kaddr + io_len, 0, PAGE_SIZE - io_len);
 	kunmap(pp);
 
 	if (rc == 0) {
 		ClearPageError(pp);
 		SetPageUptodate(pp);
 		return (rc);
 	}
 
 	/* convert checksum errors into IO errors */
 	if (rc == ECKSUM)
 		rc = SET_ERROR(EIO);
 
 	SetPageError(pp);
 	ClearPageUptodate(pp);
 	return (rc);
 }
 
 /*
  * Uses zfs_fillpage to read data from the file and fill the page.
  *
  *	IN:	ip	 - inode of file to get data from.
  *		pp	 - page to read
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 int
 zfs_getpage(struct inode *ip, struct page *pp)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	int error;
 	loff_t i_size = i_size_read(ip);
 	u_offset_t io_off = page_offset(pp);
 	size_t io_len = PAGE_SIZE;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	ASSERT3U(io_off, <, i_size);
 
 	/* Clip the locked range at end of file. */
 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;
 
 	/*
 	 * It is important to hold the rangelock here because it is possible
 	 * a Direct I/O write or block clone might be taking place at the same
 	 * time that a page is being faulted in through filemap_fault(). With
 	 * Direct I/O writes and block cloning db->db_data will be set to NULL
 	 * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
 	 * rangelock is not held, then there is a race between faulting in a
 	 * page and writing out a Direct I/O write or block cloning. Without
 	 * the rangelock a NULL pointer dereference can occur in
 	 * dmu_read_impl() for db->db_data during the memcpy operation when
 	 * zfs_fillpage() calls dmu_read().
 	 */
 	zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
 	    io_off, io_len, RL_READER);
 	if (lr == NULL) {
 		/*
 		 * It is important to drop the page lock before grabbing the
 		 * rangelock to avoid another deadlock between here and
 		 * zfs_write() -> update_pages(). update_pages() holds both the
 		 * rangelock and the page lock.
 		 */
 		/*
 		 * An extra page reference is held across the window in which
 		 * the page lock is dropped.
 		 */
 		get_page(pp);
 		unlock_page(pp);
 		lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
 		    io_len, RL_READER);
 		lock_page(pp);
 		put_page(pp);
 	}
 	error = zfs_fillpage(ip, pp);
 	zfs_rangelock_exit(lr);
 
 	/* Read statistics are only updated for successful fills. */
 	if (error == 0)
 		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Check ZFS specific permissions to memory map a section of a file.
  *
  *	IN:	ip	- inode of the file to mmap
  *		off	- file offset
  *		addrp	- start address in memory region
  *		len	- length of memory region
  *		vm_flags- address flags
  *
  *	RETURN:	0 if success
  *		error code if failure
  */
 int
 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
     unsigned long vm_flags)
 {
 	(void) addrp;
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t  *zp = ITOZ(ip);
 	int denied = 0;		/* errno to report, 0 when the map is allowed */
 	int rc;
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
 	    (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 		/* Shared writable mapping of a write-restricted file. */
 		denied = EPERM;
 	} else if ((vm_flags & (VM_READ | VM_EXEC)) &&
 	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
 		/* Readable or executable mapping of a quarantined file. */
 		denied = EACCES;
 	} else if (off < 0 || len > MAXOFFSET_T - off) {
 		/* Negative offset, or the range runs past MAXOFFSET_T. */
 		denied = ENXIO;
 	}
 
 	/* The filesystem is exited before the error is raised. */
 	zfs_exit(zfsvfs, FTAG);
 	if (denied != 0)
 		return (SET_ERROR(denied));
 	return (0);
 }
 
 /*
  * Free or allocate space in a file.  Currently, this function only
  * supports the `F_FREESP' command.  However, this command is somewhat
  * misnamed, as its functionality includes the ability to allocate as
  * well as free space.
  *
  *	IN:	zp	- znode of file to free data in.
  *		cmd	- action to take (only F_FREESP supported).
  *		bfp	- section of file to free/alloc.
  *		flag	- current file open mode flags.
  *		offset	- current file offset.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	zp - ctime|mtime updated
  */
 int
 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr)
 {
 	(void) offset;
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	int		rejected = 0;
 	int		rc;
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Up-front checks, first match wins:
 	 *  - only the F_FREESP command is supported;
 	 *  - callers might not be able to detect properly that we are
 	 *    read-only, so that is checked explicitly;
 	 *  - a negative length is invalid.
 	 */
 	if (cmd != F_FREESP)
 		rejected = EINVAL;
 	else if (zfs_is_readonly(zfsvfs))
 		rejected = EROFS;
 	else if (bfp->l_len < 0)
 		rejected = EINVAL;
 
 	if (rejected != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(rejected));
 	}
 
 	/*
 	 * Solaris performs no permission check because zfs_space() is only
 	 * reachable there through an opened file handle.  On Linux
 	 * truncate_range() operates directly on inodes, so access rights
 	 * must be verified here.
 	 */
 	rc = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, zfs_init_idmap);
 	if (rc == 0) {
 		uint64_t start = bfp->l_start;
 		uint64_t length = bfp->l_len; /* 0: from start to end of file */
 
 		rc = zfs_freesp(zp, start, length, flag, TRUE);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Fill *fidp with a short file identifier for the inode: the object number
  * followed by the low 32 bits of the generation number, each stored least
  * significant byte first.  If the caller's buffer is shorter than
  * SHORT_FID_LEN, the required length is stored in fid_len and ENOSPC is
  * returned.
  */
 int
 zfs_fid(struct inode *ip, fid_t *fidp)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	uint64_t	object = zp->z_id;
 	uint64_t	gen64;
 	uint32_t	gen;
 	zfid_short_t	*zfid;
 	int		i, error;
 
 	error = zfs_enter(zfsvfs, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (fidp->fid_len < SHORT_FID_LEN) {
 		fidp->fid_len = SHORT_FID_LEN;
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	/* Validate the znode, then fetch its generation number. */
 	error = zfs_verify_zp(zp);
 	if (error == 0) {
 		error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 		    &gen64, sizeof (uint64_t));
 	}
 	if (error != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	gen = (uint32_t)gen64;
 	if (gen == 0)
 		gen = 1;
 
 	zfid = (zfid_short_t *)fidp;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (i = 0; i < (int)sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	for (i = 0; i < (int)sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 #if defined(_KERNEL)
 /*
  * Kernel builds only: export the entry points below, then register
  * zfs_delete_blocks as a module parameter.
  */
 EXPORT_SYMBOL(zfs_open);
 EXPORT_SYMBOL(zfs_close);
 EXPORT_SYMBOL(zfs_lookup);
 EXPORT_SYMBOL(zfs_create);
 EXPORT_SYMBOL(zfs_tmpfile);
 EXPORT_SYMBOL(zfs_remove);
 EXPORT_SYMBOL(zfs_mkdir);
 EXPORT_SYMBOL(zfs_rmdir);
 EXPORT_SYMBOL(zfs_readdir);
 EXPORT_SYMBOL(zfs_getattr_fast);
 EXPORT_SYMBOL(zfs_setattr);
 EXPORT_SYMBOL(zfs_rename);
 EXPORT_SYMBOL(zfs_symlink);
 EXPORT_SYMBOL(zfs_readlink);
 EXPORT_SYMBOL(zfs_link);
 EXPORT_SYMBOL(zfs_inactive);
 EXPORT_SYMBOL(zfs_space);
 EXPORT_SYMBOL(zfs_fid);
 EXPORT_SYMBOL(zfs_getpage);
 EXPORT_SYMBOL(zfs_putpage);
 EXPORT_SYMBOL(zfs_dirty_inode);
 EXPORT_SYMBOL(zfs_map);
 
-/* CSTYLED */
 module_param(zfs_delete_blocks, ulong, 0644);
 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
 #endif
diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c
index bbaca2f58394..aff7b1f4dac1 100644
--- a/module/os/linux/zfs/zfs_znode_os.c
+++ b/module/os/linux/zfs/zfs_znode_os.c
@@ -1,1975 +1,1974 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/sysmacros.h>
 #include <sys/mntent.h>
 #include <sys/u8_textprep.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/errno.h>
 #include <sys/atomic.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zpl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/zfs_refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_stat.h>
 #include <linux/mm_compat.h>
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /*
  * kmem caches backing znode_t and znode_hold_t allocations.  Both are
  * created in zfs_znode_init() and destroyed in zfs_znode_fini().
  */
 static kmem_cache_t *znode_cache = NULL;
 static kmem_cache_t *znode_hold_cache = NULL;
 /*
  * NOTE(review): presumably the number of entries in the per-filesystem
  * z_hold_locks / z_hold_trees arrays -- confirm against the zfsvfs setup.
  */
 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
 
 /*
  * This is used by the test suite so that it can delay znodes from being
  * freed in order to inspect the unlinked set.
  */
 static int zfs_unlink_suspend_progress = 0;
 
 /*
  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
  * z_rangelock. It will modify the offset and length of the lock to reflect
  * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
  * called with the rangelock_t's rl_lock held, which avoids races.
  */
 static void
 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
 {
 	znode_t *zp = arg;
 
 	/*
 	 * An append request becomes a writer lock that starts at the
 	 * current end of file.
 	 */
 	if (new->lr_type == RL_APPEND) {
 		new->lr_offset = zp->z_size;
 		new->lr_type = RL_WRITER;
 	}
 
 	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
 
 	/* The resulting file still fits in the current block: done. */
 	if (end_size <= zp->z_blksz)
 		return;
 
 	/*
 	 * A power-of-two block size that is already at least the
 	 * filesystem maximum does not cause the whole file to be locked.
 	 */
 	if (ISP2(zp->z_blksz) && zp->z_blksz >= ZTOZSB(zp)->z_max_blksz)
 		return;
 
 	/* The block size needs to grow, so lock the whole file range. */
 	new->lr_offset = 0;
 	new->lr_length = UINT64_MAX;
 }
 
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 {
 	(void) arg, (void) kmflags;
 	znode_t *zp = buf;
 
 	inode_init_once(ZTOI(zp));
 	list_link_init(&zp->z_link_node);
 
 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
 
 	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
 
 	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
 	zp->z_sync_writes_cnt = 0;
 	zp->z_async_writes_cnt = 0;
 
 	return (0);
 }
 
 static void
 zfs_znode_cache_destructor(void *buf, void *arg)
 {
 	(void) arg;
 	znode_t *zp = buf;
 
 	ASSERT(!list_link_active(&zp->z_link_node));
 	mutex_destroy(&zp->z_lock);
 	rw_destroy(&zp->z_parent_lock);
 	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	rw_destroy(&zp->z_xattr_lock);
 	zfs_rangelock_fini(&zp->z_rangelock);
 
 	ASSERT3P(zp->z_dirlocks, ==, NULL);
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
 
 	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
 	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 }
 
 static int
 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
 {
 	(void) arg, (void) kmflags;
 	znode_hold_t *zh = buf;
 
 	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
 	zh->zh_refcount = 0;
 
 	return (0);
 }
 
 static void
 zfs_znode_hold_cache_destructor(void *buf, void *arg)
 {
 	(void) arg;
 	znode_hold_t *zh = buf;
 
 	mutex_destroy(&zh->zh_lock);
 }
 
 /*
  * Create the two kmem caches used by this file: one for znode_t objects and
  * one for znode_hold_t objects.  Both cache pointers are asserted to be
  * NULL on entry.
  */
 void
 zfs_znode_init(void)
 {
 	/*
 	 * Initialize zcache.  The KMC_SLAB hint is used in order that it be
 	 * backed by kmalloc() when on the Linux slab in order that any
 	 * wait_on_bit() operations on the related inode operate properly.
 	 */
 	ASSERT(znode_cache == NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL,
 	    KMC_SLAB | KMC_RECLAIMABLE);
 
 	/* The hold cache is created with no cache flags. */
 	ASSERT(znode_hold_cache == NULL);
 	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
 	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
 	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the znode and znode hold kmem caches if they exist and leave
  * both cache pointers NULL.
  */
 void
 zfs_znode_fini(void)
 {
 	if (znode_cache != NULL) {
 		kmem_cache_destroy(znode_cache);
 		znode_cache = NULL;
 	}
 
 	if (znode_hold_cache != NULL) {
 		kmem_cache_destroy(znode_hold_cache);
 		znode_hold_cache = NULL;
 	}
 }
 
 /*
  * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
  * serialize access to a znode and its SA buffer while the object is being
  * created or destroyed.  This kind of locking would normally reside in the
  * znode itself but in this case that's impossible because the znode and SA
  * buffer may not yet exist.  Therefore the locking is handled externally
  * with an array of mutexes and AVL trees which contain per-object locks.
  *
  * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
  * into the correct AVL tree and finally the per-object lock is held.  In
  * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
  * released, removed from the AVL tree and destroyed if there are no waiters.
  *
  * This scheme has two important properties:
  *
  * 1) No memory allocations are performed while holding one of the z_hold_locks.
  *    This ensures evict(), which can be called from direct memory reclaim, will
  *    never block waiting on a z_hold_locks which just happens to have hashed
  *    to the same index.
  *
  * 2) All locks used to serialize access to an object are per-object and never
  *    shared.  This minimizes lock contention without creating a large number
  *    of dedicated locks.
  *
  * On the downside it does require znode_hold_t structures to be frequently
  * allocated and freed.  However, because these are backed by a kmem cache
  * and are very short lived this cost is minimal.
  */
 int
 zfs_znode_hold_compare(const void *a, const void *b)
 {
 	/* Holds are ordered by the object number they refer to. */
 	const znode_hold_t *lhs = a;
 	const znode_hold_t *rhs = b;
 
 	return (TREE_CMP(lhs->zh_obj, rhs->zh_obj));
 }
 
 /*
  * Report whether a hold entry for 'obj' is present in its hash bucket's
  * tree and MUTEX_HELD() is true for that entry's zh_lock.  The bucket
  * lock is taken only around the lookup and the check.
  */
 static boolean_t __maybe_unused
 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
 {
 	int bucket = ZFS_OBJ_HASH(zfsvfs, obj);
 	znode_hold_t key, *entry;
 	boolean_t result = B_FALSE;
 
 	key.zh_obj = obj;
 
 	mutex_enter(&zfsvfs->z_hold_locks[bucket]);
 	entry = avl_find(&zfsvfs->z_hold_trees[bucket], &key, NULL);
 	if (entry && MUTEX_HELD(&entry->zh_lock))
 		result = B_TRUE;
 	mutex_exit(&zfsvfs->z_hold_locks[bucket]);
 
 	return (result);
 }
 
 /*
  * Take the per-object hold for 'obj' and return it with zh_lock held.
  *
  * A candidate entry is allocated before the bucket lock is taken, so no
  * allocation happens under z_hold_locks.  If the object already has an
  * entry in the tree that one is used (its reference count is bumped) and
  * the unused candidate is freed after the bucket lock is dropped.
  * Release the hold with zfs_znode_hold_exit().
  */
 znode_hold_t *
 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
 {
 	int bucket = ZFS_OBJ_HASH(zfsvfs, obj);
 	znode_hold_t *zh, *spare, key;
 
 	spare = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
 	key.zh_obj = obj;
 
 	mutex_enter(&zfsvfs->z_hold_locks[bucket]);
 	zh = avl_find(&zfsvfs->z_hold_trees[bucket], &key, NULL);
 	if (likely(zh == NULL)) {
 		/* First holder: publish the freshly allocated entry. */
 		zh = spare;
 		zh->zh_obj = obj;
 		avl_add(&zfsvfs->z_hold_trees[bucket], zh);
 	} else {
 		ASSERT3U(zh->zh_obj, ==, obj);
 	}
 	zh->zh_refcount++;
 	ASSERT3S(zh->zh_refcount, >, 0);
 	mutex_exit(&zfsvfs->z_hold_locks[bucket]);
 
 	/* An existing entry was found, so the spare was never inserted. */
 	if (zh != spare)
 		kmem_cache_free(znode_hold_cache, spare);
 
 	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
 	mutex_enter(&zh->zh_lock);
 
 	return (zh);
 }
 
 /*
  * Release a hold obtained from zfs_znode_hold_enter().  zh_lock is dropped
  * first; then, under the bucket lock, the reference count is decremented
  * and the entry is removed from the tree when it reaches zero.  A removed
  * entry is freed only after the bucket lock has been released.
  */
 void
 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
 {
 	int bucket = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
 	boolean_t last_ref;
 
 	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
 	mutex_exit(&zh->zh_lock);
 
 	mutex_enter(&zfsvfs->z_hold_locks[bucket]);
 	ASSERT3S(zh->zh_refcount, >, 0);
 	zh->zh_refcount--;
 	last_ref = (zh->zh_refcount == 0) ? B_TRUE : B_FALSE;
 	if (last_ref == B_TRUE)
 		avl_remove(&zfsvfs->z_hold_trees[bucket], zh);
 	mutex_exit(&zfsvfs->z_hold_locks[bucket]);
 
 	if (last_ref == B_TRUE)
 		kmem_cache_free(znode_hold_cache, zh);
 }
 
 /*
  * Convert a 64-bit device value to dev_t.  No translation is applied:
  * the value is returned as-is, subject only to the conversion to dev_t.
  */
 dev_t
 zfs_cmpldev(uint64_t dev)
 {
 	dev_t result = dev;
 
 	return (result);
 }
 
 /*
  * Give the znode its SA handle, under z_lock.
  *
  * When the caller supplies 'sa_hdl' it is adopted and its user pointer is
  * set to the znode; otherwise a shared handle is obtained from 'db'
  * (failure there is fatal via VERIFY).  z_is_sa records whether the
  * object's bonus type is DMU_OT_SA.  The znode must not already have a
  * handle or a cached ACL, and the per-object hold is asserted.
  */
 static void
 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 {
 	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
 
 	mutex_enter(&zp->z_lock);
 
 	ASSERT(zp->z_sa_hdl == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 	if (sa_hdl != NULL) {
 		zp->z_sa_hdl = sa_hdl;
 		sa_set_userp(sa_hdl, zp);
 	} else {
 		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 		    SA_HDL_SHARED, &zp->z_sa_hdl));
 	}
 
 	if (obj_type == DMU_OT_SA)
 		zp->z_is_sa = B_TRUE;
 	else
 		zp->z_is_sa = B_FALSE;
 
 	mutex_exit(&zp->z_lock);
 }
 
 /*
  * Destroy the znode's SA handle and clear the znode's pointer to it.
  *
  * Debug builds assert that the caller either holds the per-object znode
  * hold for zp->z_id or holds z_teardown_inactive_lock as writer.
  */
 void
 zfs_znode_dmu_fini(znode_t *zp)
 {
 	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
 	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
 
 	sa_handle_destroy(zp->z_sa_hdl);
 	zp->z_sa_hdl = NULL;
 }
 
 /*
  * Called by new_inode() to allocate a new inode.
  *
  * A znode is taken from znode_cache (sleeping allocation) and the inode
  * belonging to it is handed back through 'ip'.  'sb' is not consulted.
  * Always returns 0.
  */
 int
 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
 {
 	znode_t *new_zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 
 	*ip = ZTOI(new_zp);
 	return (0);
 }
 
 /*
  * Called in multiple places when an inode should be destroyed.
  *
  * The znode is unlinked from the filesystem's z_all_znodes list if it is
  * still on it, any cached ACL and cached xattr nvlist are released, and
  * the znode is returned to znode_cache.
  */
 void
 zfs_inode_destroy(struct inode *ip)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zsb = ZTOZSB(zp);
 
 	mutex_enter(&zsb->z_znodes_lock);
 	if (list_link_active(&zp->z_link_node))
 		list_remove(&zsb->z_all_znodes, zp);
 	mutex_exit(&zsb->z_znodes_lock);
 
 	if (zp->z_acl_cached != NULL) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 
 	if (zp->z_xattr_cached != NULL) {
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
 
 	kmem_cache_free(znode_cache, zp);
 }
 
 /*
  * Install the inode/file/address-space operation tables that match the
  * file type encoded in ip->i_mode.
  *
  * An unrecognized type is reported through zfs_panic_recover(); if that
  * returns, the inode is rewritten as a regular file with mode 0644 so
  * that processing can continue.
  */
 static void
 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
 {
 	/* Stays 0 for FIFOs and sockets, which skip the rdev lookup. */
 	uint64_t rdev = 0;
 
 	switch (ip->i_mode & S_IFMT) {
 	case S_IFREG:
 		ip->i_op = &zpl_inode_operations;
 		ip->i_fop = &zpl_file_operations;
 		ip->i_mapping->a_ops = &zpl_address_space_operations;
 		break;
 
 	case S_IFDIR:
 		ip->i_op = &zpl_dir_inode_operations;
 		ip->i_fop = &zpl_dir_file_operations;
 		ITOZ(ip)->z_zn_prefetch = B_TRUE;
 		break;
 
 	case S_IFLNK:
 		ip->i_op = &zpl_symlink_inode_operations;
 		break;
 
 	/*
 	 * rdev is stored in an SA only for device files.  The lookup result
 	 * is deliberately ignored; on failure rdev keeps its initial 0.
 	 */
 	case S_IFCHR:
 	case S_IFBLK:
 		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
 		    sizeof (rdev));
 		zfs_fallthrough;
 	case S_IFIFO:
 	case S_IFSOCK:
 		init_special_inode(ip, ip->i_mode, rdev);
 		ip->i_op = &zpl_special_inode_operations;
 		break;
 
 	default:
 		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
 		    (u_longlong_t)ip->i_ino, ip->i_mode);
 
 		/* Assume the inode is a file and attempt to continue */
 		ip->i_mode = S_IFREG | 0644;
 		ip->i_op = &zpl_inode_operations;
 		ip->i_fop = &zpl_file_operations;
 		ip->i_mapping->a_ops = &zpl_address_space_operations;
 		break;
 	}
 }
 
 /*
  * Mirror the znode's immutable and append-only attributes into the inode
  * flags.
  *
  * Linux and Solaris have different sets of file attributes, so only the
  * two that exist on both sides are translated; inode_set_flags() is told
  * to touch nothing but S_IMMUTABLE and S_APPEND.
  */
 static void
 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
 {
 	unsigned int vfs_flags = 0;
 
 	vfs_flags |= (zp->z_pflags & ZFS_IMMUTABLE) ? S_IMMUTABLE : 0;
 	vfs_flags |= (zp->z_pflags & ZFS_APPENDONLY) ? S_APPEND : 0;
 
 	inode_set_flags(ip, vfs_flags, S_IMMUTABLE|S_APPEND);
 }
 
 /*
  * Update the embedded inode given the znode.
  *
  * Copies the mode, the block count reported by the DMU for the object,
  * and the file size into the inode while holding the inode's i_lock.
  * .zfs control nodes are left untouched.
  */
 void
 zfs_znode_update_vfs(znode_t *zp)
 {
 	struct inode	*ip;
 	dmu_buf_t	*db;
 	uint32_t	blksz;
 	u_longlong_t	nblocks;
 
 	ASSERT(zp != NULL);
 	ip = ZTOI(zp);
 
 	/* Skip .zfs control nodes which do not exist on disk. */
 	if (zfsctl_is_node(ip))
 		return;
 
 	/* Only the block count is consumed; blksz is a required out-param. */
 	db = sa_get_db(zp->z_sa_hdl);
 	dmu_object_size_from_db(db, &blksz, &nblocks);
 
 	spin_lock(&ip->i_lock);
 	ip->i_mode = zp->z_mode;
 	ip->i_blocks = nblocks;
 	i_size_write(ip, zp->z_size);
 	spin_unlock(&ip->i_lock);
 }
 
 
 /*
  * Construct a znode+inode and initialize.
  *
  * This does not do a call to dmu_set_user() that is
  * up to the caller to do, in case you don't want to
  * return the znode
  *
  *	IN:	zfsvfs	 - filesystem the object belongs to
  *		db	 - held bonus buffer of the object
  *		blksz	 - value recorded in z_blksz
  *		obj_type - bonus type of the object (DMU_OT_SA or
  *			   DMU_OT_ZNODE)
  *		hdl	 - existing SA handle to adopt, or NULL to have
  *			   one created from db
  *
  *	RETURN:	the new znode, or NULL if the inode could not be allocated
  *		or the object's attributes could not be read / are invalid
  *		(generation of 0)
  *
  * The caller must hold the per-object znode hold (asserted in
  * zfs_znode_sa_init()).
  *
  * Failure and the hold on db: when hdl is NULL the caller's hold on db is
  * always given up before NULL is returned -- explicitly if the inode
  * allocation fails, otherwise through sa_handle_destroy() of the handle
  * created from db.  When hdl is non-NULL neither the handle nor db is
  * released, so the caller may simply retry.
  */
 static znode_t *
 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     dmu_object_type_t obj_type, sa_handle_t *hdl)
 {
 	znode_t	*zp;
 	struct inode *ip;
 	uint64_t mode;
 	uint64_t parent;
 	uint64_t tmp_gen;
 	uint64_t links;
 	uint64_t z_uid, z_gid;
 	uint64_t atime[2], mtime[2], ctime[2], btime[2];
 	inode_timespec_t tmp_ts;
 	uint64_t projid = ZFS_DEFAULT_PROJID;
 	sa_bulk_attr_t bulk[12];
 	int count = 0;
 
 	ASSERT(zfsvfs != NULL);
 
 	ip = new_inode(zfsvfs->z_sb);
 	if (ip == NULL) {
 		/*
 		 * No SA handle has taken over db yet.  A caller that passed
 		 * hdl == NULL expects a failed allocation to drop its hold
 		 * on the bonus buffer, so release it here; otherwise the
 		 * hold would be leaked.
 		 */
 		if (hdl == NULL)
 			sa_buf_rele(db, NULL);
 		return (NULL);
 	}
 
 	zp = ITOZ(ip);
 	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
 	zp->z_unlinked = B_FALSE;
 	zp->z_atime_dirty = B_FALSE;
 	zp->z_is_ctldir = B_FALSE;
 	zp->z_suspended = B_FALSE;
 	zp->z_sa_hdl = NULL;
 	zp->z_mapcnt = 0;
 	zp->z_id = db->db_object;
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
 	zp->z_sync_writes_cnt = 0;
 	zp->z_async_writes_cnt = 0;
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
 	/* Exactly 12 attributes are queued; bulk[] is sized to match. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
 	    &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
 
 	/*
 	 * Fail if the attributes cannot be read, if the generation is 0,
 	 * or if a project ID is flagged as present but cannot be read.
 	 */
 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
 	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
 	    (zp->z_pflags & ZFS_PROJID) &&
 	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
 		/* Only destroy a handle that was created in here. */
 		if (hdl == NULL)
 			sa_handle_destroy(zp->z_sa_hdl);
 		zp->z_sa_hdl = NULL;
 		goto error;
 	}
 
 	zp->z_projid = projid;
 	zp->z_mode = ip->i_mode = mode;
 	ip->i_generation = (uint32_t)tmp_gen;
 	ip->i_blkbits = SPA_MINBLOCKSHIFT;
 	set_nlink(ip, (uint32_t)links);
 	zfs_uid_write(ip, z_uid);
 	zfs_gid_write(ip, z_gid);
 	zfs_set_inode_flags(zp, ip);
 
 	/* Cache the xattr parent id */
 	if (zp->z_pflags & ZFS_XATTR)
 		zp->z_xattr_parent = parent;
 
 	ZFS_TIME_DECODE(&tmp_ts, atime);
 	zpl_inode_set_atime_to_ts(ip, tmp_ts);
 	ZFS_TIME_DECODE(&tmp_ts, mtime);
 	zpl_inode_set_mtime_to_ts(ip, tmp_ts);
 	ZFS_TIME_DECODE(&tmp_ts, ctime);
 	zpl_inode_set_ctime_to_ts(ip, tmp_ts);
 	ZFS_TIME_DECODE(&zp->z_btime, btime);
 
 	ip->i_ino = zp->z_id;
 	zfs_znode_update_vfs(zp);
 	zfs_inode_set_ops(zfsvfs, ip);
 
 	/*
 	 * The only way insert_inode_locked() can fail is if the ip->i_ino
 	 * number is already hashed for this super block.  This can never
 	 * happen because the inode numbers map 1:1 with the object numbers.
 	 *
 	 * Exceptions include rolling back a mounted file system, either
 	 * from the zfs rollback or zfs recv command.
 	 *
 	 * Active inodes are unhashed during the rollback, but since zrele
 	 * can happen asynchronously, we can't guarantee they've been
 	 * unhashed.  This can cause hash collisions in unlinked drain
 	 * processing so do not hash unlinked znodes.
 	 */
 	if (links > 0)
 		VERIFY3S(insert_inode_locked(ip), ==, 0);
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/* Matches the insert_inode_locked() above. */
 	if (links > 0)
 		unlock_new_inode(ip);
 	return (zp);
 
 error:
 	iput(ip);
 	return (NULL);
 }
 
 /*
  * Safely mark an inode dirty.  Nothing is done when the file system is
  * read-only or when its objset is a snapshot, since such inodes may not
  * be dirtied.
  */
 void
 zfs_mark_inode_dirty(struct inode *ip)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 
 	if (zfs_is_readonly(zfsvfs))
 		return;
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os))
 		return;
 
 	mark_inode_dirty(ip);
 }
 
 /*
  * Placeholder values whose addresses zfs_mknode() passes for the
  * SA_ZPL_XATTR, SA_ZPL_PAD and SA_ZPL_ZNODE_ACL attributes when it lays
  * out a DMU_OT_ZNODE-format object.  As file-scope statics without an
  * initializer they start out zeroed; zfs_mknode() itself never writes to
  * them.
  */
 static uint64_t empty_xattr;
 static uint64_t pad[4];
 static zfs_acl_phys_t acl_phys;
 /*
  * Create a new DMU object to hold a zfs znode.
  *
  *	IN:	dzp	- parent directory for new znode
  *		vap	- file attributes for new znode
  *		tx	- dmu transaction id for zap operations
  *		cr	- credentials of caller
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
  *			  IS_TMPFILE	- new object is of O_TMPFILE
  *			  IS_XATTR	- new object is an attribute
  *		acl_ids	- ACL related attributes
  *
  *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
  *
  * This function has no error return: failures of the underlying DMU/SA
  * calls trip VERIFY, and znode allocation is retried until it succeeds.
  * The per-object znode hold is taken once the object number is known and
  * is held until just before returning.
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 {
 	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
 	uint64_t	mode, size, links, parent, pflags;
 	uint64_t	projid = ZFS_DEFAULT_PROJID;
 	uint64_t	rdev = 0;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	dmu_buf_t	*db;
 	inode_timespec_t now;
 	uint64_t	gen, obj;
 	int		bonuslen;
 	int		dnodesize;
 	sa_handle_t	*sa_hdl;
 	dmu_object_type_t obj_type;
 	sa_bulk_attr_t	*sa_attrs;
 	int		cnt = 0;
 	zfs_acl_locator_cb_t locate = { 0 };
 	znode_hold_t	*zh;
 
 	/*
 	 * During replay the object number, timestamp, generation and dnode
 	 * size are taken from fields of vap rather than generated fresh.
 	 */
 	if (zfsvfs->z_replay) {
 		obj = vap->va_nodeid;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
 		gen = vap->va_nblocks;		/* ditto */
 		dnodesize = vap->va_fsid;	/* ditto */
 	} else {
 		obj = 0;
 		gethrestime(&now);
 		gen = dmu_tx_get_txg(tx);
 		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
 	}
 
 	if (dnodesize == 0)
 		dnodesize = DNODE_MIN_SIZE;
 
 	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
 
 	bonuslen = (obj_type == DMU_OT_SA) ?
 	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
 
 	/*
 	 * Create a new DMU object.
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
 	 * be needed to allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
 	if (S_ISDIR(vap->va_mode)) {
 		/* Directories are ZAP objects; replay claims a fixed number. */
 		if (zfsvfs->z_replay) {
 			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    obj_type, bonuslen, dnodesize, tx));
 		} else {
 			obj = zap_create_norm_dnsize(zfsvfs->z_os,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    obj_type, bonuslen, dnodesize, tx);
 		}
 	} else {
 		if (zfsvfs->z_replay) {
 			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    obj_type, bonuslen, dnodesize, tx));
 		} else {
 			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    obj_type, bonuslen, dnodesize, tx);
 		}
 	}
 
 	zh = zfs_znode_hold_enter(zfsvfs, obj);
 	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
 	/*
 	 * If this is the root, fix up the half-initialized parent pointer
 	 * to reference the just-allocated physical data area.
 	 */
 	if (flag & IS_ROOT_NODE) {
 		dzp->z_id = obj;
 	}
 
 	/*
 	 * If parent is an xattr, so am I.
 	 */
 	if (dzp->z_pflags & ZFS_XATTR) {
 		flag |= IS_XATTR;
 	}
 
 	if (zfsvfs->z_use_fuids)
 		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
 	else
 		pflags = 0;
 
 	if (S_ISDIR(vap->va_mode)) {
 		size = 2;		/* contents ("." and "..") */
 		links = 2;
 	} else {
 		size = 0;
 		/* An O_TMPFILE object starts with no links. */
 		links = (flag & IS_TMPFILE) ? 0 : 1;
 	}
 
 	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
 		rdev = vap->va_rdev;
 
 	parent = dzp->z_id;
 	mode = acl_ids->z_mode;
 	if (flag & IS_XATTR)
 		pflags |= ZFS_XATTR;
 
 	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
 		/*
 		 * With ZFS_PROJID flag, we can easily know whether there is
 		 * project ID stored on disk or not. See zfs_space_delta_cb().
 		 */
 		if (obj_type != DMU_OT_ZNODE &&
 		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
 			pflags |= ZFS_PROJID;
 
 		/*
 		 * Inherit project ID from parent if required.
 		 */
 		projid = zfs_inherit_projid(dzp);
 		if (dzp->z_pflags & ZFS_PROJINHERIT)
 			pflags |= ZFS_PROJINHERIT;
 	}
 
 	/*
 	 * No execs denied will be determined when zfs_mode_compute() is called.
 	 */
 	pflags |= acl_ids->z_aclp->z_hints &
 	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
 	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 
 	ZFS_TIME_ENCODE(&now, crtime);
 	ZFS_TIME_ENCODE(&now, ctime);
 
 	/* Caller-supplied atime/mtime take precedence over "now". */
 	if (vap->va_mask & ATTR_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, atime);
 	} else {
 		ZFS_TIME_ENCODE(&now, atime);
 	}
 
 	if (vap->va_mask & ATTR_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 	} else {
 		ZFS_TIME_ENCODE(&now, mtime);
 	}
 
 	/* Now add in all of the "SA" attributes */
 	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
 	    &sa_hdl));
 
 	/*
 	 * Setup the array of attributes to be replaced/set on the new file
 	 *
 	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
 	 * in the old znode_phys_t format.  Don't change this ordering
 	 */
 	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
 
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 		    NULL, &atime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 		    NULL, &mtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 		    NULL, &crtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 		    NULL, &gen, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 		    NULL, &mode, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 		    NULL, &size, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 		    NULL, &parent, 8);
 	} else {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 		    NULL, &mode, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 		    NULL, &size, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 		    NULL, &gen, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
 		    NULL, &acl_ids->z_fuid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
 		    NULL, &acl_ids->z_fgid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 		    NULL, &parent, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &pflags, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 		    NULL, &atime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 		    NULL, &mtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 		    NULL, &crtime, 16);
 	}
 
 	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
 
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
 		    &empty_xattr, 8);
 	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
 	    pflags & ZFS_PROJID) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
 		    NULL, &projid, 8);
 	}
 	/* The old layout always carries rdev; SA only for device nodes. */
 	if (obj_type == DMU_OT_ZNODE ||
 	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
 		    NULL, &rdev, 8);
 	}
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &pflags, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
 		    &acl_ids->z_fuid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
 		    &acl_ids->z_fgid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
 		    sizeof (uint64_t) * 4);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
 		    &acl_phys, sizeof (zfs_acl_phys_t));
 	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
 		    &acl_ids->z_aclp->z_acl_count, 8);
 		locate.cb_aclp = acl_ids->z_aclp;
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
 		    zfs_acl_data_locator, &locate,
 		    acl_ids->z_aclp->z_acl_bytes);
 		/*
 		 * mode and pflags were queued by address above, so the values
 		 * computed here are what sa_replace_all_by_template() sees.
 		 */
 		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
 		    acl_ids->z_fuid, acl_ids->z_fgid);
 	}
 
 	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
 
 	if (!(flag & IS_ROOT_NODE)) {
 		/*
 		 * The call to zfs_znode_alloc() may fail if memory is low
 		 * via the call path: alloc_inode() -> inode_init_always() ->
 		 * security_inode_alloc() -> inode_alloc_security().  Since
 		 * the existing code is written such that zfs_mknode() can
 		 * not fail retry until sufficient memory has been reclaimed.
 		 */
 		do {
 			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
 		} while (*zpp == NULL);
 
 		VERIFY(*zpp != NULL);
 		VERIFY(dzp != NULL);
 	} else {
 		/*
 		 * If we are creating the root node, the "parent" we
 		 * passed in is the znode for the root.
 		 */
 		*zpp = dzp;
 
 		(*zpp)->z_sa_hdl = sa_hdl;
 	}
 
 	(*zpp)->z_pflags = pflags;
 	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
 	(*zpp)->z_dnodesize = dnodesize;
 	(*zpp)->z_projid = projid;
 
 	/* ACLs not stored through the SA_ZPL_DACL_* attributes above. */
 	if (obj_type == DMU_OT_ZNODE ||
 	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 	}
 	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
 	zfs_znode_hold_exit(zfsvfs, zh);
 }
 
 /*
  * Update in-core attributes.  It is assumed the caller will be doing an
  * sa_bulk_update to push the changes out.
  *
  *	IN:	zp	- znode whose z_pflags are updated
  *		xvap	- requested optional attributes; each attribute that
  *			  is processed is also marked as returned in xvap
  *		tx	- transaction handed to the per-attribute updates
  *
  * The inode flags are refreshed at the end only if XAT_IMMUTABLE or
  * XAT_APPENDONLY was requested, these being the two attributes that
  * zfs_set_inode_flags() mirrors into the inode.
  */
 void
 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 {
 	xoptattr_t *xoap;
 	boolean_t update_inode = B_FALSE;
 
 	xoap = xva_getxoptattr(xvap);
 	ASSERT(xoap);
 
 	/*
 	 * Unlike the flag attributes below, the creation time is written
 	 * with sa_update() right here; its return value is ignored.
 	 */
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 		uint64_t times[2];
 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
 		    &times, sizeof (times), tx);
 		XVA_SET_RTN(xvap, XAT_CREATETIME);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_READONLY);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_HIDDEN);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_SYSTEM);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 
 		update_inode = B_TRUE;
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
 
 		update_inode = B_TRUE;
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_NODUMP);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_OPAQUE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
 		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
 		zfs_sa_set_scanstamp(zp, xvap, tx);
 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_REPARSE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_OFFLINE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_SPARSE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
 		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
 		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
 	}
 
 	if (update_inode)
 		zfs_set_inode_flags(zp, ZTOI(zp));
 }
 
 /*
  * Look up the znode for object 'obj_num', instantiating it if the object
  * has no SA handle attached yet.
  *
  *	IN:	zfsvfs	- filesystem to look in
  *		obj_num	- object number
  *
  *	OUT:	zpp	- the znode on success (with an inode reference taken
  *			  via igrab() when an existing znode was found),
  *			  NULL otherwise
  *
  *	RETURN:	0 on success; the sa_buf_hold() error if the bonus buffer
  *		cannot be held; EINVAL if the bonus buffer is neither a
  *		DMU_OT_SA nor a sufficiently large DMU_OT_ZNODE; ENOENT if
  *		the existing znode is unlinked and being evicted, or if a
  *		new znode could not be constructed.
  *
  * The per-object znode hold is held across each lookup attempt.
  */
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
 	dmu_object_info_t doi;
 	dmu_buf_t	*db;
 	znode_t		*zp;
 	znode_hold_t	*zh;
 	int err;
 	sa_handle_t	*hdl;
 
 	*zpp = NULL;
 
 again:
 	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
 
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		zfs_znode_hold_exit(zfsvfs, zh);
 		return (err);
 	}
 
 	/* Reject objects whose bonus buffer cannot describe a znode. */
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		zfs_znode_hold_exit(zfsvfs, zh);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* A user already attached to the buffer means a znode exists. */
 	hdl = dmu_buf_get_user(db);
 	if (hdl != NULL) {
 		zp = sa_get_userdata(hdl);
 
 
 		/*
 		 * Since "SA" does immediate eviction we
 		 * should never find a sa handle that doesn't
 		 * know about the znode.
 		 */
 
 		ASSERT3P(zp, !=, NULL);
 
 		mutex_enter(&zp->z_lock);
 		ASSERT3U(zp->z_id, ==, obj_num);
 		/*
 		 * If zp->z_unlinked is set, the znode is already marked
 		 * for deletion and should not be discovered. Check this
 		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
 		 *
 		 * If igrab() returns NULL the VFS has independently
 		 * determined the inode should be evicted and has
 		 * called iput_final() to start the eviction process.
 		 * The SA handle is still valid but because the VFS
 		 * requires that the eviction succeed we must drop
 		 * our locks and references to allow the eviction to
 		 * complete.  The zfs_zget() may then be retried.
 		 *
 		 * This unlikely case could be optimized by registering
 		 * a sops->drop_inode() callback.  The callback would
 		 * need to detect the active SA hold thereby informing
 		 * the VFS that this inode should not be evicted.
 		 */
 		if (igrab(ZTOI(zp)) == NULL) {
 			if (zp->z_unlinked)
 				err = SET_ERROR(ENOENT);
 			else
 				err = SET_ERROR(EAGAIN);
 		} else {
 			*zpp = zp;
 			err = 0;
 		}
 
 		mutex_exit(&zp->z_lock);
 		sa_buf_rele(db, NULL);
 		zfs_znode_hold_exit(zfsvfs, zh);
 
 		/* EAGAIN is internal only: it is never returned to callers. */
 		if (err == EAGAIN) {
 			/* inode might need this to finish evict */
 			cond_resched();
 			goto again;
 		}
 		return (err);
 	}
 
 	/*
 	 * Not found create new znode/vnode but only if file exists.
 	 *
 	 * There is a small window where zfs_vget() could
 	 * find this object while a file create is still in
 	 * progress.  This is checked for in zfs_znode_alloc()
 	 *
 	 * if zfs_znode_alloc() fails it will drop the hold on the
 	 * bonus buffer.
 	 */
 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
 	    doi.doi_bonus_type, NULL);
 	if (zp == NULL) {
 		err = SET_ERROR(ENOENT);
 	} else {
 		/* err is still 0 here from the successful sa_buf_hold(). */
 		*zpp = zp;
 	}
 	zfs_znode_hold_exit(zfsvfs, zh);
 	return (err);
 }
 
 /*
  * Re-attach an existing in-core znode to its on-disk object and reload
  * the values cached in the znode and inode.  The znode must currently
  * have no SA handle (asserted).
  *
  *	RETURN:	0 on success, and immediately for control-directory
  *		znodes; the sa_buf_hold() error if the bonus buffer cannot
  *		be held; EINVAL if the bonus buffer is neither a DMU_OT_SA
  *		nor a sufficiently large DMU_OT_ZNODE; EIO if the attributes
  *		cannot be read or the on-disk generation does not match the
  *		inode's; the sa_lookup() error if the project ID lookup
  *		fails with anything other than ENOENT.
  *
  * On the failure paths taken after the SA handle has been set up, the
  * handle is destroyed again with zfs_znode_dmu_fini().
  */
 int
 zfs_rezget(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
 	uint64_t obj_num = zp->z_id;
 	uint64_t mode;
 	uint64_t links;
 	sa_bulk_attr_t bulk[11];
 	int err;
 	int count = 0;
 	uint64_t gen;
 	uint64_t z_uid, z_gid;
 	uint64_t atime[2], mtime[2], ctime[2], btime[2];
 	inode_timespec_t tmp_ts;
 	uint64_t projid = ZFS_DEFAULT_PROJID;
 	znode_hold_t *zh;
 
 	/*
 	 * skip ctldir, otherwise they will always get invalidated. This will
 	 * cause funny behaviour for the mounted snapdirs. Especially for
 	 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
 	 * anyone automount it again as long as someone is still using the
 	 * detached mount.
 	 */
 	if (zp->z_is_ctldir)
 		return (0);
 
 	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
 
 	/* Drop the cached ACL and xattrs so they are not reused. */
 	mutex_enter(&zp->z_acl_lock);
 	if (zp->z_acl_cached) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 	mutex_exit(&zp->z_acl_lock);
 
 	rw_enter(&zp->z_xattr_lock, RW_WRITER);
 	if (zp->z_xattr_cached) {
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
 	rw_exit(&zp->z_xattr_lock);
 
 	ASSERT(zp->z_sa_hdl == NULL);
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		zfs_znode_hold_exit(zfsvfs, zh);
 		return (err);
 	}
 
 	/* Reject objects whose bonus buffer cannot describe a znode. */
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		zfs_znode_hold_exit(zfsvfs, zh);
 		return (SET_ERROR(EINVAL));
 	}
 
 	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
 
 	/* reload cached values */
 	/* Exactly 11 attributes are queued; bulk[] is sized to match. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
 	    &gen, sizeof (gen));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, sizeof (zp->z_size));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 	    &links, sizeof (links));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 	    &z_uid, sizeof (z_uid));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 	    &z_gid, sizeof (z_gid));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 	    &mode, sizeof (mode));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 	    &atime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 	    &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 	    &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
 
 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
 		zfs_znode_dmu_fini(zp);
 		zfs_znode_hold_exit(zfsvfs, zh);
 		return (SET_ERROR(EIO));
 	}
 
 	/* A missing project ID (ENOENT) leaves the default in place. */
 	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
 		    &projid, 8);
 		if (err != 0 && err != ENOENT) {
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_hold_exit(zfsvfs, zh);
 			return (SET_ERROR(err));
 		}
 	}
 
 	zp->z_projid = projid;
 	zp->z_mode = ZTOI(zp)->i_mode = mode;
 	zfs_uid_write(ZTOI(zp), z_uid);
 	zfs_gid_write(ZTOI(zp), z_gid);
 
 	ZFS_TIME_DECODE(&tmp_ts, atime);
 	zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
 	ZFS_TIME_DECODE(&tmp_ts, mtime);
 	zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
 	ZFS_TIME_DECODE(&tmp_ts, ctime);
 	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
 	ZFS_TIME_DECODE(&zp->z_btime, btime);
 
 	/*
 	 * A generation mismatch fails with EIO.  Note that the assignments
 	 * above have already modified the znode and inode by this point.
 	 */
 	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
 		zfs_znode_dmu_fini(zp);
 		zfs_znode_hold_exit(zfsvfs, zh);
 		return (SET_ERROR(EIO));
 	}
 
 	set_nlink(ZTOI(zp), (uint32_t)links);
 	zfs_set_inode_flags(zp, ZTOI(zp));
 
 	zp->z_blksz = doi.doi_data_block_size;
 	zp->z_atime_dirty = B_FALSE;
 	zfs_znode_update_vfs(zp);
 
 	/*
 	 * If the file has zero links, then it has been unlinked on the send
 	 * side and it must be in the received unlinked set.
 	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
 	 * stale data and to prevent automatic removal of the file in
 	 * zfs_zinactive().  The file will be removed either when it is removed
 	 * on the send side and the next incremental stream is received or
 	 * when the unlinked set gets processed.
 	 */
 	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
 	if (zp->z_unlinked)
 		zfs_znode_dmu_fini(zp);
 
 	zfs_znode_hold_exit(zfsvfs, zh);
 
 	return (0);
 }
 
 /*
  * Free the on-disk object behind a znode within transaction 'tx'.
  *
  * If zfs_external_acl() reports a separate ACL object it is freed first
  * (this is only legal for non-SA znodes).  Then the object itself is
  * freed and the znode's SA handle is destroyed, all under the per-object
  * hold.  Any failure to free an object is fatal via VERIFY.
  */
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	objset_t *os = zfsvfs->z_os;
 	uint64_t obj = zp->z_id;
 	uint64_t acl_obj = zfs_external_acl(zp);
 	znode_hold_t *hold = zfs_znode_hold_enter(zfsvfs, obj);
 
 	if (acl_obj != 0) {
 		VERIFY(!zp->z_is_sa);
 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
 	}
 
 	VERIFY(0 == dmu_object_free(os, obj, tx));
 	zfs_znode_dmu_fini(zp);
 
 	zfs_znode_hold_exit(zfsvfs, hold);
 }
 
 /*
  * Release a znode that is going inactive.
  *
  * The per-object hold is taken so that zfs_zget() cannot find the znode
  * while it is being released.  An unlinked file is handed to zfs_rmnode()
  * unless the file system is read-only or unlink processing is suspended;
  * that can happen, for example, if the file system was originally
  * read-write, the file was opened, then unlinked, and the file system was
  * made read-only before the file was finally closed.  In that case, and
  * for every file that still has links, only the SA handle is destroyed
  * and an unlinked file stays in the unlinked set.
  */
 void
 zfs_zinactive(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	uint64_t obj = zp->z_id;
 	znode_hold_t *hold;
 	boolean_t do_remove = B_FALSE;
 
 	ASSERT(zp->z_sa_hdl);
 
 	hold = zfs_znode_hold_enter(zfsvfs, obj);
 
 	/* Decide under z_lock whether the file must be removed. */
 	mutex_enter(&zp->z_lock);
 	if (zp->z_unlinked) {
 		ASSERT(!zfsvfs->z_issnap);
 		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress)
 			do_remove = B_TRUE;
 	}
 	mutex_exit(&zp->z_lock);
 
 	if (do_remove == B_TRUE) {
 		/* The hold is dropped before zfs_rmnode() is called. */
 		zfs_znode_hold_exit(zfsvfs, hold);
 		zfs_rmnode(zp);
 		return;
 	}
 
 	zfs_znode_dmu_fini(zp);
 	zfs_znode_hold_exit(zfsvfs, hold);
 }
 
 /*
  * Determine whether the znode's atime must be updated.  The logic mostly
  * duplicates the Linux kernel's relatime_need_update() functionality.
  * This function is only called if the underlying filesystem actually has
  * atime updates enabled.
  */
 boolean_t
 zfs_relatime_need_update(const struct inode *ip)
 {
 	inode_timespec_t now, tmp_atime, tmp_ts;
 
 	gethrestime(&now);
 	tmp_atime = zpl_inode_get_atime(ip);
 	/*
 	 * In relatime mode, only update the atime if the previous atime
 	 * is earlier than either the ctime or mtime or if at least a day
 	 * has passed since the last update of atime.
 	 */
 	tmp_ts = zpl_inode_get_mtime(ip);
 	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
 		return (B_TRUE);
 
 	tmp_ts = zpl_inode_get_ctime(ip);
 	if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0)
 		return (B_TRUE);
 
 	if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Prepare to update znode time stamps.
  *
  *	IN:	zp	- znode requiring timestamp update
  *		flag	- ATTR_MTIME, ATTR_CTIME flags
  *
  *	OUT:	zp	- z_seq
  *		mtime	- new mtime
  *		ctime	- new ctime
  *
  *	Note: We don't update atime here, because we rely on Linux VFS to do
  *	atime updating.
  */
 void
 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
     uint64_t ctime[2])
 {
 	struct inode *ip = ZTOI(zp);
 	inode_timespec_t cur, ts;
 
 	gethrestime(&cur);
 
 	/* Every call bumps the sequence number, whatever flags are set. */
 	zp->z_seq++;
 
 	if ((flag & ATTR_MTIME) != 0) {
 		/*
 		 * Encode the current time for the caller, then store the
 		 * decoded form of that same value in the inode.
 		 */
 		ZFS_TIME_ENCODE(&cur, mtime);
 		ZFS_TIME_DECODE(&ts, mtime);
 		zpl_inode_set_mtime_to_ts(ip, ts);
 		if (ZTOZSB(zp)->z_use_fuids)
 			zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
 	}
 
 	if ((flag & ATTR_CTIME) != 0) {
 		/* Same treatment for ctime; only ZFS_ARCHIVE is set here. */
 		ZFS_TIME_ENCODE(&cur, ctime);
 		ZFS_TIME_DECODE(&ts, ctime);
 		zpl_inode_set_ctime_to_ts(ip, ts);
 		if (ZTOZSB(zp)->z_use_fuids)
 			zp->z_pflags |= ZFS_ARCHIVE;
 	}
 }
 
 /*
  * Grow the block size for a file.
  *
  *	IN:	zp	- znode of the file whose block size is to grow.
  *		size	- requested block size
  *		tx	- open transaction.
  *
  * The request is silently ignored when it would not enlarge the block
  * size, when the file already extends past its current block, or when
  * the DMU answers ENOTSUP.
  *
  * NOTE: this function assumes that the znode is write locked.
  */
 void
 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
 {
 	u_longlong_t	unused;
 	int		err;
 
 	/* Only ever grow. */
 	if (zp->z_blksz >= size)
 		return;
 
 	/*
 	 * A file whose size already exceeds its current block size is not
 	 * grown: once there is more than one block in a file, the
 	 * blocksize cannot change.
 	 */
 	if (zp->z_blksz != 0 && zp->z_size > zp->z_blksz)
 		return;
 
 	err = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
 	    size, 0, tx);
 	if (err != ENOTSUP) {
 		ASSERT0(err);
 
 		/* Read back the block size that was actually applied. */
 		dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl),
 		    &zp->z_blksz, &unused);
 	}
 }
 
 /*
  * Increase the file length
  *
  *	IN:	zp	- znode of file to extend.
  *		end	- new end-of-file
  *
  *	RETURN:	0 on success (also when the file is already at least
  *		"end" bytes long), error code on failure
  */
 static int
 zfs_extend(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	zfs_locked_range_t *lr;
 	uint64_t grow_to = 0;
 	dmu_tx_t *tx;
 	int err;
 
 	/* z_size is going to change, so write-lock the whole file. */
 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 
 	/* The file is already at (or beyond) the desired length. */
 	if (zp->z_size >= end) {
 		zfs_rangelock_exit(lr);
 		return (0);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	/*
 	 * The block size is grown only when the new end lies past the
 	 * current block size and that block size is either not a power of
 	 * two or still below the maximum.
 	 */
 	if (end > zp->z_blksz &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		if (zp->z_blksz > zfsvfs->z_max_blksz) {
 			/*
 			 * The block size already exceeds the "recordsize"
 			 * property, so growth stops at the next power of 2.
 			 */
 			ASSERT(!ISP2(zp->z_blksz));
 			grow_to = MIN(end, 1 << highbit64(zp->z_blksz));
 		} else {
 			grow_to = MIN(end, zfsvfs->z_max_blksz);
 		}
 		dmu_tx_hold_write(tx, zp->z_id, 0, grow_to);
 	}
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		zfs_rangelock_exit(lr);
 		return (err);
 	}
 
 	if (grow_to != 0)
 		zfs_grow_blocksize(zp, grow_to, tx);
 
 	/* Record the new size in memory and in the SA within this tx. */
 	zp->z_size = end;
 	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx));
 
 	/* The range lock is dropped before the transaction is committed. */
 	zfs_rangelock_exit(lr);
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * zfs_zero_partial_page - Modeled after update_pages() but
  * with different arguments and semantics for use by zfs_freesp().
  *
  * Zeroes a piece of a single page cache entry for zp at offset
  * start and length len.
  *
  * Caller must acquire a range lock on the file for the region
  * being zeroed in order that the ARC and page cache stay in sync.
  */
 static void
 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
 {
 	struct address_space *mapping = ZTOI(zp)->i_mapping;
 	struct page *pg;
 	int64_t in_page;
 	char *kaddr;
 
 	/* The byte range must begin and end within the same page. */
 	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
 
 	/* Split the offset into page start and offset within the page. */
 	in_page = start & (PAGE_SIZE - 1);
 	start &= PAGE_MASK;
 
 	/* Nothing to zero if the lookup does not return a page. */
 	pg = find_lock_page(mapping, start >> PAGE_SHIFT);
 	if (pg == NULL)
 		return;
 
 	if (mapping_writably_mapped(mapping))
 		flush_dcache_page(pg);
 
 	kaddr = kmap(pg);
 	memset(kaddr + in_page, 0, len);
 	kunmap(pg);
 
 	if (mapping_writably_mapped(mapping))
 		flush_dcache_page(pg);
 
 	mark_page_accessed(pg);
 	SetPageUptodate(pg);
 	ClearPageError(pg);
 	unlock_page(pg);
 	put_page(pg);
 }
 
 /*
  * Free space in a file.
  *
  * Frees [off, off + len) in the DMU and then brings any cached pages for
  * that range in line: whole pages inside the hole are dropped, partial
  * pages at either edge are zeroed.  A range reaching past end-of-file is
  * clipped to the file size; a range starting at or past end-of-file is a
  * no-op.
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of section to free.
  *		len	- length of section to free.
  *
  *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	zfs_locked_range_t *lr;
 	int error;
 
 	/*
 	 * Lock the range being freed.
 	 */
 	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (off >= zp->z_size) {
 		zfs_rangelock_exit(lr);
 		return (0);
 	}
 
 	/* Clip the range so that it ends at end-of-file. */
 	if (off + len > zp->z_size)
 		len = zp->z_size - off;
 
 	/*
 	 * The result is returned to the caller at the end; the page cache
 	 * handling below runs whether or not this call succeeded.
 	 */
 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
 
 	/*
 	 * Zero partial page cache entries.  This must be done under a
 	 * range lock in order to keep the ARC and page cache in sync.
 	 */
 	if (zn_has_cached_data(zp, off, off + len - 1)) {
 		loff_t first_page, last_page, page_len;
 		loff_t first_page_offset, last_page_offset;
 
 		/* first possible full page in hole */
 		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		/* last page of hole */
 		last_page = (off + len) >> PAGE_SHIFT;
 
 		/* offset of first_page */
 		first_page_offset = first_page << PAGE_SHIFT;
 		/* offset of last_page */
 		last_page_offset = last_page << PAGE_SHIFT;
 
 		/* truncate whole pages */
 		if (last_page_offset > first_page_offset) {
 			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
 			    first_page_offset, last_page_offset - 1);
 		}
 
 		/* truncate sub-page ranges */
 		if (first_page > last_page) {
 			/* entire punched area within a single page */
 			zfs_zero_partial_page(zp, off, len);
 		} else {
 			/* beginning of punched area at the end of a page */
 			page_len  = first_page_offset - off;
 			if (page_len > 0)
 				zfs_zero_partial_page(zp, off, page_len);
 
 			/* end of punched area at the beginning of a page */
 			page_len = off + len - last_page_offset;
 			if (page_len > 0)
 				zfs_zero_partial_page(zp, last_page_offset,
 				    page_len);
 		}
 	}
 	zfs_rangelock_exit(lr);
 
 	return (error);
 }
 
 /*
  * Truncate a file
  *
  *	IN:	zp	- znode of file to free data in.
  *		end	- new end-of-file.
  *
  *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_trunc(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	zfs_locked_range_t *lr;
 	sa_bulk_attr_t attrs[2];
 	int nattrs = 0;
 	dmu_tx_t *tx;
 	int error = 0;
 
 	/* z_size is going to change, so write-lock the whole file. */
 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 
 	/* A file that is not longer than "end" needs no truncation. */
 	if (end >= zp->z_size)
 		goto out;
 
 	/* Free everything from the new end-of-file to the object's end. */
 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
 	    DMU_OBJECT_END);
 	if (error != 0)
 		goto out;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	/* The size is always rewritten ... */
 	zp->z_size = end;
 	SA_ADD_BULK_ATTR(attrs, nattrs, SA_ZPL_SIZE(zfsvfs),
 	    NULL, &zp->z_size, sizeof (zp->z_size));
 
 	/* ... and an emptied file additionally loses its sparse flag. */
 	if (end == 0) {
 		zp->z_pflags &= ~ZFS_SPARSE;
 		SA_ADD_BULK_ATTR(attrs, nattrs, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &zp->z_pflags, 8);
 	}
 	VERIFY(sa_bulk_update(zp->z_sa_hdl, attrs, nattrs, tx) == 0);
 
 	dmu_tx_commit(tx);
 
 out:
 	/* error is 0 here on the no-op path and on the success path. */
 	zfs_rangelock_exit(lr);
 	return (error);
 }
 
 /*
  * Free space in a file
  *
  * Dispatches on the request: a start beyond end-of-file extends the file
  * to off + len; len == 0 truncates the file to off; otherwise the range
  * [off, off + len) is freed and, if it reaches past end-of-file, the file
  * is then extended to off + len.  When "log" is set and the operation
  * succeeded, the timestamps and flags are updated and the operation is
  * recorded in the intent log.
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of range
  *		len	- length of range (0 => truncate the file at off)
  *		flag	- current file open mode flags (not used in this
  *			  function).
  *		log	- TRUE if this action should be logged
  *
  *	RETURN:	0 on success, error code on failure
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 {
 	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	zilog_t *zilog = zfsvfs->z_log;
 	uint64_t mode;
 	uint64_t mtime[2], ctime[2];
 	sa_bulk_attr_t bulk[3];
 	int count = 0;
 	int error;
 
 	/*
 	 * The mode value itself is not used below; only a failure of the
 	 * lookup matters, in which case we bail out early.
 	 */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
 	    sizeof (mode))) != 0)
 		return (error);
 
 	/* Starting past end-of-file: there is nothing to free, only extend. */
 	if (off > zp->z_size) {
 		error =  zfs_extend(zp, off+len);
 		if (error == 0 && log)
 			goto log;
 		goto out;
 	}
 
 	if (len == 0) {
 		error = zfs_trunc(zp, off);
 	} else {
 		/* Punch the hole, then extend if it ran past end-of-file. */
 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
 		    off + len > zp->z_size)
 			error = zfs_extend(zp, off+len);
 	}
 	if (error || !log)
 		goto out;
 log:
 	/* Separate transaction for the timestamp/flag update and ZIL record. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	/*
 	 * The bulk entries point at mtime/ctime/z_pflags, which are filled
 	 * in by zfs_tstamp_update_setup() before the bulk update runs.
 	 */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 	    NULL, &zp->z_pflags, 8);
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 	ASSERT(error == 0);
 
 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 
 	dmu_tx_commit(tx);
 
 	zfs_znode_update_vfs(zp);
 	/* Any sa_bulk_update() result is discarded; report success. */
 	error = 0;
 
 out:
 	/*
 	 * Truncate the page cache - for file truncate operations, use
 	 * the purpose-built API for truncations.  For punching operations,
 	 * the truncation is handled under a range lock in zfs_free_range.
 	 *
 	 * Note that this runs on every path reaching "out" with len == 0,
 	 * including the error paths.
 	 */
 	if (len == 0)
 		truncate_setsize(ZTOI(zp), off);
 	return (error);
 }
 
 /*
  * Populate a new objset with the initial ZPL objects: the master node ZAP
  * (holding the version and the properties from zplprops), the SA
  * attribute registration object when the version supports SA, the
  * unlinked set, and the root directory znode.
  *
  *	IN:	os	 - objset to populate
  *		cr	 - credentials; supply the root directory's uid/gid
  *		zplprops - nvlist of uint64 ZPL properties
  *		tx	 - open transaction all objects are created in
  *
  * A throw-away zfsvfs_t, super_block and znode are built so that
  * zfs_mknode() can be used for the root directory; all three are freed
  * again before returning.  Failures are checked only through
  * ASSERT/VERIFY; no error is returned to the caller.
  */
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
 	struct super_block *sb;
 	zfsvfs_t	*zfsvfs;
 	uint64_t	moid, obj, sa_obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
 	int		size;
 	int		error;
 	int		i;
 	znode_t		*rootzp = NULL;
 	vattr_t		vattr;
 	znode_t		*zp;
 	zfs_acl_ids_t	acl_ids;
 
 	/*
 	 * First attempt to create master node.
 	 */
 	/*
 	 * In an empty objset, there are no blocks to read and thus
 	 * there can be no i/o errors (which we assert below).
 	 */
 	moid = MASTER_NODE_OBJ;
 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Set starting attributes.
 	 */
 	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
 		/* For the moment we expect all zpl props to be uint64_ts */
 		uint64_t val;
 		const char *name;
 
 		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
 		VERIFY(nvpair_value_uint64(elem, &val) == 0);
 		name = nvpair_name(elem);
 		/*
 		 * A version property can only lower the version; it is not
 		 * written here but once, after the loop.  Every other
 		 * property is stored in the master node as-is.
 		 */
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
 			if (val < version)
 				version = val;
 		} else {
 			error = zap_update(os, moid, name, 8, 1, &val, tx);
 		}
 		/*
 		 * When the version branch was taken, error still holds the
 		 * result of the previous operation.
 		 */
 		ASSERT(error == 0);
 		/* Remember the values needed for z_norm further down. */
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
 			norm = val;
 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
 			sense = val;
 	}
 	ASSERT(version != 0);
 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create zap object used for SA attribute registration
 	 */
 
 	if (version >= ZPL_VERSION_SA) {
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT(error == 0);
 	} else {
 		sa_obj = 0;
 	}
 	/*
 	 * Create a delete queue.
 	 */
 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
 	ASSERT(error == 0);
 
 	/*
 	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
 	 * to allow zfs_mknode to work.
 	 */
 	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
 	vattr.va_mode = S_IFDIR|0755;
 	vattr.va_uid = crgetuid(cr);
 	vattr.va_gid = crgetgid(cr);
 
 	/* Only the fields set explicitly below are initialized here. */
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 	rootzp->z_unlinked = B_FALSE;
 	rootzp->z_atime_dirty = B_FALSE;
 	rootzp->z_is_sa = USE_SA(version, os);
 	rootzp->z_pflags = 0;
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 	zfsvfs->z_os = os;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_version = version;
 	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
 	zfsvfs->z_use_sa = USE_SA(version, os);
 	zfsvfs->z_norm = norm;
 
 	/* Wire the temporary super block between the inode and zfsvfs. */
 	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
 	sb->s_fs_info = zfsvfs;
 
 	ZTOI(rootzp)->i_sb = sb;
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 
 	ASSERT(error == 0);
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	/*
 	 * Size the znode hold arrays from zfs_object_mutex_size, capped at
 	 * ZFS_OBJ_MTX_MAX, and set up one AVL tree and one mutex per slot.
 	 */
 	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
 	zfsvfs->z_hold_size = size;
 	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
 	    KM_SLEEP);
 	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
 	for (i = 0; i != size; i++) {
 		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
 		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
 		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
 	}
 
 	/*
 	 * Create the root directory through the regular zfs_mknode() path
 	 * and record its object number in the master node.
 	 */
 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids, zfs_init_idmap));
 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, rootzp);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
 	ASSERT(error == 0);
 	zfs_acl_ids_free(&acl_ids);
 
 	/* Tear down the temporary znode ... */
 	atomic_set(&ZTOI(rootzp)->i_count, 0);
 	sa_handle_destroy(rootzp->z_sa_hdl);
 	kmem_cache_free(znode_cache, rootzp);
 
 	/* ... then the hold arrays, the lock, the super block and zfsvfs. */
 	for (i = 0; i != size; i++) {
 		avl_destroy(&zfsvfs->z_hold_trees[i]);
 		mutex_destroy(&zfsvfs->z_hold_locks[i]);
 	}
 
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 
 	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
 	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
 	kmem_free(sb, sizeof (struct super_block));
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 /* Symbols exported for use outside this file's module. */
 EXPORT_SYMBOL(zfs_create_fs);
 EXPORT_SYMBOL(zfs_obj_to_path);
 
-/* CSTYLED */
 /*
  * Module parameters (both registered with mode 0644):
  * zfs_object_mutex_size sizes the znode hold arrays; a non-zero
  * zfs_unlink_suspend_progress keeps zfs_zinactive() from removing
  * unlinked files.
  */
 module_param(zfs_object_mutex_size, uint, 0644);
 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
 module_param(zfs_unlink_suspend_progress, int, 0644);
 MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
 "(debug - leaks space into the unlinked set)");
diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c
index 21f3740f6fe6..22eeef7f0743 100644
--- a/module/os/linux/zfs/zio_crypt.c
+++ b/module/os/linux/zfs/zio_crypt.c
@@ -1,2080 +1,2079 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2017, Datto, Inc. All rights reserved.
  */
 
 #include <sys/zio_crypt.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sha2.h>
 #include <sys/hkdf.h>
 #include <sys/qat.h>
 
 /*
  * This file is responsible for handling all of the details of generating
  * encryption parameters and performing encryption and authentication.
  *
  * BLOCK ENCRYPTION PARAMETERS:
  * Encryption/Authentication Algorithm Suite (crypt):
  * The encryption algorithm, mode, and key length we are going to use. We
  * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
  * keys. All authentication is currently done with SHA512-HMAC.
  *
  * Plaintext:
  * The unencrypted data that we want to encrypt.
  *
  * Initialization Vector (IV):
  * An initialization vector for the encryption algorithms. This is used to
  * "tweak" the encryption algorithms so that two blocks of the same data are
  * encrypted into different ciphertext outputs, thus obfuscating block patterns.
  * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
  * never reused with the same encryption key. This value is stored unencrypted
  * and must simply be provided to the decryption function. We use a 96 bit IV
  * (as recommended by NIST) for all block encryption. For non-dedup blocks we
  * derive the IV randomly. The first 64 bits of the IV are stored in the second
  * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
  * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
  * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
  * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
  * level 0 blocks is the number of allocated dnodes in that block. The on-disk
  * format supports at most 2^15 slots per L0 dnode block, because the maximum
  * block size is 16MB (2^24). In either case, for level 0 blocks this number
  * will still be smaller than UINT32_MAX so it is safe to store the IV in the
  * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
  * for the dnode code.
  *
  * Master key:
  * This is the most important secret data of an encrypted dataset. It is used
  * along with the salt to generate the actual encryption keys via HKDF. We
  * do not use the master key to directly encrypt any data because there are
  * theoretical limits on how much data can actually be safely encrypted with
  * any encryption mode. The master key is stored encrypted on disk with the
  * user's wrapping key. Its length is determined by the encryption algorithm.
  * For details on how this is stored see the block comment in dsl_crypt.c
  *
  * Salt:
  * Used as an input to the HKDF function, along with the master key. We use a
  * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
  * can be used for encrypting many blocks, so we cache the current salt and the
  * associated derived key in zio_crypt_t so we do not need to derive it again
  * needlessly.
  *
  * Encryption Key:
  * A secret binary key, generated from an HKDF function used to encrypt and
  * decrypt data.
  *
  * Message Authentication Code (MAC)
  * The MAC is an output of authenticated encryption modes such as AES-GCM and
  * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
  * data on disk and return garbage to the application. Effectively, it is a
  * checksum that can not be reproduced by an attacker. We store the MAC in the
  * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
  * regular checksum of the ciphertext which can be used for scrubbing.
  *
  * OBJECT AUTHENTICATION:
  * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
  * they contain some info that always needs to be readable. To prevent this
  * data from being altered, we authenticate this data using SHA512-HMAC. This
  * will produce a MAC (similar to the one produced via encryption) which can
  * be used to verify the object was not modified. HMACs do not require key
  * rotation or IVs, so we can keep up to the full 3 copies of authenticated
  * data.
  *
  * ZIL ENCRYPTION:
  * ZIL blocks have their bp written to disk ahead of the associated data, so we
  * cannot store the MAC there as we normally do. For these blocks the MAC is
  * stored in the embedded checksum within the zil_chain_t header. The salt and
  * IV are generated for the block on bp allocation instead of at encryption
  * time. In addition, ZIL blocks have some pieces that must be left in plaintext
  * for claiming even though all of the sensitive user data still needs to be
  * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
  * pieces of the block need to be encrypted. All data that is not encrypted is
  * authenticated using the AAD mechanisms that the supported encryption modes
  * provide for. In order to preserve the semantics of the ZIL for encrypted
  * datasets, the ZIL is not protected at the objset level as described below.
  *
  * DNODE ENCRYPTION:
  * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
  * in plaintext for scrubbing and claiming, but the bonus buffers might contain
  * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
  * which pieces of the block need to be encrypted. For more details about
  * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
  *
  * OBJECT SET AUTHENTICATION:
  * Up to this point, everything we have encrypted and authenticated has been
  * at level 0 (or -2 for the ZIL). If we did not do any further work the
  * on-disk format would be susceptible to attacks that deleted or rearranged
  * the order of level 0 blocks. Ideally, the cleanest solution would be to
  * maintain a tree of authentication MACs going up the bp tree. However, this
  * presents a problem for raw sends. Send files do not send information about
  * indirect blocks so there would be no convenient way to transfer the MACs and
  * they cannot be recalculated on the receive side without the master key which
  * would defeat one of the purposes of raw sends in the first place. Instead,
  * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
  * from the level below. We also include some portable fields from blk_prop such
  * as the lsize and compression algorithm to prevent the data from being
  * misinterpreted.
  *
  * At the objset level, we maintain 2 separate 256 bit MACs in the
  * objset_phys_t. The first one is "portable" and is the logical root of the
  * MAC tree maintained in the metadnode's bps. The second is "local" and is
  * used as the root MAC for the user accounting objects, which are also not
  * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
  * of the send file. The useraccounting code ensures that the useraccounting
  * info is not present upon a receive, so the local MAC can simply be cleared
  * out at that time. For more info about objset_phys_t authentication, see
  * zio_crypt_do_objset_hmacs().
  *
  * CONSIDERATIONS FOR DEDUP:
  * In order for dedup to work, blocks that we want to dedup with one another
  * need to use the same IV and encryption key, so that they will have the same
  * ciphertext. Normally, one should never reuse an IV with the same encryption
  * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
  * blocks. In this case, however, since we are using the same plaintext as
  * well all that we end up with is a duplicate of the original ciphertext we
  * already had. As a result, an attacker with read access to the raw disk will
  * be able to tell which blocks are the same but this information is given away
  * by dedup anyway. In order to get the same IVs and encryption keys for
  * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
  * here so that a reproducible checksum of the plaintext is never available to
  * the attacker. The HMAC key is kept alongside the master key, encrypted on
  * disk. The first 64 bits of the HMAC are used in place of the random salt, and
  * the next 96 bits are used as the IV. As a result of this mechanism, dedup
  * will only work within a clone family since encrypted dedup requires use of
  * the same master and HMAC keys.
  */
 
 /*
  * After encrypting many blocks with the same key we may start to run up
  * against the theoretical limits of how much data can securely be encrypted
  * with a single key using the supported encryption modes. The most obvious
  * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
  * the more IVs we generate (which both GCM and CCM modes strictly forbid).
  * This risk actually grows surprisingly quickly over time according to the
  * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
  * generated n IVs with a cryptographically secure RNG, the approximate
  * probability p(n) of a collision is given as:
  *
  * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
  *
  * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
  *
  * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
  * we must not write more than 398,065,730 blocks with the same encryption key.
  * Therefore, we rotate our keys after 400,000,000 blocks have been written by
  * generating a new random 64 bit salt for our HKDF encryption key generation
  * function.
  */
 /* Built-in number of uses of a single salt before it is rotated. */
 #define	ZFS_KEY_MAX_SALT_USES_DEFAULT	400000000
 /*
  * Effective limit: zfs_key_max_salt_uses may lower it, but the MIN()
  * keeps it from ever exceeding the built-in default.
  */
 #define	ZFS_CURRENT_MAX_SALT_USES	\
 	(MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
 /*
  * Requested limit, initially the default.
  * NOTE(review): presumably exposed as a module parameter elsewhere in this
  * file - confirm.
  */
 static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
 
 /*
  * Fields taken from one block pointer: the masked blk_prop, the MAC and a
  * reserved pad word.
  * NOTE(review): presumably the unit fed into the indirect-level
  * authentication described in the block comment above - confirm against
  * its users.
  */
 typedef struct blkptr_auth_buf {
 	uint64_t bab_prop;			/* blk_prop - portable mask */
 	uint8_t bab_mac[ZIO_DATA_MAC_LEN];	/* MAC from blk_cksum */
 	uint64_t bab_pad;			/* reserved for future use */
 } blkptr_auth_buf_t;
 
 /*
  * Parameters of each encryption suite, indexed by the "crypt" value.
  * Columns: crypto mechanism name, cipher mode type, key length in bytes,
  * and a name string.  The first three rows ("inherit", "on", "off") carry
  * no mechanism and a key length of 0.
  */
 const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
 	{"",			ZC_TYPE_NONE,	0,	"inherit"},
 	{"",			ZC_TYPE_NONE,	0,	"on"},
 	{"",			ZC_TYPE_NONE,	0,	"off"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	16,	"aes-128-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	24,	"aes-192-ccm"},
 	{SUN_CKM_AES_CCM,	ZC_TYPE_CCM,	32,	"aes-256-ccm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	16,	"aes-128-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	24,	"aes-192-gcm"},
 	{SUN_CKM_AES_GCM,	ZC_TYPE_GCM,	32,	"aes-256-gcm"}
 };
 
 /*
  * Release everything zio_crypt_key_init() set up inside *key and wipe the
  * structure.  The structure itself is not freed here.
  */
 void
 zio_crypt_key_destroy(zio_crypt_key_t *key)
 {
 	rw_destroy(&key->zk_salt_lock);
 
 	/* Drop both cached context templates. */
 	crypto_destroy_ctx_template(key->zk_current_tmpl);
 	crypto_destroy_ctx_template(key->zk_hmac_tmpl);
 
 	/* Wipe the key material along with the rest of the structure. */
 	memset(key, 0, sizeof (*key));
 }
 
 /*
  * Initialize *key as a brand new key for encryption suite "crypt".
  *
  * The guid, master key, HMAC key and salt are filled with random bytes,
  * the current encryption key is derived from the master key and salt via
  * HKDF, and context templates are created for both keys (best effort).
  *
  *	IN:	crypt	- index into zio_crypt_table
  *	OUT:	key	- fully initialized key
  *
  *	RETURN:	0 on success, error code on failure.  On failure the key
  *		has been passed through zio_crypt_key_destroy().
  */
 int
 zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
 {
 	int ret;
 	crypto_mechanism_t mech = {0};
 	uint_t keydata_len;
 
 	ASSERT(key != NULL);
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 /*
  * Workaround for GCC 12+ with UBSan enabled deficiencies.
  *
  * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code
  * below as violating -Warray-bounds
  */
 #if defined(__GNUC__) && !defined(__clang__) && \
 	((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \
 	    defined(CONFIG_UBSAN))
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 #if defined(__GNUC__) && !defined(__clang__) && \
 	((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \
 	    defined(CONFIG_UBSAN))
 #pragma GCC diagnostic pop
 #endif
 	memset(key, 0, sizeof (zio_crypt_key_t));
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	/* fill keydata buffers and salt with random data */
 	ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_master_keydata, keydata_len);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
 	if (ret != 0)
 		goto error;
 
 	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	/* derive the current key from the master key */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 	    keydata_len);
 	if (ret != 0)
 		goto error;
 
 	/* initialize keys for the ICP */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	/*
 	 * The HMAC key must reference the random key material generated
 	 * above, not the crypto_key_t descriptor itself.
 	 */
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	/*
 	 * Initialize the crypto templates. It's ok if this fails because
 	 * this is just an optimization.
 	 */
 	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
 	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
 	    &key->zk_current_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_current_tmpl = NULL;
 
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
 	    &key->zk_hmac_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_hmac_tmpl = NULL;
 
 	key->zk_crypt = crypt;
 	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 error:
 	/* Tears down the lock and wipes any key material generated so far. */
 	zio_crypt_key_destroy(key);
 	return (ret);
 }
 
 /*
  * Replace the key's salt with a fresh random one, re-derive the current
  * encryption key from the master key and the new salt, and rebuild the
  * cached context template for it.
  *
  * The work is skipped (returning 0) if, once the salt lock is held as
  * writer, the use count is already below the limit, i.e. another thread
  * has rotated the salt in the meantime.
  *
  *	RETURN:	0 on success or when no rotation was needed, error code if
  *		generating the salt or deriving the key failed.  A failure
  *		to create the context template is not an error.
  */
 static int
 zio_crypt_key_change_salt(zio_crypt_key_t *key)
 {
 	int ret = 0;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	crypto_mechanism_t mech = {0};
 	uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
 
 	/* generate a new salt */
 	ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	rw_enter(&key->zk_salt_lock, RW_WRITER);
 
 	/* someone beat us to the salt rotation, just unlock and return */
 	if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
 		goto out_unlock;
 
 	/* derive the current key from the master key and the new salt */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
 	if (ret != 0)
 		goto out_unlock;
 
 	/* assign the salt and reset the usage count */
 	memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
 	key->zk_salt_count = 0;
 
 	/*
 	 * destroy the old context template and create the new one, using
 	 * the same mechanism zio_crypt_key_init() selected for this suite
 	 */
 	mech.cm_type =
 	    crypto_mech2id(zio_crypt_table[key->zk_crypt].ci_mechname);
 	crypto_destroy_ctx_template(key->zk_current_tmpl);
 	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
 	    &key->zk_current_tmpl);
 	if (ret != CRYPTO_SUCCESS)
 		key->zk_current_tmpl = NULL;
 
 	rw_exit(&key->zk_salt_lock);
 
 	return (0);
 
 out_unlock:
 	rw_exit(&key->zk_salt_lock);
 error:
 	return (ret);
 }
 
 /*
  * Copy the key's current salt into "salt" and count the use.  Once the
  * use count reaches the limit, a salt rotation is triggered; see the
  * comment above the zfs_key_max_salt_uses definition for details.
  *
  *	RETURN:	0 on success, or the error from the salt rotation
  */
 int
 zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
 {
 	boolean_t rotate;
 
 	/* Snapshot the salt and bump the counter under the reader lock. */
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
 	rotate = (atomic_inc_64_nv(&key->zk_salt_count) >=
 	    ZFS_CURRENT_MAX_SALT_USES);
 	rw_exit(&key->zk_salt_lock);
 
 	if (!rotate)
 		return (0);
 
 	/* The rotation takes the lock as writer itself. */
 	return (zio_crypt_key_change_salt(key));
 }
 
 /*
  * Common AES-CCM / AES-GCM worker behind all encryption and decryption
  * in zfs.
  *
  * puio references the plaintext and cuio the ciphertext. The last iovec
  * of cuio is always the MAC; its length is used as the MAC / tag size.
  * datalen is the length of the plaintext / ciphertext alone (no MAC).
  * authbuf and auth_len describe the additional authenticated data.
  *
  * Returns 0 on success, EIO if encryption fails and ECKSUM if decryption
  * fails.
  */
 static int
 zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
     crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
     zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
 {
 	const zio_crypt_info_t *info;
 	crypto_mechanism_t mech;
 	CK_AES_CCM_PARAMS ccm_params;
 	CK_AES_GCM_PARAMS gcm_params;
 	crypto_data_t plain, cipher;
 	uint_t mac_len, plain_len;
 	int rc;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 	info = &zio_crypt_table[crypt];
 
 	/* the MAC is, by convention, the final iovec of the cipher uio */
 	mac_len = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
 	ASSERT(mac_len <= ZIO_DATA_MAC_LEN);
 
 	/* the mechanism is selected by the same suite as the key */
 	mech.cm_type = crypto_mech2id(info->ci_mechname);
 
 	/*
 	 * When decrypting, the ICP wants the plaintext length to include
 	 * the MAC length, even though the plaintext uio has no room
 	 * reserved for it.
 	 */
 	plain_len = encrypt ? datalen : datalen + mac_len;
 
 	/* only AES-CCM and AES-GCM are supported; anything else is GCM */
 	if (info->ci_crypt_type == ZC_TYPE_CCM) {
 		ccm_params.ulNonceSize = ZIO_DATA_IV_LEN;
 		ccm_params.ulAuthDataSize = auth_len;
 		ccm_params.authData = authbuf;
 		ccm_params.ulMACSize = mac_len;
 		ccm_params.nonce = ivbuf;
 		ccm_params.ulDataSize = plain_len;
 
 		mech.cm_param = (char *)(&ccm_params);
 		mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
 	} else {
 		gcm_params.ulIvLen = ZIO_DATA_IV_LEN;
 		gcm_params.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
 		gcm_params.ulAADLen = auth_len;
 		gcm_params.pAAD = authbuf;
 		gcm_params.ulTagBits = CRYPTO_BYTES2BITS(mac_len);
 		gcm_params.pIv = ivbuf;
 
 		mech.cm_param = (char *)(&gcm_params);
 		mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
 	}
 
 	/* describe both sides of the operation to the ICP */
 	plain.cd_format = CRYPTO_DATA_UIO;
 	plain.cd_offset = 0;
 	plain.cd_uio = puio;
 	plain.cd_length = plain_len;
 
 	cipher.cd_format = CRYPTO_DATA_UIO;
 	cipher.cd_offset = 0;
 	cipher.cd_uio = cuio;
 	cipher.cd_length = datalen + mac_len;
 
 	if (encrypt) {
 		rc = crypto_encrypt(&mech, &plain, key, tmpl, &cipher);
 		if (rc != CRYPTO_SUCCESS)
 			return (SET_ERROR(EIO));
 		return (0);
 	}
 
 	rc = crypto_decrypt(&mech, &cipher, key, tmpl, &plain);
 	if (rc != CRYPTO_SUCCESS) {
 		/* a bad MAC is the only decrypt failure expected here */
 		ASSERT3U(rc, ==, CRYPTO_INVALID_MAC);
 		return (SET_ERROR(ECKSUM));
 	}
 
 	return (0);
 }
 
 /*
  * Wrap (encrypt) the master key and HMAC key of `key` with the wrapping
  * key `cwkey`.
  *
  * A new WRAPPING_IV_LEN-byte IV is generated into `iv`. The encrypted
  * master key is written to keydata_out, the encrypted HMAC key to
  * hmac_keydata_out and the WRAPPING_MAC_LEN-byte MAC to `mac`. The guid
  * (and, for non-zero key versions, the crypt suite and version) is
  * authenticated as additional data.
  *
  * Returns 0 on success or the error from random_get_pseudo_bytes() /
  * zio_do_crypt_uio().
  */
 int
 zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
     uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
 {
 	uint64_t crypt = key->zk_crypt;
 	uint64_t aad[3];
 	iovec_t plain_iovecs[2], cipher_iovecs[3];
 	zfs_uio_t puio, cuio;
 	uint_t keydata_len, enc_len, aad_len;
 	int ret;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
 
 	/* the master and hmac keys are wrapped under a fresh iv */
 	ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
 	if (ret != 0)
 		return (ret);
 
 	/* plaintext: master key followed by hmac key */
 	plain_iovecs[0].iov_base = key->zk_master_keydata;
 	plain_iovecs[0].iov_len = keydata_len;
 	plain_iovecs[1].iov_base = key->zk_hmac_keydata;
 	plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 
 	puio.uio_iov = plain_iovecs;
 	puio.uio_iovcnt = 2;
 	puio.uio_segflg = UIO_SYSSPACE;
 
 	/* ciphertext: wrapped master key, wrapped hmac key, then the mac */
 	cipher_iovecs[0].iov_base = keydata_out;
 	cipher_iovecs[0].iov_len = keydata_len;
 	cipher_iovecs[1].iov_base = hmac_keydata_out;
 	cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 	cipher_iovecs[2].iov_base = mac;
 	cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
 
 	cuio.uio_iov = cipher_iovecs;
 	cuio.uio_iovcnt = 3;
 	cuio.uio_segflg = UIO_SYSSPACE;
 
 	/*
 	 * Writing to the old format is not supported, but rewrapping a
 	 * key is, so that users can move and quarantine datasets that
 	 * are still on the old format. The old format authenticates
 	 * only the guid.
 	 */
 	aad[0] = LE_64(key->zk_guid);
 	if (key->zk_version == 0) {
 		aad_len = sizeof (uint64_t);
 	} else {
 		ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(key->zk_version);
 		aad_len = sizeof (uint64_t) * 3;
 	}
 
 	/* encrypt the keys, producing the ciphertext and the mac */
 	return (zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
 	    &puio, &cuio, (uint8_t *)aad, aad_len));
 }
 
 /*
  * Unwrap (decrypt and verify) a wrapped master key / HMAC key pair into
  * `key`, then set `key` up for use.
  *
  * keydata, hmac_keydata and mac are the wrapped inputs; iv is the IV they
  * were wrapped with. guid (and, for non-zero versions, crypt and version)
  * is the additional authenticated data. After a successful unwrap a fresh
  * salt is generated, the current key is derived from the master key with
  * hkdf_sha512(), and the crypto context templates are created (template
  * creation failure is tolerated).
  *
  * Returns 0 on success. On failure the error is returned and
  * zio_crypt_key_destroy() is called on `key`.
  */
 int
 zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
     uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
     uint8_t *mac, zio_crypt_key_t *key)
 {
 	crypto_mechanism_t mech;
 	zfs_uio_t puio, cuio;
 	iovec_t plain_iovecs[2], cipher_iovecs[3];
 	uint64_t aad[3];
 	uint_t keydata_len, enc_len, aad_len;
 	int ret;
 
 	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
 
 	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
 
 	keydata_len = zio_crypt_table[crypt].ci_keylen;
 	enc_len = keydata_len + SHA512_HMAC_KEYLEN;
 
 	/* plaintext lands directly in the key structure */
 	plain_iovecs[0].iov_base = key->zk_master_keydata;
 	plain_iovecs[0].iov_len = keydata_len;
 	plain_iovecs[1].iov_base = key->zk_hmac_keydata;
 	plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 
 	puio.uio_iov = plain_iovecs;
 	puio.uio_segflg = UIO_SYSSPACE;
 	puio.uio_iovcnt = 2;
 
 	/* ciphertext: wrapped master key, wrapped hmac key, then the mac */
 	cipher_iovecs[0].iov_base = keydata;
 	cipher_iovecs[0].iov_len = keydata_len;
 	cipher_iovecs[1].iov_base = hmac_keydata;
 	cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
 	cipher_iovecs[2].iov_base = mac;
 	cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
 
 	cuio.uio_iov = cipher_iovecs;
 	cuio.uio_iovcnt = 3;
 	cuio.uio_segflg = UIO_SYSSPACE;
 
 	/* version 0 authenticates the guid only */
 	aad[0] = LE_64(guid);
 	if (version == 0) {
 		aad_len = sizeof (uint64_t);
 	} else {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 		aad[1] = LE_64(crypt);
 		aad[2] = LE_64(version);
 		aad_len = sizeof (uint64_t) * 3;
 	}
 
 	/* decrypt the keys straight into the key structure */
 	ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
 	    &puio, &cuio, (uint8_t *)aad, aad_len);
 	if (ret != 0)
 		goto error;
 
 	/* every unwrap starts out with a fresh salt */
 	ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
 	if (ret != 0)
 		goto error;
 
 	/* the current key is derived from the master key and that salt */
 	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
 	    keydata_len);
 	if (ret != 0)
 		goto error;
 
 	/* point the ICP key descriptors at the raw key material */
 	key->zk_current_key.ck_data = key->zk_current_keydata;
 	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
 	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
 
 	/*
 	 * The context templates are only an optimization, so failing to
 	 * create one is not an error; the template is simply left NULL.
 	 */
 	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
 	if (crypto_create_ctx_template(&mech, &key->zk_current_key,
 	    &key->zk_current_tmpl) != CRYPTO_SUCCESS)
 		key->zk_current_tmpl = NULL;
 
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	if (crypto_create_ctx_template(&mech, &key->zk_hmac_key,
 	    &key->zk_hmac_tmpl) != CRYPTO_SUCCESS)
 		key->zk_hmac_tmpl = NULL;
 
 	key->zk_crypt = crypt;
 	key->zk_version = version;
 	key->zk_guid = guid;
 	key->zk_salt_count = 0;
 
 	return (0);
 
 error:
 	zio_crypt_key_destroy(key);
 	return (ret);
 }
 
 /*
  * Fill ivbuf with ZIO_DATA_IV_LEN random bytes.
  *
  * Returns 0 on success. If the random source fails, ivbuf is zeroed and
  * the error from random_get_pseudo_bytes() is returned.
  */
 int
 zio_crypt_generate_iv(uint8_t *ivbuf)
 {
 	int ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
 
 	/* never hand back a partially filled IV */
 	if (ret != 0)
 		memset(ivbuf, 0, ZIO_DATA_IV_LEN);
 
 	return (ret);
 }
 
 /*
  * Compute the SHA512-HMAC of data[0..datalen) with the key's HMAC key and
  * store the first digestlen bytes of it in digestbuf. digestlen must not
  * exceed SHA512_DIGEST_LENGTH.
  *
  * Returns 0 on success. On failure digestbuf is zeroed and EIO is
  * returned.
  */
 int
 zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
     uint8_t *digestbuf, uint_t digestlen)
 {
 	uint8_t full_digest[SHA512_DIGEST_LENGTH];
 	crypto_mechanism_t mech;
 	crypto_data_t input, output;
 	int ret;
 
 	ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
 
 	/* sha512-hmac takes no mechanism parameters */
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	mech.cm_param = NULL;
 	mech.cm_param_len = 0;
 
 	/* the message to authenticate */
 	input.cd_format = CRYPTO_DATA_RAW;
 	input.cd_offset = 0;
 	input.cd_length = datalen;
 	input.cd_raw.iov_base = (char *)data;
 	input.cd_raw.iov_len = input.cd_length;
 
 	/* the full-size digest goes to a local buffer first */
 	output.cd_format = CRYPTO_DATA_RAW;
 	output.cd_offset = 0;
 	output.cd_length = SHA512_DIGEST_LENGTH;
 	output.cd_raw.iov_base = (char *)full_digest;
 	output.cd_raw.iov_len = output.cd_length;
 
 	ret = crypto_mac(&mech, &input, &key->zk_hmac_key, key->zk_hmac_tmpl,
 	    &output);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		memset(digestbuf, 0, digestlen);
 		return (ret);
 	}
 
 	/* the caller only gets the leading digestlen bytes */
 	memcpy(digestbuf, full_digest, digestlen);
 
 	return (0);
 }
 
 /*
  * Derive a salt and an IV from the data itself: the SHA512-HMAC of the
  * data is computed, its first ZIO_DATA_SALT_LEN bytes become the salt and
  * the following ZIO_DATA_IV_LEN bytes become the IV.
  *
  * Returns 0 on success or the error from zio_crypt_do_hmac(), in which
  * case salt and ivbuf are left untouched.
  */
 int
 zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
     uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
 {
 	uint8_t hmac[SHA512_DIGEST_LENGTH];
 	int ret;
 
 	ret = zio_crypt_do_hmac(key, data, datalen, hmac,
 	    SHA512_DIGEST_LENGTH);
 	if (ret == 0) {
 		memcpy(salt, hmac, ZIO_DATA_SALT_LEN);
 		memcpy(ivbuf, &hmac[ZIO_DATA_SALT_LEN], ZIO_DATA_IV_LEN);
 	}
 
 	return (ret);
 }
 
 /*
  * The following functions are used to encode and decode encryption parameters
  * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
  * byte strings, which normally means that these strings would not need to deal
  * with byteswapping at all. However, both blkptr_t and zil_header_t may be
  * byteswapped by lower layers and so we must "undo" that byteswap here upon
  * decoding and encoding in a non-native byteorder. These functions require
  * that the byteorder bit is correct before being called.
  */
 /*
  * Store the salt (8 bytes) and the IV (8 + 4 bytes) in an encrypted bp:
  * the salt goes to blk_dva[2].dva_word[0], the first 8 IV bytes to
  * blk_dva[2].dva_word[1] and the last 4 IV bytes to the IV2 field. Each
  * value is byteswapped first when BP_SHOULD_BYTESWAP(bp) is set.
  */
 void
 zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt_word, iv_word;
 	uint32_t iv_tail;
 
 	ASSERT(BP_IS_ENCRYPTED(bp));
 
 	/* pull the byte strings into integers of the right width */
 	memcpy(&salt_word, salt, sizeof (uint64_t));
 	memcpy(&iv_word, iv, sizeof (uint64_t));
 	memcpy(&iv_tail, iv + sizeof (uint64_t), sizeof (uint32_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt_word = BSWAP_64(salt_word);
 		iv_word = BSWAP_64(iv_word);
 		iv_tail = BSWAP_32(iv_tail);
 	}
 
 	bp->blk_dva[2].dva_word[0] = salt_word;
 	bp->blk_dva[2].dva_word[1] = iv_word;
 	BP_SET_IV2(bp, iv_tail);
 }
 
 /*
  * Extract the salt (8 bytes) and IV (8 + 4 bytes) from a protected bp,
  * undoing the byteswap when BP_SHOULD_BYTESWAP(bp) is set. For
  * authenticated bps both outputs are simply zeroed.
  */
 void
 zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
 {
 	uint64_t salt_word, iv_word;
 	uint32_t iv_tail;
 
 	ASSERT(BP_IS_PROTECTED(bp));
 
 	/* zero the outputs here so that callers need not special-case this */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		memset(salt, 0, ZIO_DATA_SALT_LEN);
 		memset(iv, 0, ZIO_DATA_IV_LEN);
 		return;
 	}
 
 	salt_word = bp->blk_dva[2].dva_word[0];
 	iv_word = bp->blk_dva[2].dva_word[1];
 	iv_tail = (uint32_t)BP_GET_IV2(bp);
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		salt_word = BSWAP_64(salt_word);
 		iv_word = BSWAP_64(iv_word);
 		iv_tail = BSWAP_32(iv_tail);
 	}
 
 	memcpy(salt, &salt_word, sizeof (uint64_t));
 	memcpy(iv, &iv_word, sizeof (uint64_t));
 	memcpy(iv + sizeof (uint64_t), &iv_tail, sizeof (uint32_t));
 }
 
 /*
  * Store a 16-byte MAC in blk_cksum.zc_word[2] and zc_word[3] of the bp,
  * byteswapping each word first when BP_SHOULD_BYTESWAP(bp) is set. Must
  * not be used on objset bps.
  */
 void
 zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t words[2];
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
 
 	memcpy(&words[0], mac, sizeof (uint64_t));
 	memcpy(&words[1], mac + sizeof (uint64_t), sizeof (uint64_t));
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		words[0] = BSWAP_64(words[0]);
 		words[1] = BSWAP_64(words[1]);
 	}
 
 	bp->blk_cksum.zc_word[2] = words[0];
 	bp->blk_cksum.zc_word[3] = words[1];
 }
 
 /*
  * Read the 16-byte MAC out of blk_cksum.zc_word[2] and zc_word[3] of the
  * bp, undoing the byteswap when BP_SHOULD_BYTESWAP(bp) is set. For objset
  * bps the output is zeroed instead.
  */
 void
 zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
 {
 	uint64_t words[2];
 
 	ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
 
 	/* zero the output here so that callers need not special-case this */
 	if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		memset(mac, 0, ZIO_DATA_MAC_LEN);
 		return;
 	}
 
 	words[0] = bp->blk_cksum.zc_word[2];
 	words[1] = bp->blk_cksum.zc_word[3];
 
 	if (BP_SHOULD_BYTESWAP(bp)) {
 		words[0] = BSWAP_64(words[0]);
 		words[1] = BSWAP_64(words[1]);
 	}
 
 	memcpy(mac, &words[0], sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &words[1], sizeof (uint64_t));
 }
 
 void
 zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
 {
 	zil_chain_t *zilc = data;
 
 	memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
 	memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
 	    sizeof (uint64_t));
 }
 
 void
 zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
 {
 	/*
 	 * The ZIL MAC is embedded in the block it protects, which will
 	 * not have been byteswapped by the time this function has been called.
 	 * As a result, we don't need to worry about byteswapping the MAC.
 	 */
 	const zil_chain_t *zilc = data;
 
 	memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
 	memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
 	    sizeof (uint64_t));
 }
 
 /*
  * Given a block of dnodes (src_abd), copy just the bonus buffers into dst
  * at the same offsets they occupy in the source. datalen is the size of
  * both src_abd and the dst buffer, not merely the combined length of the
  * bonus buffers.
  *
  * Only allocated dnodes with an encrypted bonus type and a non-zero bonus
  * length are copied; the walk skips over the extra slots of large dnodes.
  */
 void
 zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
 {
 	uint_t slot = 0;
 	uint_t nslots = datalen >> DNODE_SHIFT;
 	uint8_t *src = abd_borrow_buf_copy(src_abd, datalen);
 	dnode_phys_t *src_dnodes = (dnode_phys_t *)src;
 	dnode_phys_t *dst_dnodes = (dnode_phys_t *)dst;
 
 	while (slot < nslots) {
 		dnode_phys_t *dnp = &src_dnodes[slot];
 
 		if (dnp->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
 		    dnp->dn_bonuslen != 0) {
 			memcpy(DN_BONUS(&dst_dnodes[slot]), DN_BONUS(dnp),
 			    DN_MAX_BONUS_LEN(dnp));
 		}
 
 		/* step over this dnode and any extra slots it occupies */
 		slot += dnp->dn_extra_slots + 1;
 	}
 
 	abd_return_buf(src_abd, src, datalen);
 }
 
 /*
  * This function decides which fields of blk_prop are covered by the
  * various on-disk MAC algorithms, by normalizing every field that is
  * not covered. The bp is modified in place.
  */
 static void
 zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
 {
 	if (version == 0) {
 		/*
 		 * Version 0 failed to zero out all of the non-portable
 		 * fields. This behavior is preserved so that pools on
 		 * this version can still be imported read-only.
 		 */
 		BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
 	} else {
 		ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
 
 		/*
 		 * With the hole_birth feature these fields may be set
 		 * even on a hole. Clearing everything guarantees that
 		 * raw sends work with or without the feature.
 		 */
 		if (BP_IS_HOLE(bp)) {
 			bp->blk_prop = 0ULL;
 			return;
 		}
 
 		/*
 		 * At L0 these fields are verified so that data blocks
 		 * cannot be reinterpreted; for instance, an attacker must
 		 * not be able to flip the compression bits and trick us
 		 * into returning raw lz4 compressed data to the user. At
 		 * higher levels this cannot be enforced, since raw sends
 		 * carry no information about indirect blocks and these
 		 * values may therefore differ on the receive side. This
 		 * opens no new attack vectors, because any alteration of
 		 * a higher level bp must still verify the correct order
 		 * of the layer below it.
 		 */
 		if (BP_GET_LEVEL(bp) != 0) {
 			BP_SET_BYTEORDER(bp, 0);
 			BP_SET_COMPRESS(bp, 0);
 
 			/*
 			 * A psize of zero would trigger asserts; any
 			 * constant value will do.
 			 */
 			BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
 		}
 	}
 
 	/* dedup and checksum are never portable, whatever the version */
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_CHECKSUM(bp, 0);
 }
 
 /*
  * Build the authentication buffer (bab) for one bp: its MAC, its
  * normalized blk_prop in little endian, and zeroed padding. The bp itself
  * is not modified; all work happens on a local copy, which is byteswapped
  * first when should_bswap is set. *bab_len receives the number of bytes of
  * bab that are to be authenticated.
  */
 static void
 zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
     blkptr_auth_buf_t *bab, uint_t *bab_len)
 {
 	blkptr_t local_bp = *bp;
 
 	if (should_bswap)
 		byteswap_uint64_array(&local_bp, sizeof (blkptr_t));
 
 	ASSERT(BP_USES_CRYPT(&local_bp) || BP_IS_HOLE(&local_bp));
 	ASSERT0(BP_IS_EMBEDDED(&local_bp));
 
 	/*
 	 * The MAC has to be decoded before blk_prop is normalized, since
 	 * normalizing may clear the byteorder bit that decoding relies on.
 	 */
 	zio_crypt_decode_mac_bp(&local_bp, bab->bab_mac);
 
 	/* blk_prop is always MAC'd in LE for portability */
 	zio_crypt_bp_zero_nonportable_blkprop(&local_bp, version);
 	bab->bab_prop = LE_64(local_bp.blk_prop);
 	bab->bab_pad = 0ULL;
 
 	/* the padding word is not part of the version 0 format */
 	if (version == 0)
 		*bab_len = sizeof (blkptr_auth_buf_t) - sizeof (uint64_t);
 	else
 		*bab_len = sizeof (blkptr_auth_buf_t);
 }
 
 /*
  * Feed the authentication buffer of one bp into an in-progress HMAC.
  *
  * Returns 0 on success, EIO if crypto_mac_update() fails.
  */
 static int
 zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	crypto_data_t cd;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 
 	cd.cd_format = CRYPTO_DATA_RAW;
 	cd.cd_offset = 0;
 	cd.cd_length = auth_len;
 	cd.cd_raw.iov_base = (char *)&auth;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	if (crypto_mac_update(ctx, &cd) != CRYPTO_SUCCESS)
 		return (SET_ERROR(EIO));
 
 	return (0);
 }
 
 /*
  * Feed the authentication buffer of one bp into an in-progress SHA2
  * checksum.
  */
 static void
 zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 	SHA2Update(ctx, &auth, auth_len);
 }
 
 /*
  * Append the authentication buffer of one bp to an AAD buffer under
  * construction: the bytes are copied to *aadp, then *aadp is advanced and
  * *aad_len is grown by the number of bytes written.
  */
 static void
 zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
     boolean_t should_bswap, blkptr_t *bp)
 {
 	blkptr_auth_buf_t auth;
 	uint_t auth_len;
 
 	zio_crypt_bp_auth_init(version, should_bswap, bp, &auth, &auth_len);
 
 	memcpy(*aadp, &auth, auth_len);
 	*aadp += auth_len;
 	*aad_len += auth_len;
 }
 
 /*
  * Feed one dnode into an in-progress HMAC: first the dnode core (all
  * bytes before dn_blkptr, with non-portable bits masked out), then the
  * authentication buffer of each of its dn_nblkptr block pointers, then
  * that of the spill block pointer if DNODE_FLAG_SPILL_BLKPTR is set.
  *
  * Returns 0 on success, EIO if crypto_mac_update() fails, or the error
  * from zio_crypt_bp_do_hmac_updates().
  */
 static int
 zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
     boolean_t should_bswap, dnode_phys_t *dnp)
 {
 	const size_t core_len = offsetof(dnode_phys_t, dn_blkptr);
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	dnode_phys_t core;
 	crypto_data_t cd;
 	int ret, idx;
 
 	/*
 	 * Work on a private copy of the dnode core. Only the bytes in
 	 * front of dn_blkptr are copied, since nothing past them is
 	 * touched here; the remainder of `core` stays uninitialized.
 	 */
 	memcpy(&core, dnp, core_len);
 
 	if (le_bswap) {
 		core.dn_datablkszsec = BSWAP_16(core.dn_datablkszsec);
 		core.dn_bonuslen = BSWAP_16(core.dn_bonuslen);
 		core.dn_maxblkid = BSWAP_64(core.dn_maxblkid);
 		core.dn_used = BSWAP_64(core.dn_used);
 	}
 
 	/* mask out everything that is not portable */
 	core.dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 	core.dn_used = 0;
 
 	cd.cd_format = CRYPTO_DATA_RAW;
 	cd.cd_offset = 0;
 	cd.cd_length = core_len;
 	cd.cd_raw.iov_base = (char *)&core;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	if (crypto_mac_update(ctx, &cd) != CRYPTO_SUCCESS)
 		return (SET_ERROR(EIO));
 
 	/* the block pointers are read from the original dnode */
 	for (idx = 0; idx < dnp->dn_nblkptr; idx++) {
 		ret = zio_crypt_bp_do_hmac_updates(ctx, version,
 		    should_bswap, &dnp->dn_blkptr[idx]);
 		if (ret != 0)
 			return (ret);
 	}
 
 	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
 		return (0);
 
 	return (zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap,
 	    DN_SPILL_BLKPTR(dnp)));
 }
 
 /*
  * objset_phys_t blocks introduce a number of exceptions to the normal
  * authentication process. objset_phys_t's contain 2 separate HMACS for
  * protecting the integrity of their data. The portable_mac protects the
  * metadnode. This MAC can be sent with a raw send and protects against
  * reordering of data within the metadnode. The local_mac protects the user
  * accounting objects which are not sent from one system to another.
  *
  * In addition, objset blocks are the only blocks that can be modified and
  * written to disk without the key loaded under certain circumstances. During
  * zil_claim() we need to be able to update the zil_header_t to complete
  * claiming log blocks and during raw receives we need to write out the
  * portable_mac from the send file. Both of these actions are possible
  * because these fields are not protected by either MAC so neither one will
  * need to modify the MACs without the key. However, when the modified blocks
  * are written out they will be byteswapped into the host machine's native
  * endianness which will modify fields protected by the MAC. As a result, MAC
  * calculation for objset blocks works slightly differently from other block
  * types. Where other block types MAC the data in whatever endianness is
  * written to disk, objset blocks always MAC little endian version of their
  * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
  * and le_bswap indicates whether a byteswap is needed to get this block
  * into little endian format.
  *
  * Arguments:
  *   key           - supplies the HMAC key (zk_hmac_key) and zk_version
  *   data          - the objset_phys_t block to authenticate
  *   datalen       - size of that block; compared against the
  *                   OBJSET_PHYS_SIZE_V1/V2/V3 constants to decide which
  *                   accounting dnodes are considered
  *   should_bswap  - see above
  *   portable_mac  - output, ZIO_OBJSET_MAC_LEN bytes
  *   local_mac     - output, ZIO_OBJSET_MAC_LEN bytes
  *
  * Returns 0 on success. If any crypto_mac_*() call fails EIO is returned,
  * and an error from zio_crypt_do_dnode_hmac_updates() is passed through;
  * in both cases portable_mac and local_mac are zeroed.
  */
 int
 zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
     boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
 {
 	int ret;
 	crypto_mechanism_t mech;
 	crypto_context_t ctx;
 	crypto_data_t cd;
 	objset_phys_t *osp = data;
 	uint64_t intval;
 	boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
 	uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
 	uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
 
 	/* initialize HMAC mechanism */
 	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
 	mech.cm_param = NULL;
 	mech.cm_param_len = 0;
 
 	/* cd is reused for every update/final call below */
 	cd.cd_format = CRYPTO_DATA_RAW;
 	cd.cd_offset = 0;
 
 	/*
 	 * NOTE(review): none of the "goto error" paths taken between a
 	 * successful crypto_mac_init() and crypto_mac_final() releases ctx
 	 * in this function. Whether the crypto layer cleans it up is not
 	 * visible from here -- confirm.
 	 */
 
 	/* calculate the portable MAC from the portable fields and metadnode */
 	ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/*
 	 * add in the os_type
 	 *
 	 * NOTE(review): the value is used as-is when le_bswap is set and
 	 * swapped otherwise. The resulting bytes are part of the MAC that
 	 * is stored on disk, so do not "fix" the direction of this
 	 * conditional without confirming format compatibility.
 	 */
 	intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
 	cd.cd_length = sizeof (uint64_t);
 	cd.cd_raw.iov_base = (char *)&intval;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_update(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/*
 	 * add in the portable os_flags: bring the flags into host order,
 	 * keep only the portable bits, then swap again on hosts where
 	 * ZFS_HOST_BYTEORDER is zero.
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	cd.cd_length = sizeof (uint64_t);
 	cd.cd_raw.iov_base = (char *)&intval;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_update(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* add in fields from the metadnode */
 	ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 	    should_bswap, &osp->os_meta_dnode);
 	if (ret)
 		goto error;
 
 	/* store the final digest in a temporary buffer and copy what we need */
 	cd.cd_length = SHA512_DIGEST_LENGTH;
 	cd.cd_raw.iov_base = (char *)raw_portable_mac;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_final(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* only the leading ZIO_OBJSET_MAC_LEN bytes of the digest are kept */
 	memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN);
 
 	/*
 	 * This is necessary here as we check next whether
 	 * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to
 	 * decide if the local_mac should be zeroed out. That flag will always
 	 * be set by dmu_objset_id_quota_upgrade_cb() and
 	 * dmu_objset_userspace_upgrade_cb() if useraccounting has been
 	 * completed.
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	boolean_t uacct_incomplete =
 	    !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 
 	/*
 	 * The local MAC protects the user, group and project accounting.
 	 * If these objects are not present, the local MAC is zeroed out.
 	 * This is a successful return: portable_mac has already been
 	 * filled in above.
 	 */
 	if (uacct_incomplete ||
 	    (datalen >= OBJSET_PHYS_SIZE_V3 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen >= OBJSET_PHYS_SIZE_V2 &&
 	    osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
 	    osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
 	    (datalen <= OBJSET_PHYS_SIZE_V1)) {
 		memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 		return (0);
 	}
 
 	/*
 	 * calculate the local MAC from the userused, groupused and (for
 	 * blocks of at least OBJSET_PHYS_SIZE_V3) projectused dnodes
 	 */
 	ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/*
 	 * add in the non-portable os_flags (the complement of the mask
 	 * used for the portable MAC above)
 	 */
 	intval = osp->os_flags;
 	if (should_bswap)
 		intval = BSWAP_64(intval);
 	intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
 	if (!ZFS_HOST_BYTEORDER)
 		intval = BSWAP_64(intval);
 
 	cd.cd_length = sizeof (uint64_t);
 	cd.cd_raw.iov_base = (char *)&intval;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_update(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	/* add in fields from the user accounting dnodes */
 	if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_userused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_groupused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	/* the projectused dnode only counts for V3-sized (or larger) blocks */
 	if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
 	    datalen >= OBJSET_PHYS_SIZE_V3) {
 		ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
 		    should_bswap, &osp->os_projectused_dnode);
 		if (ret)
 			goto error;
 	}
 
 	/* store the final digest in a temporary buffer and copy what we need */
 	cd.cd_length = SHA512_DIGEST_LENGTH;
 	cd.cd_raw.iov_base = (char *)raw_local_mac;
 	cd.cd_raw.iov_len = cd.cd_length;
 
 	ret = crypto_mac_final(ctx, &cd);
 	if (ret != CRYPTO_SUCCESS) {
 		ret = SET_ERROR(EIO);
 		goto error;
 	}
 
 	memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN);
 
 	return (0);
 
 error:
 	/* never leave a partially valid MAC in the caller's buffers */
 	memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN);
 	memset(local_mac, 0, ZIO_OBJSET_MAC_LEN);
 	return (ret);
 }
 
 /*
  * Free the iovec array of a uio, if it has one. The array is assumed to
  * hold exactly uio_iovcnt entries. The uio itself is not freed or reset.
  */
 static void
 zio_crypt_destroy_uio(zfs_uio_t *uio)
 {
 	if (!uio->uio_iov)
 		return;
 
 	kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
 }
 
 /*
  * Parse an uncompressed indirect block and compute a checksum over the
  * portable fields of every bp it contains. The portable fields are the
  * MAC and all of the fields from blk_prop except for the dedup, checksum,
  * and psize bits. For an explanation of the purpose of this, see the
  * comment block on object set authentication.
  *
  * With `generate` set, the first ZIO_DATA_MAC_LEN bytes of the SHA512
  * digest are written to cksum and 0 is returned. Otherwise they are
  * compared against cksum: 0 on a match, ECKSUM on a mismatch.
  */
 static int
 zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
     uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
 {
 	blkptr_t *bps = buf;
 	int idx, nbps = datalen >> SPA_BLKPTRSHIFT;
 	uint8_t digest[SHA512_DIGEST_LENGTH];
 	SHA2_CTX ctx;
 
 	/* fold in the MACs (and portable props) of the layer below */
 	SHA2Init(SHA512, &ctx);
 	for (idx = 0; idx < nbps; idx++) {
 		zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
 		    byteswap, &bps[idx]);
 	}
 	SHA2Final(digest, &ctx);
 
 	if (generate) {
 		memcpy(cksum, digest, ZIO_DATA_MAC_LEN);
 		return (0);
 	}
 
 	if (memcmp(digest, cksum, ZIO_DATA_MAC_LEN) == 0)
 		return (0);
 
 	return (SET_ERROR(ECKSUM));
 }
 
 /*
  * Generate or verify the checksum-of-MACs of an indirect block without
  * knowing its on-disk format version.
  *
  * Returns 0 on success, ECKSUM if verification fails for both formats.
  */
 int
 zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	int ret;
 
 	/*
 	 * Callers do not always have easy access to the on-disk format
 	 * version: it normally lives in the DSL Crypto Key, yet the
 	 * checksum-of-MACs has to be verifiable even when the key is not
 	 * loaded. Rather than doing a ZAP lookup of the version for every
 	 * zio, the current format is tried first and the old format is
 	 * tried only if that fails to verify.
 	 */
 	ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
 	    datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
 	if (ret != ECKSUM)
 		return (ret);
 
 	/* generation cannot produce ECKSUM, so this must be a verify */
 	ASSERT(!generate);
 
 	return (zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
 	    datalen, 0, byteswap, cksum));
 }
 
 /*
  * ABD flavor of zio_crypt_do_indirect_mac_checksum(): borrows a buffer
  * copy of the first datalen bytes of the abd, runs the checksum on it and
  * returns the buffer again.
  */
 int
 zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
     uint_t datalen, boolean_t byteswap, uint8_t *cksum)
 {
 	void *borrowed = abd_borrow_buf_copy(abd, datalen);
 	int ret;
 
 	ret = zio_crypt_do_indirect_mac_checksum(generate, borrowed, datalen,
 	    byteswap, cksum);
 
 	abd_return_buf(abd, borrowed, datalen);
 
 	return (ret);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting ZIL blocks.
  * We do not check for the older ZIL chain because the encryption feature
  * was not available before the newer ZIL chain was introduced. The goal
  * here is to encrypt everything except the blkptr_t of a lr_write_t and
  * the zil_chain_t header. Everything that is not encrypted is authenticated.
  */
 static int
 zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
     zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
     boolean_t *no_crypt)
 {
 	int ret;
 	uint64_t txtype, lr_len, nused;
 	uint_t nr_src, nr_dst, crypt_len;
 	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
 	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
 	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
 	zil_chain_t *zilc;
 	lr_t *lr;
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 
 	/* cipherbuf always needs an extra iovec for the MAC */
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 		nr_src = 0;
 		nr_dst = 1;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 		nr_src = 1;
 		nr_dst = 0;
 	}
 	memset(dst, 0, datalen);
 
 	/* find the start and end record of the log block */
 	zilc = (zil_chain_t *)src;
 	slrp = src + sizeof (zil_chain_t);
 	aadp = aadbuf;
 	nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
 	ASSERT3U(nused, >=, sizeof (zil_chain_t));
 	ASSERT3U(nused, <=, datalen);
 	blkend = src + nused;
 
 	/* calculate the number of encrypted iovecs we will need */
 	for (; slrp < blkend; slrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (!byteswap) {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		} else {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		}
 		ASSERT3U(lr_len, >=, sizeof (lr_t));
 		ASSERT3U(lr_len, <=, blkend - slrp);
 
 		nr_iovecs++;
 		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
 			nr_iovecs++;
 	}
 
 	nr_src += nr_iovecs;
 	nr_dst += nr_iovecs;
 
 	/* allocate the iovec arrays */
 	if (nr_src != 0) {
 		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
 		if (src_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	if (nr_dst != 0) {
 		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
 		if (dst_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	/*
 	 * Copy the plain zil header over and authenticate everything except
 	 * the checksum that will store our MAC. If we are writing the data
 	 * the embedded checksum will not have been calculated yet, so we don't
 	 * authenticate that.
 	 */
 	memcpy(dst, src, sizeof (zil_chain_t));
 	memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t));
 	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
 
 	/* loop over records again, filling in iovecs */
 	nr_iovecs = 0;
 	slrp = src + sizeof (zil_chain_t);
 	dlrp = dst + sizeof (zil_chain_t);
 
 	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
 		lr = (lr_t *)slrp;
 
 		if (!byteswap) {
 			txtype = lr->lrc_txtype;
 			lr_len = lr->lrc_reclen;
 		} else {
 			txtype = BSWAP_64(lr->lrc_txtype);
 			lr_len = BSWAP_64(lr->lrc_reclen);
 		}
 
 		/* copy the common lr_t */
 		memcpy(dlrp, slrp, sizeof (lr_t));
 		memcpy(aadp, slrp, sizeof (lr_t));
 		aadp += sizeof (lr_t);
 		aad_len += sizeof (lr_t);
 
 		ASSERT3P(src_iovecs, !=, NULL);
 		ASSERT3P(dst_iovecs, !=, NULL);
 
 		/*
 		 * If this is a TX_WRITE record we want to encrypt everything
 		 * except the bp if exists. If the bp does exist we want to
 		 * authenticate it.
 		 */
 		if (txtype == TX_WRITE) {
 			const size_t o = offsetof(lr_write_t, lr_blkptr);
 			crypt_len = o - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 
 			/* copy the bp now since it will not be encrypted */
 			memcpy(dlrp + o, slrp + o, sizeof (blkptr_t));
 			memcpy(aadp, slrp + o, sizeof (blkptr_t));
 			aadp += sizeof (blkptr_t);
 			aad_len += sizeof (blkptr_t);
 			nr_iovecs++;
 			total_len += crypt_len;
 
 			if (lr_len != sizeof (lr_write_t)) {
 				crypt_len = lr_len - sizeof (lr_write_t);
 				src_iovecs[nr_iovecs].iov_base =
 				    slrp + sizeof (lr_write_t);
 				src_iovecs[nr_iovecs].iov_len = crypt_len;
 				dst_iovecs[nr_iovecs].iov_base =
 				    dlrp + sizeof (lr_write_t);
 				dst_iovecs[nr_iovecs].iov_len = crypt_len;
 				nr_iovecs++;
 				total_len += crypt_len;
 			}
 		} else if (txtype == TX_CLONE_RANGE) {
 			const size_t o = offsetof(lr_clone_range_t, lr_nbps);
 			crypt_len = o - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 
 			/* copy the bps now since they will not be encrypted */
 			memcpy(dlrp + o, slrp + o, lr_len - o);
 			memcpy(aadp, slrp + o, lr_len - o);
 			aadp += lr_len - o;
 			aad_len += lr_len - o;
 			nr_iovecs++;
 			total_len += crypt_len;
 		} else {
 			crypt_len = lr_len - sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 			nr_iovecs++;
 			total_len += crypt_len;
 		}
 	}
 
 	*no_crypt = (nr_iovecs == 0);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 
 	if (encrypt) {
 		puio->uio_iov = src_iovecs;
 		puio->uio_iovcnt = nr_src;
 		cuio->uio_iov = dst_iovecs;
 		cuio->uio_iovcnt = nr_dst;
 	} else {
 		puio->uio_iov = dst_iovecs;
 		puio->uio_iovcnt = nr_dst;
 		cuio->uio_iov = src_iovecs;
 		cuio->uio_iovcnt = nr_src;
 	}
 
 	return (0);
 
 error:
 	zio_buf_free(aadbuf, datalen);
 	if (src_iovecs != NULL)
 		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
 	if (dst_iovecs != NULL)
 		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
 
 	*enc_len = 0;
 	*authbuf = NULL;
 	*auth_len = 0;
 	*no_crypt = B_FALSE;
 	puio->uio_iov = NULL;
 	puio->uio_iovcnt = 0;
 	cuio->uio_iov = NULL;
 	cuio->uio_iovcnt = 0;
 	return (ret);
 }
 
 /*
  * Special case handling routine for encrypting / decrypting dnode blocks.
  *
  * The dnodes in the source buffer (plainbuf when encrypting, cipherbuf when
  * decrypting) are walked twice: once to count the bonus buffers that must
  * be encrypted, and once to fill in one iovec per such bonus buffer in both
  * the source and destination iovec arrays. During the second pass the dnode
  * core, its block pointers and any bonus buffer that is not encrypted are
  * copied to the destination, and the data to be authenticated is appended
  * to an AAD buffer of datalen bytes obtained from zio_buf_alloc().
  *
  * The ciphertext-side array is allocated with one iovec more than the
  * number of encrypted bonus buffers; this function does not fill that
  * entry in (zio_crypt_init_uios() points the last ciphertext iovec at
  * the MAC).
  *
  * On success 0 is returned, the iovec arrays are attached to puio / cuio,
  * *enc_len holds the summed length of the encrypted iovecs, *authbuf and
  * *auth_len describe the AAD buffer, and *no_crypt is B_TRUE when no bonus
  * buffer needed encryption. On failure the AAD buffer and any iovec arrays
  * are freed, every output is cleared, and the error is returned.
  */
 static int
 zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
     uint_t *auth_len, boolean_t *no_crypt)
 {
 	int ret;
 	uint_t nr_src, nr_dst, crypt_len;
 	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
 	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
 	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
 	uint8_t *src, *dst, *aadp;
 	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
 	uint8_t *aadbuf = zio_buf_alloc(datalen);
 
 	/* whichever side is the ciphertext starts with one extra iovec */
 	if (encrypt) {
 		src = plainbuf;
 		dst = cipherbuf;
 		nr_src = 0;
 		nr_dst = 1;
 	} else {
 		src = cipherbuf;
 		dst = plainbuf;
 		nr_src = 1;
 		nr_dst = 0;
 	}
 
 	sdnp = (dnode_phys_t *)src;
 	ddnp = (dnode_phys_t *)dst;
 	aadp = aadbuf;
 
 	/*
 	 * Count the number of iovecs we will need to do the encryption by
 	 * counting the number of bonus buffers that need to be encrypted.
 	 */
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		/*
 		 * This block may still be byteswapped. However, all of the
 		 * values we use are either uint8_t's (for which byteswapping
 		 * is a noop) or a * != 0 check, which will work regardless
 		 * of whether or not we byteswap.
 		 */
 		if (sdnp[i].dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
 		    sdnp[i].dn_bonuslen != 0) {
 			nr_iovecs++;
 		}
 	}
 
 	nr_src += nr_iovecs;
 	nr_dst += nr_iovecs;
 
 	if (nr_src != 0) {
 		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
 		if (src_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	if (nr_dst != 0) {
 		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
 		if (dst_iovecs == NULL) {
 			ret = SET_ERROR(ENOMEM);
 			goto error;
 		}
 	}
 
 	/* reuse nr_iovecs as the index of the next iovec to fill in */
 	nr_iovecs = 0;
 
 	/*
 	 * Iterate through the dnodes again, this time filling in the uios
 	 * we allocated earlier. We also concatenate any data we want to
 	 * authenticate onto aadbuf.
 	 */
 	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
 		dnp = &sdnp[i];
 
 		/* copy over the core fields and blkptrs (kept as plaintext) */
 		memcpy(&ddnp[i], dnp,
 		    (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp),
 			    sizeof (blkptr_t));
 		}
 
 		/*
 		 * Handle authenticated data. We authenticate everything in
 		 * the dnode that can be brought over when we do a raw send.
 		 * This includes all of the core fields as well as the MACs
 		 * stored in the bp checksums and all of the portable bits
 		 * from blk_prop. We include the dnode padding here in case it
 		 * ever gets used in the future. Some dn_flags and dn_used are
 		 * not portable so we mask those values out of the
 		 * authenticated data.
 		 */
 		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
 		memcpy(aadp, dnp, crypt_len);
 		/* the masking below edits the AAD copy, not the source dnode */
 		adnp = (dnode_phys_t *)aadp;
 		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
 		adnp->dn_used = 0;
 		aadp += crypt_len;
 		aad_len += crypt_len;
 
 		for (j = 0; j < dnp->dn_nblkptr; j++) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, &dnp->dn_blkptr[j]);
 		}
 
 		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
 			    version, byteswap, DN_SPILL_BLKPTR(dnp));
 		}
 
 		/*
 		 * If this bonus buffer needs to be encrypted, we prepare an
 		 * iovec_t. The encryption / decryption functions will fill
 		 * this in for us with the encrypted or decrypted data.
 		 * Otherwise we add the bonus buffer to the authenticated
 		 * data buffer and copy it over to the destination. The
 		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
 		 * we can guarantee alignment with the AES block size
 		 * (128 bits).
 		 */
 		crypt_len = DN_MAX_BONUS_LEN(dnp);
 		if (dnp->dn_type != DMU_OT_NONE &&
 		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
 		    dnp->dn_bonuslen != 0) {
 			ASSERT3U(nr_iovecs, <, nr_src);
 			ASSERT3U(nr_iovecs, <, nr_dst);
 			ASSERT3P(src_iovecs, !=, NULL);
 			ASSERT3P(dst_iovecs, !=, NULL);
 			src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
 			src_iovecs[nr_iovecs].iov_len = crypt_len;
 			dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
 			dst_iovecs[nr_iovecs].iov_len = crypt_len;
 
 			nr_iovecs++;
 			total_len += crypt_len;
 		} else {
 			memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len);
 			memcpy(aadp, DN_BONUS(dnp), crypt_len);
 			aadp += crypt_len;
 			aad_len += crypt_len;
 		}
 	}
 
 	/* hand the results back; the caller now owns aadbuf and the iovecs */
 	*no_crypt = (nr_iovecs == 0);
 	*enc_len = total_len;
 	*authbuf = aadbuf;
 	*auth_len = aad_len;
 
 	if (encrypt) {
 		puio->uio_iov = src_iovecs;
 		puio->uio_iovcnt = nr_src;
 		cuio->uio_iov = dst_iovecs;
 		cuio->uio_iovcnt = nr_dst;
 	} else {
 		puio->uio_iov = dst_iovecs;
 		puio->uio_iovcnt = nr_dst;
 		cuio->uio_iov = src_iovecs;
 		cuio->uio_iovcnt = nr_src;
 	}
 
 	return (0);
 
 error:
 	/* release everything allocated here and clear every output */
 	zio_buf_free(aadbuf, datalen);
 	if (src_iovecs != NULL)
 		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
 	if (dst_iovecs != NULL)
 		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
 
 	*enc_len = 0;
 	*authbuf = NULL;
 	*auth_len = 0;
 	*no_crypt = B_FALSE;
 	puio->uio_iov = NULL;
 	puio->uio_iovcnt = 0;
 	cuio->uio_iov = NULL;
 	cuio->uio_iovcnt = 0;
 	return (ret);
 }
 
 /*
  * Build the uios for a block that is encrypted as a single segment.
  *
  * The plaintext uio receives one iovec covering plainbuf. The ciphertext
  * uio receives two iovecs: the first covers cipherbuf, the second is left
  * unset here (zio_crypt_init_uios() points the last ciphertext iovec at
  * the MAC). *enc_len is set to datalen.
  *
  * Returns 0 on success. On allocation failure any array already allocated
  * is freed, the outputs are cleared, and ENOMEM is returned.
  */
 static int
 zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
     uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio,
     uint_t *enc_len)
 {
 	(void) encrypt;
 	uint_t plain_cnt = 1, cipher_cnt = 2;
 	iovec_t *piov = NULL, *ciov = NULL;
 	int err;
 
 	/* one array per side; the cipher side has room for a second entry */
 	piov = kmem_alloc(plain_cnt * sizeof (iovec_t), KM_SLEEP);
 	if (piov == NULL) {
 		err = SET_ERROR(ENOMEM);
 		goto fail;
 	}
 
 	ciov = kmem_alloc(cipher_cnt * sizeof (iovec_t), KM_SLEEP);
 	if (ciov == NULL) {
 		err = SET_ERROR(ENOMEM);
 		goto fail;
 	}
 
 	/* plaintext side */
 	piov[0].iov_base = plainbuf;
 	piov[0].iov_len = datalen;
 	puio->uio_iov = piov;
 	puio->uio_iovcnt = plain_cnt;
 
 	/* ciphertext side */
 	ciov[0].iov_base = cipherbuf;
 	ciov[0].iov_len = datalen;
 	cuio->uio_iov = ciov;
 	cuio->uio_iovcnt = cipher_cnt;
 
 	*enc_len = datalen;
 
 	return (0);
 
 fail:
 	if (piov != NULL)
 		kmem_free(piov, plain_cnt * sizeof (iovec_t));
 	if (ciov != NULL)
 		kmem_free(ciov, cipher_cnt * sizeof (iovec_t));
 
 	*enc_len = 0;
 	puio->uio_iov = NULL;
 	puio->uio_iovcnt = 0;
 	cuio->uio_iov = NULL;
 	cuio->uio_iovcnt = 0;
 	return (err);
 }
 
 /*
  * Set up the plaintext (puio) and ciphertext (cuio) uios consumed by
  * zio_do_crypt_uio(). ZIL blocks and dnode blocks are parsed by their
  * dedicated helpers, which pick out the pieces to encrypt and collect the
  * additional authenticated data (AAD) into authbuf; every other block type
  * goes through zio_crypt_init_uios_normal() and has no AAD.
  *
  * After the helper succeeds, both uios are marked UIO_SYSSPACE and the last
  * ciphertext iovec is pointed at the MAC. Returns 0 or the helper's error.
  */
 static int
 zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
     uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
     uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
     uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
 {
 	iovec_t *mac_slot;
 	int err;
 
 	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
 
 	/* pick the layout-specific helper */
 	if (ot == DMU_OT_INTENT_LOG) {
 		err = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
 		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
 		    no_crypt);
 	} else if (ot == DMU_OT_DNODE) {
 		err = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
 		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
 		    auth_len, no_crypt);
 	} else {
 		err = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
 		    datalen, puio, cuio, enc_len);
 		*authbuf = NULL;
 		*auth_len = 0;
 		*no_crypt = B_FALSE;
 	}
 
 	if (err != 0)
 		return (err);
 
 	/* both uios reference kernel buffers */
 	puio->uio_segflg = UIO_SYSSPACE;
 	cuio->uio_segflg = UIO_SYSSPACE;
 
 	/* the final ciphertext iovec carries the MAC */
 	mac_slot = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
 	mac_slot->iov_base = mac;
 	mac_slot->iov_len = ZIO_DATA_MAC_LEN;
 
 	return (0);
 }
 
 /*
  * Primary encryption / decryption entrypoint for zio data.
  *
  * The key is chosen first: if salt matches key->zk_salt the current key and
  * its template are used (with zk_salt_lock held as reader for the duration
  * of the operation); otherwise the lock is dropped and a temporary key is
  * derived from the master key and salt with hkdf_sha512() into the on-stack
  * enc_keydata buffer.
  *
  * QAT offload is then attempted for everything except ZIL and dnode blocks.
  * If it is not used, or fails, the software path builds the uios with
  * zio_crypt_init_uios() and runs zio_do_crypt_uio().
  *
  * Returns 0 on success, or the error from key derivation, uio setup or the
  * software crypto call. *no_crypt is written only by the software path
  * (through zio_crypt_init_uios()).
  *
  * Every exit goes through the single cleanup label below so that the
  * derived key material in enc_keydata is zeroed whenever the temporary key
  * was used, including after a successful QAT operation.
  */
 int
 zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
     dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
     uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
     boolean_t *no_crypt)
 {
 	int ret;
 	boolean_t locked = B_FALSE;
 	uint64_t crypt = key->zk_crypt;
 	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
 	uint_t enc_len, auth_len;
 	zfs_uio_t puio, cuio;
 	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
 	crypto_key_t tmp_ckey, *ckey = NULL;
 	crypto_ctx_template_t tmpl;
 	uint8_t *authbuf = NULL;
 
 	/*
 	 * Zero the uios up front so the cleanup path can be reached before
 	 * zio_crypt_init_uios() has attached any iovec arrays to them.
 	 */
 	memset(&puio, 0, sizeof (puio));
 	memset(&cuio, 0, sizeof (cuio));
 
 	/*
 	 * If the needed key is the current one, just use it. Otherwise we
 	 * need to generate a temporary one from the given salt + master key.
 	 * If we are encrypting, we must return a copy of the current salt
 	 * so that it can be stored in the blkptr_t.
 	 */
 	rw_enter(&key->zk_salt_lock, RW_READER);
 	locked = B_TRUE;
 
 	if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
 		ckey = &key->zk_current_key;
 		tmpl = key->zk_current_tmpl;
 	} else {
 		rw_exit(&key->zk_salt_lock);
 		locked = B_FALSE;
 
 		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
 		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
 		if (ret != 0)
 			goto out;
 
 		tmp_ckey.ck_data = enc_keydata;
 		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
 
 		ckey = &tmp_ckey;
 		tmpl = NULL;
 	}
 
 	/*
 	 * Attempt to use QAT acceleration if we can. We currently don't
 	 * do this for metadnode and ZIL blocks, since they have a much
 	 * more involved buffer layout and the qat_crypt() function only
 	 * works in-place.
 	 */
 	if (qat_crypt_use_accel(datalen) &&
 	    ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
 		uint8_t *srcbuf, *dstbuf;
 
 		if (encrypt) {
 			srcbuf = plainbuf;
 			dstbuf = cipherbuf;
 		} else {
 			srcbuf = cipherbuf;
 			dstbuf = plainbuf;
 		}
 
 		ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
 		    dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
 		if (ret == CPA_STATUS_SUCCESS) {
 			/*
 			 * Done in hardware. Leave through the common
 			 * cleanup so a temporary key is wiped here too.
 			 */
 			ret = 0;
 			goto out;
 		}
 		/* If the hardware implementation fails fall back to software */
 	}
 
 	/* create uios for encryption */
 	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
 	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
 	    &authbuf, &auth_len, no_crypt);
 	if (ret != 0)
 		goto out;
 
 	/* perform the encryption / decryption in software */
 	ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
 	    &puio, &cuio, authbuf, auth_len);
 
 out:
 	if (locked)
 		rw_exit(&key->zk_salt_lock);
 	if (authbuf != NULL)
 		zio_buf_free(authbuf, datalen);
 	/* wipe the derived key; the current key lives in *key, not here */
 	if (ckey == &tmp_ckey)
 		memset(enc_keydata, 0, keydata_len);
 	zio_crypt_destroy_uio(&puio);
 	zio_crypt_destroy_uio(&cuio);
 
 	return (ret);
 }
 
 /*
  * ABD front end for zio_do_crypt_data().
  *
  * Linear buffers are borrowed from pabd and cabd, with the input side
  * (plaintext when encrypting, ciphertext when decrypting) borrowed as a
  * copy of the ABD contents. After the call the buffers are handed back:
  * the output side with abd_return_buf_copy(), the input side with
  * abd_return_buf(). This is done the same way whether or not the call
  * failed. Returns the result of zio_do_crypt_data().
  */
 int
 zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
     boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
     uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
 {
 	void *plain, *cipher;
 	int err;
 
 	if (encrypt) {
 		plain = abd_borrow_buf_copy(pabd, datalen);
 		cipher = abd_borrow_buf(cabd, datalen);
 	} else {
 		plain = abd_borrow_buf(pabd, datalen);
 		cipher = abd_borrow_buf_copy(cabd, datalen);
 	}
 
 	err = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
 	    datalen, plain, cipher, no_crypt);
 
 	/* identical hand-back on success and on failure */
 	if (encrypt) {
 		abd_return_buf(pabd, plain, datalen);
 		abd_return_buf_copy(cabd, cipher, datalen);
 	} else {
 		abd_return_buf_copy(pabd, plain, datalen);
 		abd_return_buf(cabd, cipher, datalen);
 	}
 
 	return (err);
 }
 
 /*
  * Kernel builds only: register zfs_key_max_salt_uses as a module parameter
  * (type ulong, mode 0644) together with its description string.
  */
 #if defined(_KERNEL)
-/* CSTYLED */
 module_param(zfs_key_max_salt_uses, ulong, 0644);
 MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
 	"can be used for generating encryption keys before it is rotated");
 #endif
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index f6e014327717..ff1370c543dc 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -1,1149 +1,1148 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  */
 
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 #endif
 #include <linux/fs.h>
 #include <linux/migrate.h>
 #include <sys/file.h>
 #include <sys/dmu_objset.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_project.h>
 #if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \
     defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
 #include <linux/pagemap.h>
 #endif
 #include <linux/fadvise.h>
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 #include <linux/writeback.h>
 #endif
 
 /*
  * When using fallocate(2) to preallocate space, inflate the requested
  * capacity check by 10% to account for the required metadata blocks.
  * The value is expressed as a percentage of the requested size, so the
  * default of 110 corresponds to that 10% inflation.
  */
 static unsigned int zfs_fallocate_reserve_percent = 110;
 
 /*
  * Open a file: run generic_file_open() first and, if that succeeds, call
  * zfs_open() with a held credential inside an fstrans-marked section.
  * Returns the generic_file_open() error, or the negated zfs_open() result
  * (asserted to be <= 0).
  */
 static int
 zpl_open(struct inode *ip, struct file *filp)
 {
 	cred_t *cr = CRED();
 	fstrans_cookie_t cookie;
 	int rc;
 
 	rc = generic_file_open(ip, filp);
 	if (rc != 0)
 		return (rc);
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	rc = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 
 	ASSERT3S(rc, <=, 0);
 	return (rc);
 }
 
 /*
  * Release a file: if the znode has a dirty atime, mark the inode dirty,
  * then call zfs_close() with a held credential. All of this runs inside an
  * fstrans-marked section. Returns the negated zfs_close() result (asserted
  * to be <= 0).
  */
 static int
 zpl_release(struct inode *ip, struct file *filp)
 {
 	cred_t *cr = CRED();
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	int rc;
 
 	if (ITOZ(ip)->z_atime_dirty)
 		zfs_mark_inode_dirty(ip);
 
 	crhold(cr);
 	rc = -zfs_close(ip, filp->f_flags, cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 
 	ASSERT3S(rc, <=, 0);
 	return (rc);
 }
 
 /*
  * Directory iteration: forward to zfs_readdir() for the file's inode with
  * a held credential inside an fstrans-marked section. Returns the negated
  * zfs_readdir() result (asserted to be <= 0).
  */
 static int
 zpl_iterate(struct file *filp, struct dir_context *ctx)
 {
 	cred_t *cr = CRED();
 	fstrans_cookie_t cookie;
 	int rc;
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	rc = -zfs_readdir(file_inode(filp), ctx, cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 
 	ASSERT3S(rc, <=, 0);
 	return (rc);
 }
 
 /*
  * fsync handler. Writes back and waits on the dirty page cache pages in
  * [start, end] with filemap_write_and_wait_range(), then calls zfs_fsync().
  * z_sync_writes_cnt is raised around the page cache flush so that
  * concurrent non-sync writes can notice this sync write (see the comments
  * in the body).
  *
  * Returns the error from zpl_enter() or filemap_write_and_wait_range() if
  * either fails; otherwise the negated zfs_fsync() result (asserted <= 0).
  */
 static int
 zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
 	znode_t *zp = ITOZ(inode);
 	zfsvfs_t *zfsvfs = ITOZSB(inode);
 	cred_t *cr = CRED();
 	int error;
 	fstrans_cookie_t cookie;
 
 	/*
 	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
 	 * tandem so that sync writes can detect if there are any non-sync
 	 * writes going on and vice-versa. The "vice-versa" part to this logic
 	 * is located in zfs_putpage() where non-sync writes check if there are
 	 * any ongoing sync writes. If any sync and non-sync writes overlap,
 	 * we do a commit to complete the non-sync writes since the latter can
 	 * potentially take several seconds to complete and thus block sync
 	 * writes in the upcoming call to filemap_write_and_wait_range().
 	 */
 	atomic_inc_32(&zp->z_sync_writes_cnt);
 	/*
 	 * If the following check does not detect an overlapping non-sync write
 	 * (say because it's just about to start), then it is guaranteed that
 	 * the non-sync write will detect this sync write. This is because we
 	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
 	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
 	 * zfs_putpage() respectively.
 	 */
 	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
 		if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
 			/* undo the increment above before bailing out */
 			atomic_dec_32(&zp->z_sync_writes_cnt);
 			return (error);
 		}
 		zil_commit(zfsvfs->z_log, zp->z_id);
 		zpl_exit(zfsvfs, FTAG);
 	}
 
 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
 
 	/*
 	 * The sync write is not complete yet but we decrement
 	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
 	 * it internally. If a non-sync write starts just after the decrement
 	 * operation but before we call zfs_fsync(), it may not detect this
 	 * overlapping sync write but it does not matter since we have already
 	 * gone past filemap_write_and_wait_range() and we won't block due to
 	 * the non-sync write.
 	 */
 	atomic_dec_32(&zp->z_sync_writes_cnt);
 
 	if (error)
 		return (error);
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	error = -zfs_fsync(zp, datasync, cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 	ASSERT3S(error, <=, 0);
 
 	return (error);
 }
 
 /*
  * Translate the IOCB_* bits set in kiocb->ki_flags into the matching O_*
  * open flags. Only bits whose IOCB_* macro is defined at build time are
  * examined; the result is the OR of the corresponding O_* values.
  */
 static inline int
 zfs_io_flags(struct kiocb *kiocb)
 {
 	int oflags = 0;
 
 #if defined(IOCB_DSYNC)
 	oflags |= (kiocb->ki_flags & IOCB_DSYNC) ? O_DSYNC : 0;
 #endif
 #if defined(IOCB_SYNC)
 	oflags |= (kiocb->ki_flags & IOCB_SYNC) ? O_SYNC : 0;
 #endif
 #if defined(IOCB_APPEND)
 	oflags |= (kiocb->ki_flags & IOCB_APPEND) ? O_APPEND : 0;
 #endif
 #if defined(IOCB_DIRECT)
 	oflags |= (kiocb->ki_flags & IOCB_DIRECT) ? O_DIRECT : 0;
 #endif
 
 	return (oflags);
 }
 
 /*
  * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
  * is true.  This is needed since datasets with inherited "relatime" property
  * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
  * `zfs set relatime=...`), which is what relatime test in VFS by
  * relatime_need_update() is based on.
  */
 static inline void
 zpl_file_accessed(struct file *filp)
 {
 	struct inode *ip = filp->f_mapping->host;
 
 	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
 		if (zfs_relatime_need_update(ip))
 			file_accessed(filp);
 	} else {
 		file_accessed(filp);
 	}
 }
 
 /*
  * Initialize a zfs_uio_t from an iov_iter.
  *
  * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
  * iovecs, kvecs, bvecs and pipes, and all the interfaces needed to
  * manipulate it are available, so the whole iov_iter is attached to the
  * uio and handled in the lower layers. On older kernels the iovec array is
  * extracted and passed instead, tagged UIO_SYSSPACE for ITER_KVEC
  * iterators and UIO_USERSPACE otherwise.
  */
 static void
 zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
     loff_t pos, ssize_t count, size_t skip)
 {
 #if defined(HAVE_VFS_IOV_ITER)
 	zfs_uio_iov_iter_init(uio, to, pos, count, skip);
 #else
 	if (zfs_uio_iov_iter_type(to) & ITER_KVEC) {
 		zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs,
 		    pos, UIO_SYSSPACE, count, skip);
 	} else {
 		zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs,
 		    pos, UIO_USERSPACE, count, skip);
 	}
 #endif
 }
 
 /*
  * Read handler: wrap the iov_iter in a uio starting at kiocb->ki_pos and
  * call zfs_read() with a held credential inside an fstrans-marked section.
  * On failure the negated zfs_read() result is returned. On success the
  * file position is advanced by the number of bytes consumed from the uio,
  * the access time is updated via zpl_file_accessed(), and that byte count
  * is returned.
  */
 static ssize_t
 zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
 {
 	cred_t *cr = CRED();
 	struct file *filp = kiocb->ki_filp;
 	ssize_t want = iov_iter_count(to);
 	fstrans_cookie_t cookie;
 	zfs_uio_t uio;
 	ssize_t rc, nread;
 
 	zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, want, 0);
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	rc = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
 	    filp->f_flags | zfs_io_flags(kiocb), cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 
 	if (rc < 0)
 		return (rc);
 
 	/* whatever is no longer outstanding in the uio was read */
 	nread = want - uio.uio_resid;
 	kiocb->ki_pos += nread;
 
 	zpl_file_accessed(filp);
 
 	return (nread);
 }
 
 /*
  * Wrapper around generic_write_checks(). A positive result is stored in
  * *countp and 0 is returned; a zero or negative result is returned as-is
  * and *countp is left untouched.
  */
 static inline ssize_t
 zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
     size_t *countp)
 {
 	ssize_t checked = generic_write_checks(kiocb, from);
 
 	if (checked > 0) {
 		*countp = checked;
 		return (0);
 	}
 
 	return (checked);
 }
 
 /*
  * Write handler: run zpl_generic_write_checks(), wrap the iov_iter in a uio
  * starting at kiocb->ki_pos, and call zfs_write() with a held credential
  * inside an fstrans-marked section. A non-zero result from the checks or a
  * negative (negated) zfs_write() result is returned directly. On success
  * the file position is advanced by the number of bytes consumed from the
  * uio and that byte count is returned.
  */
 static ssize_t
 zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
 {
 	cred_t *cr = CRED();
 	struct file *filp = kiocb->ki_filp;
 	struct inode *ip = filp->f_mapping->host;
 	fstrans_cookie_t cookie;
 	zfs_uio_t uio;
 	size_t len = 0;
 	ssize_t rc, nwritten;
 
 	rc = zpl_generic_write_checks(kiocb, from, &len);
 	if (rc != 0)
 		return (rc);
 
 	zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, len, from->iov_offset);
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	rc = -zfs_write(ITOZ(ip), &uio,
 	    filp->f_flags | zfs_io_flags(kiocb), cr);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 
 	if (rc < 0)
 		return (rc);
 
 	/* whatever is no longer outstanding in the uio was written */
 	nwritten = len - uio.uio_resid;
 	kiocb->ki_pos += nwritten;
 
 	return (nwritten);
 }
 
 /*
  * Shared body of the zpl_direct_IO() variants. It is never expected to
  * run: it panics, and the trailing return only satisfies the signature.
  */
 static ssize_t
 zpl_direct_IO_impl(void)
 {
 	/*
 	 * All O_DIRECT requests should be handled by
 	 * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code
 	 * should call the direct_IO address_space_operations function. We set
 	 * this code path to be fatal if it is executed.
 	 */
 	/*
 	 * NOTE(review): PANIC() presumably takes a format string; passing 0
 	 * here should be confirmed against the macro's definition.
 	 */
 	PANIC(0);
 	return (0);
 }
 
 /*
  * zpl_direct_IO() is defined with whichever signature the build
  * configuration selected; both variants ignore their arguments and defer
  * to zpl_direct_IO_impl(). The build fails if neither
  * interface macro is defined.
  */
 #if defined(HAVE_VFS_DIRECT_IO_ITER)
 static ssize_t
 zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
 {
 	return (zpl_direct_IO_impl());
 }
 #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
 static ssize_t
 zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
 {
 	return (zpl_direct_IO_impl());
 }
 #else
 #error "Unknown Direct I/O interface"
 #endif
 
 /*
  * Seek handler. When SEEK_HOLE and SEEK_DATA are available, those two
  * whence values are resolved by zfs_holey() under the shared inode lock and
  * the resulting offset is applied with lseek_execute(); a failure from
  * zfs_holey() is returned negated. Every other whence value goes to
  * generic_file_llseek().
  */
 static loff_t
 zpl_llseek(struct file *filp, loff_t offset, int whence)
 {
 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
 	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
 		struct inode *ip = filp->f_mapping->host;
 		loff_t limit = ip->i_sb->s_maxbytes;
 		fstrans_cookie_t cookie;
 		loff_t result;
 
 		spl_inode_lock_shared(ip);
 
 		cookie = spl_fstrans_mark();
 		result = -zfs_holey(ITOZ(ip), whence, &offset);
 		spl_fstrans_unmark(cookie);
 
 		/* zfs_holey() updated offset; now apply it to the file */
 		if (result == 0)
 			result = lseek_execute(filp, ip, offset, limit);
 
 		spl_inode_unlock_shared(ip);
 
 		return (result);
 	}
 #endif /* SEEK_HOLE && SEEK_DATA */
 
 	return (generic_file_llseek(filp, offset, whence));
 }
 
 /*
  * It's worth taking a moment to describe how mmap is implemented
  * for zfs because it differs considerably from other Linux filesystems.
  * However, this issue is handled the same way under OpenSolaris.
  *
  * The issue is that by design zfs bypasses the Linux page cache and
  * leaves all caching up to the ARC.  This has been shown to work
  * well for the common read(2)/write(2) case.  However, mmap(2)
  * is a problem because it relies on being tightly integrated with the
  * page cache.  To handle this we cache mmap'ed files twice, once in
  * the ARC and a second time in the page cache.  The code is careful
  * to keep both copies synchronized.
  *
  * When a file with an mmap'ed region is written to using write(2)
  * both the data in the ARC and existing pages in the page cache
  * are updated.  For a read(2) data will be read first from the page
  * cache then the ARC if needed.  Neither a write(2) nor a read(2) will
  * ever result in new pages being added to the page cache.
  *
  * New pages are added to the page cache only via .readpage() which
  * is called when the vfs needs to read a page off disk to back the
  * virtual memory region.  These pages may be modified without
  * notifying the ARC and will be written out periodically via
  * .writepage().  This will occur due to either a sync or the usual
  * page aging behavior.  Note because a read(2) of a mmap'ed file
  * will always check the page cache first even when the ARC is out
  * of date correct data will still be returned.
  *
  * While this implementation ensures correct behavior it does
  * have some drawbacks.  The most obvious of which is that it
  * increases the required memory footprint when accessing mmap'ed
  * files.  It also adds additional complexity to the code keeping
  * both caches synchronized.
  *
  * Longer term it may be possible to cleanly resolve this wart by
  * mapping page cache pages directly on to the ARC buffers.  The
  * Linux address space operations are flexible enough to allow
  * selection of which pages back a particular index.  The trick
  * would be working out the details of which subsystem is in
  * charge, the ARC, the page cache, or both.  It may also prove
  * helpful to move the ARC buffers to a scatter-gather lists
  * rather than a vmalloc'ed region.
  */
 static int
 zpl_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct inode *ip = filp->f_mapping->host;
 	int error;
 	fstrans_cookie_t cookie;
 
 	cookie = spl_fstrans_mark();
 	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
 	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
 	spl_fstrans_unmark(cookie);
 
 	if (error)
 		return (error);
 
 	error = generic_file_mmap(filp, vma);
 	if (error)
 		return (error);
 
 	return (error);
 }
 
 /*
  * Fill one page for the Linux page cache.  This path exists only to
  * support mmap(2); the ARC holds an identical copy of the data, which is
  * kept current via .write() and .writepage().
  *
  * The page must be locked on entry (asserted) and is unlocked before
  * returning. Returns the negated zfs_getpage() result.
  */
 static inline int
 zpl_readpage_common(struct page *pp)
 {
 	fstrans_cookie_t cookie;
 	int rc;
 
 	ASSERT(PageLocked(pp));
 
 	cookie = spl_fstrans_mark();
 	rc = -zfs_getpage(pp->mapping->host, pp);
 	spl_fstrans_unmark(cookie);
 
 	unlock_page(pp);
 
 	return (rc);
 }
 
 /*
  * Single-page read entry point. Depending on HAVE_VFS_READ_FOLIO it is
  * defined as zpl_read_folio() (taking a folio and passing &folio->page on)
  * or as zpl_readpage() (taking a page). Both ignore filp and forward to
  * zpl_readpage_common().
  */
 #ifdef HAVE_VFS_READ_FOLIO
 static int
 zpl_read_folio(struct file *filp, struct folio *folio)
 {
 	return (zpl_readpage_common(&folio->page));
 }
 #else
 static int
 zpl_readpage(struct file *filp, struct page *pp)
 {
 	return (zpl_readpage_common(pp));
 }
 #endif
 
 /*
  * Filler callback with a (data, page) signature; data is ignored and the
  * page is passed to zpl_readpage_common().
  */
 static int
 zpl_readpage_filler(void *data, struct page *pp)
 {
 	return (zpl_readpage_common(pp));
 }
 
 /*
  * Populate a set of pages with data for the Linux page cache.  This
  * function will only be called for read ahead and never for demand
  * paging.
  *
  * With HAVE_VFS_READPAGES the work is delegated to read_cache_pages(),
  * which is relied on to lock each page for IO and invoke the filler.
  * Otherwise zpl_readahead() pulls pages from the readahead control one at
  * a time, fills each through zpl_readpage_filler(), drops its reference,
  * and stops at the first error.
  */
 #ifdef HAVE_VFS_READPAGES
 static int
 zpl_readpages(struct file *filp, struct address_space *mapping,
     struct list_head *pages, unsigned nr_pages)
 {
 	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
 }
 #else
 static void
 zpl_readahead(struct readahead_control *ractl)
 {
 	for (;;) {
 		struct page *pp = readahead_page(ractl);
 		int err;
 
 		if (pp == NULL)
 			return;
 
 		err = zpl_readpage_filler(NULL, pp);
 		put_page(pp);
 		if (err != 0)
 			return;
 	}
 }
 #endif
 
 /*
  * Per-page writeback callback: hand the page to zfs_putpage() inside an
  * fstrans-marked section. data points at a boolean_t that is forwarded as
  * zfs_putpage()'s for_sync argument. The page must be locked and must not
  * already be under writeback (both asserted). Returns the zfs_putpage()
  * result unchanged.
  */
 static int
 zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 {
 	boolean_t *syncp = data;
 	fstrans_cookie_t cookie;
 	int rc;
 
 	ASSERT(PageLocked(pp));
 	ASSERT(!PageWriteback(pp));
 
 	cookie = spl_fstrans_mark();
 	rc = zfs_putpage(pp->mapping->host, pp, wbc, *syncp);
 	spl_fstrans_unmark(cookie);
 
 	return (rc);
 }
 
 /*
  * Folio-typed writeback callback, built only when HAVE_WRITEPAGE_T_FOLIO
  * is defined: forwards the folio's page to zpl_putpage().
  */
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 static int
 zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
 {
 	return (zpl_putpage(&pp->page, wbc, data));
 }
 #endif
 
 /*
  * Call write_cache_pages() with the writeback callback that matches the
  * kernel's callback type: zpl_putfolio() when HAVE_WRITEPAGE_T_FOLIO is
  * defined, zpl_putpage() otherwise. Returns its result.
  */
 static inline int
 zpl_write_cache_pages(struct address_space *mapping,
     struct writeback_control *wbc, void *data)
 {
 #ifdef HAVE_WRITEPAGE_T_FOLIO
 	return (write_cache_pages(mapping, wbc, zpl_putfolio, data));
 #else
 	return (write_cache_pages(mapping, wbc, zpl_putpage, data));
 #endif
 }
 
 /*
  * Write back the dirty pages of 'mapping'.
  *
  * If the dataset has sync=always the request is upgraded to WB_SYNC_ALL.
  * A WB_SYNC_ALL request is serviced as a WB_SYNC_NONE pass, one
  * zil_commit(), and then a second pass in the original sync mode.
  *
  * Returns the result of the last step executed; a failed zpl_enter() or
  * zpl_enter_verify_zp() returns early with that error.
  */
 static int
 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	znode_t		*zp = ITOZ(mapping->host);
 	zfsvfs_t	*zfsvfs = ITOZSB(mapping->host);
 	enum writeback_sync_modes sync_mode;
 	int result;
 
 	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
 		return (result);
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		wbc->sync_mode = WB_SYNC_ALL;
 	zpl_exit(zfsvfs, FTAG);
 	/* Remember the (possibly upgraded) mode; wbc is rewritten below. */
 	sync_mode = wbc->sync_mode;
 
 	/*
 	 * We don't want to run write_cache_pages() in SYNC mode here, because
 	 * that would make putpage() wait for a single page to be committed to
 	 * disk every single time, resulting in atrocious performance. Instead
 	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
 	 * and then we commit it all in one go.
 	 */
 	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
 	wbc->sync_mode = WB_SYNC_NONE;
 	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
 	/* True only when the caller's mode was not WB_SYNC_NONE. */
 	if (sync_mode != wbc->sync_mode) {
 		/*
 		 * NOTE(review): this early return leaves wbc->sync_mode set
 		 * to WB_SYNC_NONE and discards the first pass's result --
 		 * confirm that is acceptable to callers.
 		 */
 		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (result);
 		if (zfsvfs->z_log != NULL)
 			zil_commit(zfsvfs->z_log, zp->z_id);
 		zpl_exit(zfsvfs, FTAG);
 
 		/*
 		 * We need to call write_cache_pages() again (we can't just
 		 * return after the commit) because the previous call in
 		 * non-SYNC mode does not guarantee that we got all the dirty
 		 * pages (see the implementation of write_cache_pages() for
 		 * details). That being said, this is a no-op in most cases.
 		 */
 		wbc->sync_mode = sync_mode;
 		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
 	}
 	return (result);
 }
 
 /*
  * Write out dirty pages to the ARC, this function is only required to
  * support mmap(2).  Mapped pages may be dirtied by memory operations
  * which never call .write().  These dirty pages are kept in sync with
  * the ARC buffers via this hook.
  */
 static int
 zpl_writepage(struct page *pp, struct writeback_control *wbc)
 {
 	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		wbc->sync_mode = WB_SYNC_ALL;
 
 	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);
 
 	return (zpl_putpage(pp, wbc, &for_sync));
 }
 
 /*
  * The flag combination which matches the behavior of zfs_space() is
  * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
  * flag was introduced in the 2.6.38 kernel.
  *
  * The original mode=0 (allocate space) behavior can be reasonably emulated
  * by checking if enough space exists and creating a sparse file, as real
  * persistent space reservation is not possible due to COW, snapshots, etc.
  *
  * ip:     inode to operate on (locked here for the duration of the call)
  * mode:   FALLOC_FL_* flags; anything other than KEEP_SIZE, PUNCH_HOLE and
  *         ZERO_RANGE is rejected with -EOPNOTSUPP
  * offset: start of the range, must be >= 0
  * len:    length of the range, must be > 0
  *
  * Returns 0 on success or a negative errno.
  */
 static long
 zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
 {
 	cred_t *cr = CRED();
 	loff_t olen;
 	fstrans_cookie_t cookie;
 	int error = 0;
 
 	int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;
 
 	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
 		return (-EOPNOTSUPP);
 
 	if (offset < 0 || len <= 0)
 		return (-EINVAL);
 
 	spl_inode_lock(ip);
 	olen = i_size_read(ip);
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	if (mode & (test_mode)) {
 		flock64_t bf;
 
 		if (mode & FALLOC_FL_KEEP_SIZE) {
 			/* Nothing to free beyond EOF; succeed as a no-op. */
 			if (offset > olen)
 				goto out_unmark;
 
 			/* Clamp the range so the file size is preserved. */
 			if (offset + len > olen)
 				len = olen - offset;
 		}
 		bf.l_type = F_WRLCK;
 		bf.l_whence = SEEK_SET;
 		bf.l_start = offset;
 		bf.l_len = len;
 		bf.l_pid = 0;
 
 		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
 	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
 		unsigned int percent = zfs_fallocate_reserve_percent;
 		struct kstatfs statfs;
 
 		/* Legacy mode, disable fallocate compatibility. */
 		if (percent == 0) {
 			error = -EOPNOTSUPP;
 			goto out_unmark;
 		}
 
 		/*
 		 * Use zfs_statvfs() instead of dmu_objset_space() since it
 		 * also checks project quota limits, which are relevant here.
 		 * As with the other zfs_*() calls in this file, the positive
 		 * errno is negated for the VFS.
 		 */
 		error = -zfs_statvfs(ip, &statfs);
 		if (error)
 			goto out_unmark;
 
 		/*
 		 * Shrink available space a bit to account for overhead/races.
 		 * We know the product previously fit into availbytes from
 		 * dmu_objset_space(), so the smaller product will also fit.
 		 */
 		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
 			error = -ENOSPC;
 			goto out_unmark;
 		}
 		/* Extend the file sparsely unless the size must be kept. */
 		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) {
 			error = -zfs_freesp(ITOZ(ip), offset + len, 0, 0,
 			    FALSE);
 		}
 	}
 out_unmark:
 	spl_fstrans_unmark(cookie);
 	spl_inode_unlock(ip);
 
 	crfree(cr);
 
 	return (error);
 }
 
 /*
  * .fallocate file operation: resolve the inode behind 'filp' and defer to
  * zpl_fallocate_common().
  */
 static long
 zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
 {
 	struct inode *ip = file_inode(filp);
 
 	return (zpl_fallocate_common(ip, mode, offset, len));
 }
 
 /*
  * FS_IOC_GETVERSION handler: copy the inode's i_generation to the user
  * buffer 'arg'.
  *
  * Returns 0 on success or -EFAULT if the user buffer cannot be written.
  * copy_to_user() reports the number of bytes it failed to copy, which must
  * not be returned to userspace as the ioctl result.
  */
 static int
 zpl_ioctl_getversion(struct file *filp, void __user *arg)
 {
 	uint32_t generation = file_inode(filp)->i_generation;
 
 	if (copy_to_user(arg, &generation, sizeof (generation)))
 		return (-EFAULT);
 
 	return (0);
 }
 
 /*
  * .fadvise file operation.
  *
  * POSIX_FADV_SEQUENTIAL and POSIX_FADV_WILLNEED issue a dmu_prefetch() for
  * the range (and, where available, generic_fadvise() when the znode has
  * cached data in that range).  The other known advice values are accepted
  * and ignored.
  *
  * Returns 0, -ESPIPE for a FIFO, -EINVAL for a negative offset/len or an
  * unknown advice value, or the error from zpl_enter_verify_zp().
  */
 static int
 zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
 {
 	struct inode *ip = file_inode(filp);
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	objset_t *os = zfsvfs->z_os;
 	int error = 0;
 
 	if (S_ISFIFO(ip->i_mode))
 		return (-ESPIPE);
 
 	if (offset < 0 || len < 0)
 		return (-EINVAL);
 
 	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_WILLNEED:
 #ifdef HAVE_GENERIC_FADVISE
 		if (zn_has_cached_data(zp, offset, offset + len - 1))
 			error = generic_fadvise(filp, offset, len, advice);
 #endif
 		/*
 		 * Pass on the caller's size directly, but note that
 		 * dmu_prefetch_max will effectively cap it.  If there
 		 * really is a larger sequential access pattern, perhaps
 		 * dmu_zfetch will detect it.
 		 */
 		/*
 		 * len == 0 means "to end of file".
 		 * NOTE(review): if offset is past i_size this yields a
 		 * negative len -- confirm dmu_prefetch() tolerates that.
 		 */
 		if (len == 0)
 			len = i_size_read(ip) - offset;
 
 		dmu_prefetch(os, zp->z_id, 0, offset, len,
 		    ZIO_PRIORITY_ASYNC_READ);
 		break;
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_DONTNEED:
 	case POSIX_FADV_NOREUSE:
 		/* ignored for now */
 		break;
 	default:
 		error = -EINVAL;
 		break;
 	}
 
 	/*
 	 * NOTE(review): entered with zpl_enter_verify_zp() but left with
 	 * zfs_exit() rather than zpl_exit() as zpl_writepages() does --
 	 * confirm the pairing is intended.
 	 */
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 #define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
 #define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
 
 /*
  * Translate the znode's z_pflags into the Linux FS_*_FL / ZFS_PROJINHERIT_FL
  * flag word, masked to ZFS_FL_USER_VISIBLE.
  */
 static uint32_t
 __zpl_ioctl_getflags(struct inode *ip)
 {
 	const uint64_t pflags = ITOZ(ip)->z_pflags;
 	uint32_t fl = 0;
 
 	fl |= (pflags & ZFS_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
 	fl |= (pflags & ZFS_APPENDONLY) ? FS_APPEND_FL : 0;
 	fl |= (pflags & ZFS_NODUMP) ? FS_NODUMP_FL : 0;
 	fl |= (pflags & ZFS_PROJINHERIT) ? ZFS_PROJINHERIT_FL : 0;
 
 	return (fl & ZFS_FL_USER_VISIBLE);
 }
 
 /*
  * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
  * attributes common to both Linux and Solaris are mapped.
  *
  * The resulting 32-bit flag word is copied to the user buffer 'arg'.
  * Returns 0 on success or -EFAULT if the copy fails; copy_to_user()
  * returns an uncopied-byte count, not an errno, so it is not returned
  * directly.
  */
 static int
 zpl_ioctl_getflags(struct file *filp, void __user *arg)
 {
 	uint32_t flags;
 
 	flags = __zpl_ioctl_getflags(file_inode(filp));
 	if (copy_to_user(arg, &flags, sizeof (flags)))
 		return (-EFAULT);
 
 	return (0);
 }
 
 /*
  * fchange() is a helper macro to detect if we have been asked to change a
  * flag. This is ugly, but the requirement that we do this is a consequence of
  * how the Linux file attribute interface was designed. Another consequence is
  * that concurrent modification of files suffers from a TOCTOU race. Neither
  * are things we can fix without modifying the kernel-userland interface, which
  * is outside of our jurisdiction.
  */
 
 #define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
 
 /*
  * Validate a requested Linux flag word against the inode's current
  * z_pflags and fill 'xva' with the attribute changes needed to reach it.
  *
  * ip:          inode being modified
  * ioctl_flags: requested FS_*_FL / ZFS_PROJINHERIT_FL bits
  * xva:         initialized here; one XAT_* request is set per changed flag
  *
  * Returns 0 on success, -EOPNOTSUPP for unsupported bits, -EACCES for bits
  * outside ZFS_FL_USER_MODIFIABLE or a caller that is neither owner nor
  * capable, and -EPERM when changing immutable/append-only without
  * CAP_LINUX_IMMUTABLE.  'xva' is only initialized on the success path.
  */
 static int
 __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
 {
 	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
 	xoptattr_t *xoap;
 
 	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
 	    ZFS_PROJINHERIT_FL))
 		return (-EOPNOTSUPP);
 
 	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
 		return (-EACCES);
 
 	/* Toggling immutable or append-only needs CAP_LINUX_IMMUTABLE. */
 	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
 	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
 	    !capable(CAP_LINUX_IMMUTABLE))
 		return (-EPERM);
 
 	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
 		return (-EACCES);
 
 	xva_init(xva);
 	xoap = xva_getxoptattr(xva);
 
 /*
  * Request attribute 'xflag' and store the new boolean in 'xfield' only when
  * the requested bit differs from the current on-disk bit.
  */
 #define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
 	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
 	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
 		XVA_SET_REQ(xva, (xflag));	\
 		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
 	}	\
 } while (0)
 
 	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
 	    xoap->xoa_immutable);
 	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
 	    xoap->xoa_appendonly);
 	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
 	    xoap->xoa_nodump);
 	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
 	    xoap->xoa_projinherit);
 
 #undef	FLAG_CHANGE
 
 	return (0);
 }
 
 /*
  * FS_IOC_SETFLAGS handler: read a 32-bit flag word from user space,
  * validate it with __zpl_ioctl_setflags() and apply the resulting
  * attribute changes through zfs_setattr().
  *
  * Returns 0 or a negative errno (-EFAULT if the flag word cannot be read).
  */
 static int
 zpl_ioctl_setflags(struct file *filp, void __user *arg)
 {
 	struct inode *ip = file_inode(filp);
 	cred_t *cr = CRED();
 	fstrans_cookie_t cookie;
 	uint32_t ioctl_flags;
 	xvattr_t xva;
 	int error;
 
 	if (copy_from_user(&ioctl_flags, arg, sizeof (ioctl_flags)) != 0)
 		return (-EFAULT);
 
 	error = __zpl_ioctl_setflags(ip, ioctl_flags, &xva);
 	if (error == 0) {
 		crhold(cr);
 		cookie = spl_fstrans_mark();
 		error = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr,
 		    zfs_init_idmap);
 		spl_fstrans_unmark(cookie);
 		crfree(cr);
 	}
 
 	return (error);
 }
 
 /*
  * ZFS_IOC_FSGETXATTR handler: report the inode's flag word and project id
  * in a zero-initialized zfsxattr_t copied to the user buffer 'arg'.
  *
  * Returns 0 on success or -EFAULT if the copy fails; copy_to_user()
  * returns an uncopied-byte count, not an errno, so it is not returned
  * directly.
  */
 static int
 zpl_ioctl_getxattr(struct file *filp, void __user *arg)
 {
 	zfsxattr_t fsx = { 0 };
 	struct inode *ip = file_inode(filp);
 
 	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
 	fsx.fsx_projid = ITOZ(ip)->z_projid;
 	if (copy_to_user(arg, &fsx, sizeof (fsx)))
 		return (-EFAULT);
 
 	return (0);
 }
 
 /*
  * ZFS_IOC_FSSETXATTR handler: read a zfsxattr_t from user space, validate
  * its project id and flag word, then apply both through zfs_setattr().
  *
  * Returns 0 or a negative errno (-EFAULT on a failed read, -EINVAL for an
  * invalid project id).
  */
 static int
 zpl_ioctl_setxattr(struct file *filp, void __user *arg)
 {
 	struct inode *ip = file_inode(filp);
 	cred_t *cr = CRED();
 	fstrans_cookie_t cookie;
 	zfsxattr_t fsx;
 	xvattr_t xva;
 	int error;
 
 	if (copy_from_user(&fsx, arg, sizeof (fsx)) != 0)
 		return (-EFAULT);
 
 	if (!zpl_is_valid_projid(fsx.fsx_projid))
 		return (-EINVAL);
 
 	error = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
 	if (error != 0)
 		return (error);
 
 	/* Add the project id to the attribute request built above. */
 	XVA_SET_REQ(&xva, XAT_PROJID);
 	xva_getxoptattr(&xva)->xoa_projid = fsx.fsx_projid;
 
 	crhold(cr);
 	cookie = spl_fstrans_mark();
 	error = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
 	spl_fstrans_unmark(cookie);
 	crfree(cr);
 
 	return (error);
 }
 
 /*
  * Expose Additional File Level Attributes of ZFS.
  *
  * Copies the inode's z_pflags, masked to ZFS_DOS_FL_USER_VISIBLE, to the
  * user buffer 'arg' as a 64-bit value.  Returns 0 on success or -EFAULT if
  * the copy fails; copy_to_user() returns an uncopied-byte count, not an
  * errno, so it is not returned directly.
  */
 static int
 zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
 {
 	struct inode *ip = file_inode(filp);
 	uint64_t dosflags = ITOZ(ip)->z_pflags;
 
 	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
 	if (copy_to_user(arg, &dosflags, sizeof (dosflags)))
 		return (-EFAULT);
 
 	return (0);
 }
 
 /*
  * Validate a requested ZFS "DOS" flag word against the inode's current
  * z_pflags and fill 'xva' with the attribute changes needed to reach it.
  * Here the ioctl bits and the z_pflags bits use the same ZFS_* constants.
  *
  * Returns 0 on success, -EOPNOTSUPP for bits outside
  * ZFS_DOS_FL_USER_VISIBLE, -EPERM when changing immutable/append-only
  * without CAP_LINUX_IMMUTABLE, and -EACCES when the caller is neither owner
  * nor capable.  'xva' is only initialized on the success path.
  */
 static int
 __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
 {
 	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
 	xoptattr_t *xoap;
 
 	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
 		return (-EOPNOTSUPP);
 
 	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
 	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
 	    !capable(CAP_LINUX_IMMUTABLE))
 		return (-EPERM);
 
 	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
 		return (-EACCES);
 
 	xva_init(xva);
 	xoap = xva_getxoptattr(xva);
 
 /*
  * Request attribute 'xflag' and store the new boolean in 'xfield' only when
  * bit 'iflag' differs between the requested and the current flag word.
  */
 #define	FLAG_CHANGE(iflag, xflag, xfield)	do {	\
 	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
 	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
 		XVA_SET_REQ(xva, (xflag));	\
 		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
 	}	\
 } while (0)
 
 	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
 	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
 	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
 	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
 	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
 	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
 	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
 	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
 	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
 	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
 	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);
 
 #undef	FLAG_CHANGE
 
 	return (0);
 }
 
 /*
  * Set Additional File Level Attributes of ZFS.
  *
  * Reads a 64-bit flag word from user space, validates it with
  * __zpl_ioctl_setdosflags() and applies the changes via zfs_setattr().
  * Returns 0 or a negative errno (-EFAULT if the flag word cannot be read).
  */
 static int
 zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
 {
 	struct inode *ip = file_inode(filp);
 	cred_t *cr = CRED();
 	fstrans_cookie_t cookie;
 	uint64_t ioctl_flags;
 	xvattr_t xva;
 	int error;
 
 	if (copy_from_user(&ioctl_flags, arg, sizeof (ioctl_flags)) != 0)
 		return (-EFAULT);
 
 	error = __zpl_ioctl_setdosflags(ip, ioctl_flags, &xva);
 	if (error == 0) {
 		crhold(cr);
 		cookie = spl_fstrans_mark();
 		error = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr,
 		    zfs_init_idmap);
 		spl_fstrans_unmark(cookie);
 		crfree(cr);
 	}
 
 	return (error);
 }
 
 static long
 zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 	case FS_IOC_GETVERSION:
 		return (zpl_ioctl_getversion(filp, (void *)arg));
 	case FS_IOC_GETFLAGS:
 		return (zpl_ioctl_getflags(filp, (void *)arg));
 	case FS_IOC_SETFLAGS:
 		return (zpl_ioctl_setflags(filp, (void *)arg));
 	case ZFS_IOC_FSGETXATTR:
 		return (zpl_ioctl_getxattr(filp, (void *)arg));
 	case ZFS_IOC_FSSETXATTR:
 		return (zpl_ioctl_setxattr(filp, (void *)arg));
 	case ZFS_IOC_GETDOSFLAGS:
 		return (zpl_ioctl_getdosflags(filp, (void *)arg));
 	case ZFS_IOC_SETDOSFLAGS:
 		return (zpl_ioctl_setdosflags(filp, (void *)arg));
 	case ZFS_IOC_COMPAT_FICLONE:
 		return (zpl_ioctl_ficlone(filp, (void *)arg));
 	case ZFS_IOC_COMPAT_FICLONERANGE:
 		return (zpl_ioctl_ficlonerange(filp, (void *)arg));
 	case ZFS_IOC_COMPAT_FIDEDUPERANGE:
 		return (zpl_ioctl_fideduperange(filp, (void *)arg));
 	default:
 		return (-ENOTTY);
 	}
 }
 
 #ifdef CONFIG_COMPAT
 /*
  * Compat ioctl entry point: map the three supported FS_IOC32_* commands to
  * their native numbers and forward to zpl_ioctl() with the argument
  * converted by compat_ptr().  Anything else returns -ENOTTY.
  */
 static long
 zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	unsigned int native_cmd;
 
 	if (cmd == FS_IOC32_GETVERSION)
 		native_cmd = FS_IOC_GETVERSION;
 	else if (cmd == FS_IOC32_GETFLAGS)
 		native_cmd = FS_IOC_GETFLAGS;
 	else if (cmd == FS_IOC32_SETFLAGS)
 		native_cmd = FS_IOC_SETFLAGS;
 	else
 		return (-ENOTTY);
 
 	return (zpl_ioctl(filp, native_cmd, (unsigned long)compat_ptr(arg)));
 }
 #endif /* CONFIG_COMPAT */
 
 /*
  * Address space (page cache) operations.  Each #ifdef pair picks the member
  * name matching the HAVE_* macro defined for the build, so exactly one
  * readahead hook, one single-page read hook and one migrate hook is set.
  */
 const struct address_space_operations zpl_address_space_operations = {
 #ifdef HAVE_VFS_READPAGES
 	.readpages	= zpl_readpages,
 #else
 	.readahead	= zpl_readahead,
 #endif
 #ifdef HAVE_VFS_READ_FOLIO
 	.read_folio	= zpl_read_folio,
 #else
 	.readpage	= zpl_readpage,
 #endif
 	.writepage	= zpl_writepage,
 	.writepages	= zpl_writepages,
 	.direct_IO	= zpl_direct_IO,
 #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
 	.set_page_dirty = __set_page_dirty_nobuffers,
 #endif
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 	.dirty_folio	= filemap_dirty_folio,
 #endif
 #ifdef HAVE_VFS_MIGRATE_FOLIO
 	.migrate_folio	= migrate_folio,
 #else
 	.migratepage	= migrate_page,
 #endif
 };
 
 /*
  * File operations table (zpl_file_operations).  Optional members are
  * compiled in only when the corresponding HAVE_* / CONFIG_COMPAT macro is
  * defined.
  */
 const struct file_operations zpl_file_operations = {
 	.open		= zpl_open,
 	.release	= zpl_release,
 	.llseek		= zpl_llseek,
 	.read_iter	= zpl_iter_read,
 	.write_iter	= zpl_iter_write,
 #ifdef HAVE_VFS_IOV_ITER
 #ifdef HAVE_COPY_SPLICE_READ
 	.splice_read	= copy_splice_read,
 #else
 	.splice_read	= generic_file_splice_read,
 #endif
 	.splice_write	= iter_file_splice_write,
 #endif
 	.mmap		= zpl_mmap,
 	.fsync		= zpl_fsync,
 	.fallocate	= zpl_fallocate,
 	.copy_file_range	= zpl_copy_file_range,
 #ifdef HAVE_VFS_CLONE_FILE_RANGE
 	.clone_file_range	= zpl_clone_file_range,
 #endif
 #ifdef HAVE_VFS_REMAP_FILE_RANGE
 	.remap_file_range	= zpl_remap_file_range,
 #endif
 #ifdef HAVE_VFS_DEDUPE_FILE_RANGE
 	.dedupe_file_range	= zpl_dedupe_file_range,
 #endif
 	.fadvise	= zpl_fadvise,
 	.unlocked_ioctl	= zpl_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= zpl_compat_ioctl,
 #endif
 };
 
 /*
  * File operations table for directories (zpl_dir_file_operations).  It
  * shares the fsync and ioctl handlers with zpl_file_operations.
  */
 const struct file_operations zpl_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.iterate_shared	= zpl_iterate,
 	.fsync		= zpl_fsync,
 	.unlocked_ioctl = zpl_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = zpl_compat_ioctl,
 #endif
 };
 
-/* CSTYLED */
 module_param(zfs_fallocate_reserve_percent, uint, 0644);
 MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
 	"Percentage of length to use for the available capacity check");
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 47aa6417068d..7c9aae6a66af 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1,1943 +1,1940 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
  * Copyright (c) 2024, Klara, Inc.
  */
 
 #include <sys/dataset_kstats.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
 #include <cityhash.h>
 
 #include <linux/blkdev_compat.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/workqueue.h>
 #include <linux/blk-mq.h>
 
 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
     struct request *rq, boolean_t force_sync);
 
 static unsigned int zvol_major = ZVOL_MAJOR;
 static unsigned int zvol_request_sync = 0;
 static unsigned int zvol_prefetch_bytes = (128 * 1024);
 static unsigned long zvol_max_discard_blocks = 16384;
 
 /*
  * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
  * to utilize more threads for small files but may affect prefetch hits.
  */
 #define	ZVOL_TASKQ_OFFSET_SHIFT 29
 
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 static unsigned int zvol_open_timeout_ms = 1000;
 #endif
 
 static unsigned int zvol_threads = 0;
 static unsigned int zvol_blk_mq_threads = 0;
 static unsigned int zvol_blk_mq_actual_threads;
 static boolean_t zvol_use_blk_mq = B_FALSE;
 
 /*
  * The maximum number of volblocksize blocks to process per thread.  Typically,
  * write heavy workloads perform better with higher values here, and read
  * heavy workloads perform better with lower values, but that's not a hard
  * and fast rule.  It's basically a knob to tune between "less overhead with
  * less parallelism" and "more overhead, but more parallelism".
  *
  * '8' was chosen as a reasonable, balanced, default based off of sequential
  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  */
 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
 
 static unsigned int zvol_num_taskqs = 0;
 
 #ifndef	BLKDEV_DEFAULT_RQ
 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
 #define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
 #endif
 
 /*
  * Finalize our BIO or request.
  */
 static inline void
 zvol_end_io(struct bio *bio, struct request *rq, int error)
 {
 	if (bio) {
 		bio->bi_status = errno_to_bi_status(-error);
 		bio_endio(bio);
 	} else {
 		blk_mq_end_request(rq, errno_to_bi_status(error));
 	}
 }
 
 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 static unsigned int zvol_actual_blk_mq_queue_depth;
 
 /*
  * Linux-specific per-zvol state, reached through zv->zv_zso.
  */
 struct zvol_state_os {
 	struct gendisk		*zvo_disk;	/* generic disk */
 	struct request_queue	*zvo_queue;	/* request queue */
 	dev_t			zvo_dev;	/* device id */
 
 	/* blk-mq tag set, filled in by zvol_blk_mq_alloc_tag_set() */
 	struct blk_mq_tag_set tag_set;
 
 	/* Set from the global 'zvol_use_blk_mq' at zvol load */
 	boolean_t use_blk_mq;
 };
 
 /* Array of taskqs that zvol requests are spread across. */
 typedef struct zv_taskq {
 	uint_t tqs_cnt;		/* number of entries in tqs_taskq */
 	taskq_t **tqs_taskq;
 } zv_taskq_t;
 static zv_taskq_t zvol_taskqs;
 static struct ida zvol_ida;
 
 /*
  * One I/O to process: the target zvol plus either a BIO or a blk-mq
  * request (the other pointer is NULL).
  */
 typedef struct zv_request_stack {
 	zvol_state_t	*zv;
 	struct bio	*bio;
 	struct request *rq;
 } zv_request_t;
 
 /* NOTE(review): not referenced in the visible part of this file. */
 typedef struct zv_work {
 	struct request  *rq;
 	struct work_struct work;
 } zv_work_t;
 
 /* Heap-allocated copy of a zv_request_t together with its taskq entry. */
 typedef struct zv_request_task {
 	zv_request_t zvr;
 	taskq_ent_t	ent;
 } zv_request_task_t;
 
 /*
  * Allocate a task (KM_SLEEP), initialize its taskq entry and store a copy
  * of 'zvr' in it.  Release with zv_request_task_free().
  */
 static zv_request_task_t *
 zv_request_task_create(zv_request_t zvr)
 {
 	zv_request_task_t *task = kmem_alloc(sizeof (*task), KM_SLEEP);
 
 	taskq_init_ent(&task->ent);
 	task->zvr = zvr;
 
 	return (task);
 }
 
 /*
  * Free a task allocated by zv_request_task_create().
  */
 static void
 zv_request_task_free(zv_request_task_t *task)
 {
 	kmem_free(task, sizeof (zv_request_task_t));
 }
 
 /*
  * This is called when a new block multiqueue request comes in.  A request
  * contains one or more BIOs.
  *
  * Passthrough requests are failed with BLK_STS_IOERR; everything else is
  * handed to zvol_request_impl() with force_sync = 0 and BLK_STS_OK is
  * returned.
  */
 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
     const struct blk_mq_queue_data *bd)
 {
 	struct request *rq = bd->rq;
 	zvol_state_t *zv = rq->q->queuedata;
 
 	/* Tell the kernel that we are starting to process this request */
 	blk_mq_start_request(rq);
 
 	if (blk_rq_is_passthrough(rq)) {
 		/* Skip non filesystem request */
 		/*
 		 * NOTE(review): the request is ended here and an error
 		 * status is also returned; confirm the blk-mq core does not
 		 * complete the request a second time for a non-OK return.
 		 */
 		blk_mq_end_request(rq, BLK_STS_IOERR);
 		return (BLK_STS_IOERR);
 	}
 
 	zvol_request_impl(zv, NULL, rq, 0);
 
 	/* Acknowledge to the kernel that we got this request */
 	return (BLK_STS_OK);
 }
 
 /* blk-mq operations installed in each zvol's tag set. */
 static struct blk_mq_ops zvol_blk_mq_queue_ops = {
 	.queue_rq = zvol_mq_queue_rq,
 };
 
 /*
  * Initialize our blk-mq struct: zero the zvol's tag set, fill it in from
  * the module-wide blk-mq settings and register it with
  * blk_mq_alloc_tag_set(), whose return value is passed back.
  */
 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
 {
 	struct blk_mq_tag_set *set = &zv->zv_zso->tag_set;
 
 	memset(set, 0, sizeof (*set));
 
 	set->ops = &zvol_blk_mq_queue_ops;
 	set->nr_hw_queues = zvol_blk_mq_actual_threads;
 	set->queue_depth = zvol_actual_blk_mq_queue_depth;
 	set->numa_node = NUMA_NO_NODE;
 	set->cmd_size = 0;
 
 	/*
 	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
 	 * zvol_request_impl()
 	 */
 	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
 	set->driver_data = zv;
 
 	return (blk_mq_alloc_tag_set(set));
 }
 
 /*
  * Given a path, return TRUE if path is a ZVOL, i.e. the path resolves to a
  * block device whose major number equals zvol_major.  A path that cannot
  * be resolved yields B_FALSE.
  */
 boolean_t
 zvol_os_is_zvol(const char *path)
 {
 	dev_t dev = 0;
 
 	if (vdev_lookup_bdev(path, &dev) != 0)
 		return (B_FALSE);
 
 	return ((MAJOR(dev) == zvol_major) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Service a write (or flush) described by 'zvr', which carries either a
  * BIO or a blk-mq request.
  *
  * The data is written in chunks of at most DMU_MAX_ACCESS / 2 bytes, each
  * in its own transaction and logged with zvol_log_write().  The ZIL is
  * committed up front for a flush, and afterwards for FUA or sync=always.
  * zv_suspend_lock (taken by zvol_request_impl() before the write is
  * dispatched) is released here, and the I/O is completed with
  * zvol_end_io().
  */
 static void
 zvol_write(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
 	struct request *rq = zvr->rq;
 	int error = 0;
 	zfs_uio_t uio;
 	zvol_state_t *zv = zvr->zv;
 	struct request_queue *q;
 	struct gendisk *disk;
 	unsigned long start_time = 0;
 	boolean_t acct = B_FALSE;
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	ASSERT3P(zv->zv_zilog, !=, NULL);
 
 	q = zv->zv_zso->zvo_queue;
 	disk = zv->zv_zso->zvo_disk;
 
 	/* bio marked as FLUSH need to flush before write */
 	if (io_is_flush(bio, rq))
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	/* Some requests are just for flush and nothing else. */
 	if (io_size(bio, rq) == 0) {
 		rw_exit(&zv->zv_suspend_lock);
 		zvol_end_io(bio, rq, 0);
 		return;
 	}
 
 	zfs_uio_bvec_init(&uio, bio, rq);
 
 	/* Saved so the number of bytes written can be computed afterwards. */
 	ssize_t start_resid = uio.uio_resid;
 
 	/*
 	 * With use_blk_mq, accounting is done by blk_mq_start_request()
 	 * and blk_mq_end_request(), so we can skip it here.
 	 */
 	if (bio) {
 		acct = blk_queue_io_stat(q);
 		if (acct) {
 			start_time = blk_generic_start_io_acct(q, disk, WRITE,
 			    bio);
 		}
 	}
 
 	boolean_t sync =
 	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    uio.uio_loffset, uio.uio_resid, RL_WRITER);
 
 	uint64_t volsize = zv->zv_volsize;
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t off = uio.uio_loffset;
 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 
 		if (bytes > volsize - off)	/* don't write past the end */
 			bytes = volsize - off;
 
 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 
 		/* This will only fail for ENOSPC */
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			break;
 		}
 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 		if (error == 0) {
 			zvol_log_write(zv, tx, off, bytes, sync);
 		}
 		/* The tx is committed even when the write itself failed. */
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_rangelock_exit(lr);
 
 	int64_t nwritten = start_resid - uio.uio_resid;
 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 	task_io_account_write(nwritten);
 
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	rw_exit(&zv->zv_suspend_lock);
 
 	if (bio && acct) {
 		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 	}
 
 	/* 'error' is a positive errno here; zvol_end_io() is given -error. */
 	zvol_end_io(bio, rq, -error);
 }
 
 static void
 zvol_write_task(void *arg)
 {
 	zv_request_task_t *task = arg;
 	zvol_write(&task->zvr);
 	zv_request_task_free(task);
 }
 
 /*
  * Service a discard / secure-erase described by 'zvr', which carries
  * either a BIO or a blk-mq request.
  *
  * A range extending past the end of the volume fails with EIO.  Unless a
  * secure erase was requested the range is shrunk inward to volblocksize
  * boundaries, and a range that becomes empty completes successfully
  * without doing anything.  Otherwise the truncate is logged and the range
  * freed with dmu_free_long_range().  zv_suspend_lock is released here and
  * the I/O is completed with zvol_end_io().
  */
 static void
 zvol_discard(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
 	struct request *rq = zvr->rq;
 	zvol_state_t *zv = zvr->zv;
 	uint64_t start = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
 	uint64_t end = start + size;
 	boolean_t sync;
 	int error = 0;
 	dmu_tx_t *tx;
 	struct request_queue *q = zv->zv_zso->zvo_queue;
 	struct gendisk *disk = zv->zv_zso->zvo_disk;
 	unsigned long start_time = 0;
 	boolean_t acct = B_FALSE;
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	ASSERT3P(zv->zv_zilog, !=, NULL);
 
 	/* Manual I/O accounting for the BIO path; counted as a WRITE. */
 	if (bio) {
 		acct = blk_queue_io_stat(q);
 		if (acct) {
 			start_time = blk_generic_start_io_acct(q, disk, WRITE,
 			    bio);
 		}
 	}
 
 	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	if (end > zv->zv_volsize) {
 		error = SET_ERROR(EIO);
 		goto unlock;
 	}
 
 	/*
 	 * Align the request to volume block boundaries when a secure erase is
 	 * not required.  This will prevent dnode_free_range() from zeroing out
 	 * the unaligned parts which is slow (read-modify-write) and useless
 	 * since we are not freeing any space by doing so.
 	 */
 	if (!io_is_secure_erase(bio, rq)) {
 		start = P2ROUNDUP(start, zv->zv_volblocksize);
 		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
 		size = end - start;
 	}
 
 	/* Nothing left after alignment: succeed without freeing anything. */
 	if (start >= end)
 		goto unlock;
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    start, size, RL_WRITER);
 
 	tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 	} else {
 		/* Log the truncate in this tx, then free the range. */
 		zvol_log_truncate(zv, tx, start, size);
 		dmu_tx_commit(tx);
 		error = dmu_free_long_range(zv->zv_objset,
 		    ZVOL_OBJ, start, size);
 	}
 	zfs_rangelock_exit(lr);
 
 	if (error == 0 && sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 unlock:
 	rw_exit(&zv->zv_suspend_lock);
 
 	if (bio && acct) {
 		blk_generic_end_io_acct(q, disk, WRITE, bio,
 		    start_time);
 	}
 
 	/* 'error' is a positive errno here; zvol_end_io() is given -error. */
 	zvol_end_io(bio, rq, -error);
 }
 
 static void
 zvol_discard_task(void *arg)
 {
 	zv_request_task_t *task = arg;
 	zvol_discard(&task->zvr);
 	zv_request_task_free(task);
 }
 
 /*
  * Service a read described by 'zvr', which carries either a BIO or a
  * blk-mq request.
  *
  * The data is read in chunks of at most DMU_MAX_ACCESS / 2 bytes under a
  * reader range lock, never past the end of the volume.  ECKSUM is reported
  * as EIO.  zv_suspend_lock is released here (presumably taken by the
  * caller -- the acquisition is not in this function) and the I/O is
  * completed with zvol_end_io().
  */
 static void
 zvol_read(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
 	struct request *rq = zvr->rq;
 	int error = 0;
 	zfs_uio_t uio;
 	boolean_t acct = B_FALSE;
 	zvol_state_t *zv = zvr->zv;
 	struct request_queue *q;
 	struct gendisk *disk;
 	unsigned long start_time = 0;
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	zfs_uio_bvec_init(&uio, bio, rq);
 
 	q = zv->zv_zso->zvo_queue;
 	disk = zv->zv_zso->zvo_disk;
 
 	/* Saved so the number of bytes read can be computed afterwards. */
 	ssize_t start_resid = uio.uio_resid;
 
 	/*
 	 * When blk-mq is being used, accounting is done by
 	 * blk_mq_start_request() and blk_mq_end_request().
 	 */
 	if (bio) {
 		acct = blk_queue_io_stat(q);
 		if (acct)
 			start_time = blk_generic_start_io_acct(q, disk, READ,
 			    bio);
 	}
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    uio.uio_loffset, uio.uio_resid, RL_READER);
 
 	uint64_t volsize = zv->zv_volsize;
 
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 
 		/* don't read past the end */
 		if (bytes > volsize - uio.uio_loffset)
 			bytes = volsize - uio.uio_loffset;
 
 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 	}
 	zfs_rangelock_exit(lr);
 
 	int64_t nread = start_resid - uio.uio_resid;
 	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
 	task_io_account_read(nread);
 
 	rw_exit(&zv->zv_suspend_lock);
 
 	if (bio && acct) {
 		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
 	}
 
 	/* 'error' is a positive errno here; zvol_end_io() is given -error. */
 	zvol_end_io(bio, rq, -error);
 }
 
 static void
 zvol_read_task(void *arg)
 {
 	zv_request_task_t *task = arg;
 	zvol_read(&task->zvr);
 	zv_request_task_free(task);
 }
 
 
 /*
  * Process a BIO or request
  *
  * Either 'bio' or 'rq' should be set depending on if we are processing a
  * bio or a request (both should not be set).
  *
  * force_sync:	Set to 0 to defer processing to a background taskq
  *			Set to 1 to process data synchronously
  */
 static void
 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
     boolean_t force_sync)
 {
 	/* Paired with spl_fstrans_unmark() at the "out" label below. */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
 	int rw = io_data_dir(bio, rq);
 
 	/* Fail the I/O with ENXIO when the zvol is flagged ZVOL_REMOVING. */
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
 		goto out;
 	}
 
 	/*
 	 * The zvol_request_sync tunable, or a zvol whose zv_threading is
 	 * B_FALSE, overrides the caller's choice and forces inline handling.
 	 */
 	if (zvol_request_sync || zv->zv_threading == B_FALSE)
 		force_sync = 1;
 
 	/* Request descriptor passed to the read/write/discard handlers. */
 	zv_request_t zvr = {
 		.zv = zv,
 		.bio = bio,
 		.rq = rq,
 	};
 
 	/* Reject data-carrying I/O that extends past zv_volsize with EIO. */
 	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
 		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
 		    zv->zv_zso->zvo_disk->disk_name,
 		    (long long unsigned)offset,
 		    (long unsigned)size);
 
 		zvol_end_io(bio, rq, -SET_ERROR(EIO));
 		goto out;
 	}
 
 	zv_request_task_t *task;
 	zv_taskq_t *ztqs = &zvol_taskqs;
 	uint_t blk_mq_hw_queue = 0;
 	uint_t tq_idx;
 	uint_t taskq_hash;
 	/*
 	 * Choose a taskq by hashing the zvol pointer, the offset (shifted
 	 * right by ZVOL_TASKQ_OFFSET_SHIFT) and the blk-mq hardware queue
 	 * number.  The queue number stays 0 when no struct request was
 	 * supplied.  Note that the body of the "if (rq)" below is whichever
 	 * single assignment the preprocessor keeps.
 	 */
 	if (rq)
 #ifdef HAVE_BLK_MQ_RQ_HCTX
 		blk_mq_hw_queue = rq->mq_hctx->queue_num;
 #else
 		blk_mq_hw_queue =
 		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
 #endif
 	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 	    blk_mq_hw_queue);
 	tq_idx = taskq_hash % ztqs->tqs_cnt;
 
 	if (rw == WRITE) {
 		/* Writes to a ZVOL_RDONLY volume fail with EROFS. */
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 			zvol_end_io(bio, rq, -SET_ERROR(EROFS));
 			goto out;
 		}
 
 		/*
 		 * Prevents the zvol from being suspended, or the ZIL being
 		 * concurrently opened.  Will be released after the i/o
 		 * completes.
 		 */
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		/*
 		 * Open a ZIL if this is the first time we have written to this
 		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 		 * than zv_state_lock so that we don't need to acquire an
 		 * additional lock in this path.
 		 */
 		if (zv->zv_zilog == NULL) {
 			/*
 			 * Drop the reader hold and retake the lock as writer,
 			 * then re-check zv_zilog since another thread may
 			 * have opened the ZIL in between.
 			 */
 			rw_exit(&zv->zv_suspend_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 			if (zv->zv_zilog == NULL) {
 				zv->zv_zilog = zil_open(zv->zv_objset,
 				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 				zv->zv_flags |= ZVOL_WRITTEN_TO;
 				/* replay / destroy done in zvol_create_minor */
 				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
 				    ZIL_REPLAY_NEEDED));
 			}
 			/* Back to a reader hold for the rest of the i/o. */
 			rw_downgrade(&zv->zv_suspend_lock);
 		}
 
 		/*
 		 * We don't want this thread to be blocked waiting for i/o to
 		 * complete, so we instead wait from a taskq callback. The
 		 * i/o may be a ZIL write (via zil_commit()), or a read of an
 		 * indirect block, or a read of a data block (if this is a
 		 * partial-block write).  We will indicate that the i/o is
 		 * complete by calling END_IO() from the taskq callback.
 		 *
 		 * This design allows the calling thread to continue and
 		 * initiate more concurrent operations by calling
 		 * zvol_request() again. There are typically only a small
 		 * number of threads available to call zvol_request() (e.g.
 		 * one per iSCSI target), so keeping the latency of
 		 * zvol_request() low is important for performance.
 		 *
 		 * The zvol_request_sync module parameter allows this
 		 * behavior to be altered, for performance evaluation
 		 * purposes.  If the callback blocks, setting
 		 * zvol_request_sync=1 will result in much worse performance.
 		 *
 		 * We can have up to zvol_threads concurrent i/o's being
 		 * processed for all zvols on the system.  This is typically
 		 * a vast improvement over the zvol_request_sync=1 behavior
 		 * of one i/o at a time per zvol.  However, an even better
 		 * design would be for zvol_request() to initiate the zio
 		 * directly, and then be notified by the zio_done callback,
 		 * which would call END_IO().  Unfortunately, the DMU/ZIL
 		 * interfaces lack this functionality (they block waiting for
 		 * the i/o to complete).
 		 */
 		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
 			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
 				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_discard_task, task, 0, &task->ent);
 			}
 		} else {
 			if (force_sync) {
 				zvol_write(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
 				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_write_task, task, 0, &task->ent);
 			}
 		}
 	} else {
 		/*
 		 * The SCST driver, and possibly others, may issue READ I/Os
 		 * with a length of zero bytes.  These empty I/Os contain no
 		 * data and require no additional handling.
 		 */
 		if (size == 0) {
 			zvol_end_io(bio, rq, 0);
 			goto out;
 		}
 
 		/*
 		 * Taken as reader and not dropped in this function; see the
 		 * WRITE case above for when it is released.
 		 */
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		/* See comment in WRITE case above. */
 		if (force_sync) {
 			zvol_read(&zvr);
 		} else {
 			task = zv_request_task_create(zvr);
 			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 			    zvol_read_task, task, 0, &task->ent);
 		}
 	}
 
 	/* Every exit path comes through here to undo spl_fstrans_mark(). */
 out:
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * BIO entry point for the non-blk-mq path.  Depending on the kernel this
  * is compiled either as zvol_submit_bio() (returning void or blk_qc_t) or
  * as zvol_request().  Both variants look up the zvol from the queue's
  * queuedata and hand the bio to zvol_request_impl() with no struct request
  * and force_sync == 0.
  */
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
 static void
 zvol_submit_bio(struct bio *bio)
 #else
 static blk_qc_t
 zvol_submit_bio(struct bio *bio)
 #endif
 #else
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 #endif
 {
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 	/* submit_bio() gets no queue argument; derive it from the bio. */
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 #else
 	struct request_queue *q = bio->bi_disk->queue;
 #endif
 #endif
 	zvol_state_t *zv = q->queuedata;
 
 	zvol_request_impl(zv, bio, NULL, 0);
 	/* Only the variants with a non-void return type return a value. */
 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
 	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
 	return (BLK_QC_T_NONE);
 #endif
 }
 
 /*
  * Block device open handler.
  *
  * Looks up the zvol through the gendisk's private_data under
  * zvol_state_lock and, on the first open (zv_open_count == 0), calls
  * zvol_first_open() while holding zv_suspend_lock as reader and the
  * spa_namespace_lock.  Returns 0 on success or a negative errno:
  * -ENXIO when private_data is NULL or the zvol is flagged ZVOL_REMOVING,
  * -ERESTARTSYS when spa_namespace_lock could not be taken, -EROFS for a
  * write open of a ZVOL_RDONLY volume, or the negated result of
  * zvol_first_open().
  */
 static int
 #ifdef HAVE_BLK_MODE_T
 zvol_open(struct gendisk *disk, blk_mode_t flag)
 #else
 zvol_open(struct block_device *bdev, fmode_t flag)
 #endif
 {
 	zvol_state_t *zv;
 	int error = 0;
 	/* B_TRUE when this call took zv_suspend_lock and must release it. */
 	boolean_t drop_suspend = B_FALSE;
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 	/* Deadline for the retry loop on spa_namespace_lock below. */
 	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
 	hrtime_t start = gethrtime();
 
 retry:
 #endif
 	rw_enter(&zvol_state_lock, RW_READER);
 	/*
 	 * Obtain a copy of private_data under the zvol_state_lock to make
 	 * sure that either the result of zvol free code path setting
 	 * disk->private_data to NULL is observed, or zvol_os_free()
 	 * is not called on this zv because of the positive zv_open_count.
 	 */
 #ifdef HAVE_BLK_MODE_T
 	zv = disk->private_data;
 #else
 	zv = bdev->bd_disk->private_data;
 #endif
 	if (zv == NULL) {
 		rw_exit(&zvol_state_lock);
 		return (-SET_ERROR(ENXIO));
 	}
 
 	mutex_enter(&zv->zv_state_lock);
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		mutex_exit(&zv->zv_state_lock);
 		rw_exit(&zvol_state_lock);
 		return (-SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Make sure zvol is not suspended during first open
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
 	 * ordering - zv_suspend_lock before zv_state_lock
 	 */
 	if (zv->zv_open_count == 0) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			/*
 			 * Could not get it without blocking: drop
 			 * zv_state_lock, block on zv_suspend_lock, then
 			 * retake zv_state_lock in the proper order.
 			 */
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 0) {
 				rw_exit(&zv->zv_suspend_lock);
 			} else {
 				drop_suspend = B_TRUE;
 			}
 		} else {
 			drop_suspend = B_TRUE;
 		}
 	}
 	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	if (zv->zv_open_count == 0) {
 		/* B_TRUE when spa_namespace_lock was taken here. */
 		boolean_t drop_namespace = B_FALSE;
 
 		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 
 		/*
 		 * In all other call paths the spa_namespace_lock is taken
 		 * before the bdev->bd_mutex lock.  However, on open(2)
 		 * the __blkdev_get() function calls fops->open() with the
 		 * bdev->bd_mutex lock held.  This can result in a deadlock
 		 * when zvols from one pool are used as vdevs in another.
 		 *
 		 * To prevent a lock inversion deadlock we preemptively
 		 * take the spa_namespace_lock.  Normally the lock will not
 		 * be contended and this is safe because spa_open_common()
 		 * handles the case where the caller already holds the
 		 * spa_namespace_lock.
 		 *
 		 * When the lock cannot be acquired after multiple retries
 		 * this must be the vdev on zvol deadlock case and we have
 		 * no choice but to return an error.  For 5.12 and older
 		 * kernels returning -ERESTARTSYS will result in the
 		 * bdev->bd_mutex being dropped, then reacquired, and
 		 * fops->open() being called again.  This process can be
 		 * repeated safely until both locks are acquired.  For 5.13
 		 * and newer the -ERESTARTSYS retry logic was removed from
 		 * the kernel so the only option is to return the error for
 		 * the caller to handle it.
 		 */
 		if (!mutex_owned(&spa_namespace_lock)) {
 			if (!mutex_tryenter(&spa_namespace_lock)) {
 				/* Release everything held before backing off. */
 				mutex_exit(&zv->zv_state_lock);
 				rw_exit(&zv->zv_suspend_lock);
 				drop_suspend = B_FALSE;
 
 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
 				schedule();
 				return (-SET_ERROR(ERESTARTSYS));
 #else
 				if ((gethrtime() - start) > timeout)
 					return (-SET_ERROR(ERESTARTSYS));
 
 				schedule_timeout_interruptible(
 					MSEC_TO_TICK(10));
 				goto retry;
 #endif
 			} else {
 				drop_namespace = B_TRUE;
 			}
 		}
 
 		/* Second argument is "read-only": true unless opened for write. */
 		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
 
 		if (drop_namespace)
 			mutex_exit(&spa_namespace_lock);
 	}
 
 	if (error == 0) {
 		if ((blk_mode_is_open_write(flag)) &&
 		    (zv->zv_flags & ZVOL_RDONLY)) {
 			/* Undo the first open performed just above. */
 			if (zv->zv_open_count == 0)
 				zvol_last_close(zv);
 
 			error = -SET_ERROR(EROFS);
 		} else {
 			zv->zv_open_count++;
 		}
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 
 	/* The "if" body is whichever call the preprocessor keeps. */
 	if (error == 0)
 #ifdef HAVE_BLK_MODE_T
 		disk_check_media_change(disk);
 #else
 		zfs_check_media_change(bdev);
 #endif
 
 	return (error);
 }
 
 /*
  * Block device release handler: drops one reference from zv_open_count
  * and calls zvol_last_close() when the count reaches zero.  The last
  * close runs with zv_suspend_lock held as reader.
  *
  * NOTE(review): unlike zvol_open(), disk->private_data is dereferenced
  * here without a NULL check; presumably a positive open count guarantees
  * it is still set - confirm.
  */
 static void
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
 zvol_release(struct gendisk *disk)
 #else
 zvol_release(struct gendisk *disk, fmode_t unused)
 #endif
 {
 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
 	(void) unused;
 #endif
 	zvol_state_t *zv;
 	/* Cleared below on every path that ends without zv_suspend_lock. */
 	boolean_t drop_suspend = B_TRUE;
 
 	rw_enter(&zvol_state_lock, RW_READER);
 	zv = disk->private_data;
 
 	mutex_enter(&zv->zv_state_lock);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	/*
 	 * make sure zvol is not suspended during last close
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
 	 * ordering - zv_suspend_lock before zv_state_lock
 	 */
 	if (zv->zv_open_count == 1) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			/*
 			 * Drop zv_state_lock, block on zv_suspend_lock, then
 			 * retake zv_state_lock so the order is respected.
 			 */
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 1) {
 				rw_exit(&zv->zv_suspend_lock);
 				drop_suspend = B_FALSE;
 			}
 		}
 	} else {
 		/* Not the last close, so zv_suspend_lock was never taken. */
 		drop_suspend = B_FALSE;
 	}
 	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	zv->zv_open_count--;
 	if (zv->zv_open_count == 0) {
 		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 		zvol_last_close(zv);
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 }
 
 /*
  * Block device ioctl handler.
  *
  * BLKFLSBUF: flush and invalidate the block device's cached data, then,
  *	      unless the zvol is ZVOL_RDONLY, wait for the pool to sync
  *	      while holding zv_suspend_lock as reader.
  * BLKZNAME:  copy MAXNAMELEN bytes of zv_name to the user buffer at arg.
  *
  * Returns 0 on success, -EFAULT when the BLKZNAME copy to user space
  * fails, and -ENOTTY for any other command.  The mode argument is not
  * used.
  */
 static int
 zvol_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned int cmd, unsigned long arg)
 {
 	zvol_state_t *zv = bdev->bd_disk->private_data;
 	int error = 0;
 
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	switch (cmd) {
 	case BLKFLSBUF:
 #ifdef HAVE_FSYNC_BDEV
 		fsync_bdev(bdev);
 #elif defined(HAVE_SYNC_BLOCKDEV)
 		sync_blockdev(bdev);
 #else
 #error "Neither fsync_bdev() nor sync_blockdev() found"
 #endif
 		invalidate_bdev(bdev);
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		if (!(zv->zv_flags & ZVOL_RDONLY))
 			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
 		rw_exit(&zv->zv_suspend_lock);
 		break;
 
 	case BLKZNAME:
 		mutex_enter(&zv->zv_state_lock);
 		/*
 		 * copy_to_user() returns the number of bytes that could not
 		 * be copied, not an errno.  Translate any shortfall into
 		 * -EFAULT instead of handing a positive byte count back to
 		 * the caller as if it were an error code.
 		 */
 		if (copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN) != 0)
 			error = -EFAULT;
 		mutex_exit(&zv->zv_state_lock);
 		break;
 
 	default:
 		error = -ENOTTY;
 		break;
 	}
 
 	return (SET_ERROR(error));
 }
 
 /*
  * Compat ioctl entry point.  With CONFIG_COMPAT it forwards every
  * argument unchanged to zvol_ioctl(); without it the name expands to
  * NULL so the block_device_operations tables below can reference it
  * unconditionally.
  */
 #ifdef CONFIG_COMPAT
 static int
 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned cmd, unsigned long arg)
 {
 	return (zvol_ioctl(bdev, mode, cmd, arg));
 }
 #else
 #define	zvol_compat_ioctl	NULL
 #endif
 
 /*
  * Report pending disk events for a zvol's gendisk.
  *
  * Returns DISK_EVENT_MEDIA_CHANGE when the zvol's zv_changed flag was
  * set, and 0 otherwise (including when the disk no longer has a zvol
  * attached).  The flag is cleared as a side effect.  The clearing
  * argument is not used.
  */
 static unsigned int
 zvol_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	unsigned int events = 0;
 	zvol_state_t *zv;
 
 	rw_enter(&zvol_state_lock, RW_READER);
 
 	zv = disk->private_data;
 	if (zv == NULL)
 		goto done;
 
 	/* Read and reset the change flag under the per-zvol lock. */
 	mutex_enter(&zv->zv_state_lock);
 	if (zv->zv_changed)
 		events = DISK_EVENT_MEDIA_CHANGE;
 	zv->zv_changed = 0;
 	mutex_exit(&zv->zv_state_lock);
 
 done:
 	rw_exit(&zvol_state_lock);
 
 	return (events);
 }
 
 /*
  * Push the zvol's current size (zv_volsize, converted to sectors with
  * SECTOR_BITS) into its gendisk capacity.  Does nothing when the disk
  * has no zvol attached.  Always returns 0.
  */
 static int
 zvol_revalidate_disk(struct gendisk *disk)
 {
 	zvol_state_t *zv;
 
 	rw_enter(&zvol_state_lock, RW_READER);
 
 	zv = disk->private_data;
 	if (zv == NULL)
 		goto unlock;
 
 	/* zv_volsize is read under the per-zvol lock. */
 	mutex_enter(&zv->zv_state_lock);
 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS);
 	mutex_exit(&zv->zv_state_lock);
 
 unlock:
 	rw_exit(&zvol_state_lock);
 
 	return (0);
 }
 
 /*
  * Make the kernel pick up a new volume size for zv's gendisk, using
  * whichever revalidation interface the kernel provides.  Always returns 0.
  *
  * The volsize argument is not referenced here; zvol_revalidate_disk()
  * reads the size from zv->zv_volsize instead.
  */
 int
 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 {
 	struct gendisk *disk = zv->zv_zso->zvo_disk;
 
 #if defined(HAVE_REVALIDATE_DISK_SIZE)
 	/* zvol_revalidate_disk() updates the capacity and returns 0. */
 	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
 #elif defined(HAVE_REVALIDATE_DISK)
 	revalidate_disk(disk);
 #else
 	zvol_revalidate_disk(disk);
 #endif
 	return (0);
 }
 
 /*
  * Detach the zvol from its gendisk by resetting the disk's private_data
  * pointer.
  */
 void
 zvol_os_clear_private(zvol_state_t *zv)
 {
 	struct gendisk *disk = zv->zv_zso->zvo_disk;
 
 	/*
 	 * Done with zvol_state_lock held as a writer, so zvol_open()
 	 * cannot race with this and open the device.
 	 */
 	disk->private_data = NULL;
 }
 
 /*
  * Provide a simple virtual geometry for legacy compatibility.  For devices
  * smaller than 1 MiB a small head and sector count is used to allow very
  * tiny devices.  For devices over 1 MiB a standard head and sector count
  * is used to keep the cylinders count reasonable.
  */
 static int
 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	zvol_state_t *zv = bdev->bd_disk->private_data;
 	sector_t capacity;
 
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	capacity = get_capacity(zv->zv_zso->zvo_disk);
 
 	/*
 	 * More than 2048 sectors gets the standard 16 heads / 63 sectors;
 	 * anything smaller gets 2 heads / 4 sectors.
 	 */
 	geo->heads = (capacity > 2048) ? 16 : 2;
 	geo->sectors = (capacity > 2048) ? 63 : 4;
 	geo->start = 0;
 
 	/* Cylinders follow from the capacity and the geometry picked above. */
 	geo->cylinders = capacity / (geo->heads * geo->sectors);
 
 	return (0);
 }
 
 /*
  * Why have two separate block_device_operations structs?
  *
  * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
  * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
  * can't just change submit_bio dynamically at runtime.  So just create two
  * separate structs to get around this.
  */
 /* Installed by zvol_alloc() when use_blk_mq is set; has no .submit_bio. */
 static const struct block_device_operations zvol_ops_blk_mq = {
 	.open			= zvol_open,
 	.release		= zvol_release,
 	.ioctl			= zvol_ioctl,
 	.compat_ioctl		= zvol_compat_ioctl,
 	.check_events		= zvol_check_events,
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 	.revalidate_disk	= zvol_revalidate_disk,
 #endif
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 };
 
 /*
  * Installed by zvol_alloc() when use_blk_mq is not set.  Identical to
  * zvol_ops_blk_mq except that, where the kernel supports it, BIOs are
  * routed to zvol_submit_bio().
  */
 static const struct block_device_operations zvol_ops = {
 	.open			= zvol_open,
 	.release		= zvol_release,
 	.ioctl			= zvol_ioctl,
 	.compat_ioctl		= zvol_compat_ioctl,
 	.check_events		= zvol_check_events,
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 	.revalidate_disk	= zvol_revalidate_disk,
 #endif
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 	.submit_bio		= zvol_submit_bio,
 #endif
 };
 
 /*
  * Since 6.9, Linux has been removing queue limit setters in favour of an
  * initial queue_limits struct applied when the device is open. Since 6.11,
  * queue_limits is being extended to allow more things to be applied when the
  * device is open. Setters are also being removed for this.
  *
  * For OpenZFS, this means that depending on kernel version, some options may
  * be set up before the device is open, and some applied to an open device
  * (queue) after the fact.
  *
  * We manage this complexity by having our own limits struct,
  * zvol_queue_limits_t, in which we carry any queue config that we're
  * interested in setting. This structure is the same on all kernels.
  *
  * These limits are then applied to the queue at device open time by the most
  * appropriate method for the kernel.
  *
  * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
  * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
  * struct queue_limits, and passes it in. Any fields added in later kernels are
  * (obviously) not set up here.
  *
  * zvol_queue_limits_apply() is called on all kernel versions after the queue
  * is created, and applies any remaining config. Before 6.9 that will be
  * everything, via setter methods. After 6.9 that will be whatever couldn't be
  * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
  * will always be a no-op on the latest kernel we support).
  */
 typedef struct zvol_queue_limits {
 	/* Applied as max_hw_sectors. */
 	unsigned int	zql_max_hw_sectors;
 	/* Applied as max_segments; note this is only 16 bits wide. */
 	unsigned short	zql_max_segments;
 	/* Applied as max_segment_size. */
 	unsigned int	zql_max_segment_size;
 	/* Applied as io_opt. */
 	unsigned int	zql_io_opt;
 	/* Applied as physical_block_size. */
 	unsigned int	zql_physical_block_size;
 	/* Applied as max_discard_sectors (and max_hw_discard_sectors). */
 	unsigned int	zql_max_discard_sectors;
 	/* Applied as discard_granularity. */
 	unsigned int	zql_discard_granularity;
 } zvol_queue_limits_t;
 
 /*
  * Fill in *limits with the queue configuration for zv.
  *
  * limits		Output; every field is assigned.
  * zv		zvol whose zv_volblocksize drives the block-size
  *			dependent limits.
  * use_blk_mq	When set, segment limits are derived from
  *			zvol_blk_mq_blocks_per_thread; otherwise they are
  *			maxed out.
  */
 static void
 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
     boolean_t use_blk_mq)
 {
 	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
 
 	if (use_blk_mq) {
 		/*
 		 * IO requests can be really big (1MB).  When an IO request
 		 * comes in, it is passed off to zvol_read() or zvol_write()
 		 * in a new thread, where it is chunked up into 'volblocksize'
 		 * sized pieces and processed.  So for example, if the request
 		 * is a 1MB write and your volblocksize is 128k, one zvol_write
 		 * thread will take that request and sequentially do eight
 		 * 128k IOs.  This is due to the fact that the thread needs to
 		 * lock each volblocksize sized block.  So you might be
 		 * wondering: "instead of passing the whole 1MB request to one
 		 * thread, why not pass eight individual 128k chunks to eight
 		 * threads and process the whole write in parallel?"  The short
 		 * answer is that there's a sweet spot number of chunks that
 		 * balances the greater parallelism with the added overhead of
 		 * more threads. The sweet spot can be different depending on
 		 * if you have a read or write heavy workload.  Writes
 		 * typically want high chunk counts while reads typically want
 		 * lower ones.  On a test pool with 6 NVMe drives in a 3x
 		 * 2-disk mirror configuration, with volblocksize=8k, the sweet
 		 * spot for good sequential reads and writes was at 8 chunks.
 		 */
 
 		/*
 		 * Below we tell the kernel how big we want our requests
 		 * to be.  You would think that blk_queue_io_opt() would be
 		 * used to do this since it is used to "set optimal request
 		 * size for the queue", but that doesn't seem to do
 		 * anything - the kernel still gives you huge requests
 		 * with tons of little PAGE_SIZE segments contained within it.
 		 *
 		 * Knowing that the kernel will just give you PAGE_SIZE segments
 		 * no matter what, you can say "ok, I want PAGE_SIZE byte
 		 * segments, and I want 'N' of them per request", where N is
 		 * the correct number of segments for the volblocksize and
 		 * number of chunks you want.
 		 */
 		if (zvol_blk_mq_blocks_per_thread != 0) {
 			unsigned int chunks;
 			uint64_t segments;
 
 			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
 
 			/*
 			 * zql_max_segments is only an unsigned short, so the
 			 * segment count must be clamped before it is stored:
 			 * a large volblocksize times a large chunk count
 			 * would otherwise be silently truncated, and a
 			 * product smaller than PAGE_SIZE would yield zero
 			 * segments.
 			 */
 			segments = (zv->zv_volblocksize * chunks) / PAGE_SIZE;
 			if (segments == 0)
 				segments = 1;
 
 			limits->zql_max_segment_size = PAGE_SIZE;
 			limits->zql_max_segments = MIN(segments, UINT16_MAX);
 		} else {
 			/*
 			 * Special case: zvol_blk_mq_blocks_per_thread = 0
 			 * Max everything out.
 			 */
 			limits->zql_max_segments = UINT16_MAX;
 			limits->zql_max_segment_size = UINT_MAX;
 		}
 	} else {
 		limits->zql_max_segments = UINT16_MAX;
 		limits->zql_max_segment_size = UINT_MAX;
 	}
 
 	limits->zql_io_opt = DMU_MAX_ACCESS / 2;
 
 	limits->zql_physical_block_size = zv->zv_volblocksize;
 	limits->zql_max_discard_sectors =
 	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
 	limits->zql_discard_granularity = zv->zv_volblocksize;
 }
 
 #ifdef HAVE_BLK_ALLOC_DISK_2ARG
 /*
  * Translate our zvol_queue_limits_t into a Linux struct queue_limits.
  * *qlimits is zeroed first, so any field not assigned here is left at 0.
  */
 static void
 zvol_queue_limits_convert(zvol_queue_limits_t *limits,
     struct queue_limits *qlimits)
 {
 	memset(qlimits, 0, sizeof (struct queue_limits));
 	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
 	qlimits->max_segments = limits->zql_max_segments;
 	qlimits->max_segment_size = limits->zql_max_segment_size;
 	qlimits->io_opt = limits->zql_io_opt;
 	qlimits->physical_block_size = limits->zql_physical_block_size;
 	/* The same discard limit feeds both the soft and the hw field. */
 	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
 	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
 	qlimits->discard_granularity = limits->zql_discard_granularity;
 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
 	/* Where supported, feature flags are set here rather than by setters. */
 	qlimits->features =
 	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
 #endif
 }
 #endif
 
 /*
  * Apply to an existing queue whatever part of *limits could not be
  * supplied when the queue was allocated.  Both sections compile away
  * when the kernel accepted everything through struct queue_limits.
  */
 static void
 zvol_queue_limits_apply(zvol_queue_limits_t *limits,
     struct request_queue *queue)
 {
 #ifndef HAVE_BLK_ALLOC_DISK_2ARG
 	/* No limits could be passed at allocation; use the setters. */
 	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
 	blk_queue_max_segments(queue, limits->zql_max_segments);
 	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
 	blk_queue_io_opt(queue, limits->zql_io_opt);
 	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
 	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
 	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
 #endif
 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
 	/* Feature flags were not part of queue_limits; set them directly. */
 	blk_queue_set_write_cache(queue, B_TRUE);
 	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
 #endif
 }
 
 /*
  * Allocate the gendisk and request queue for a zvol that does not use
  * blk-mq, using whichever allocation interface the kernel offers, then
  * apply the remaining queue limits.
  *
  * On success returns 0 with zso->zvo_disk and zso->zvo_queue set.
  * Returns 1 on allocation failure.
  */
 static int
 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 {
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
 #if defined(HAVE_BLK_ALLOC_DISK)
 	/* One-argument blk_alloc_disk(): disk and queue come together. */
 	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
 	if (zso->zvo_disk == NULL)
 		return (1);
 
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
 	/* Two-argument form: limits go in up front, failure is an ERR_PTR. */
 	struct queue_limits qlimits;
 	zvol_queue_limits_convert(limits, &qlimits);
 	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		return (1);
 	}
 
 	zso->zvo_disk = disk;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 
 #else
 	/* No blk_alloc_disk(): allocate the queue, then the disk. */
 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)
 		return (1);
 
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
 		blk_cleanup_queue(zso->zvo_queue);
 		return (1);
 	}
 
 	zso->zvo_disk->queue = zso->zvo_queue;
 #endif /* HAVE_BLK_ALLOC_DISK */
 #else
 	/* No submit_bio in the ops table: register zvol_request directly. */
 	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)
 		return (1);
 
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
 		blk_cleanup_queue(zso->zvo_queue);
 		return (1);
 	}
 
 	zso->zvo_disk->queue = zso->zvo_queue;
 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 
 	zvol_queue_limits_apply(limits, zso->zvo_queue);
 
 	return (0);
 
 }
 
 /*
  * Allocate the blk-mq tag set, gendisk and request queue for a zvol that
  * uses blk-mq, then apply the remaining queue limits.
  *
  * Returns 0 on success with zso->zvo_disk and zso->zvo_queue set.
  * Returns 1 on failure, in which case everything allocated here has been
  * released again and zso->zvo_disk is NULL.
  */
 static int
 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
 {
 	struct zvol_state_os *zso = zv->zv_zso;
 
 	/* Allocate our blk-mq tag_set */
 	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
 		return (1);
 
 #if defined(HAVE_BLK_ALLOC_DISK)
 	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
 	/*
 	 * Treat both NULL and an ERR_PTR as failure, and never leave an
 	 * error pointer behind in zvo_disk.
 	 * NOTE(review): the kernel's two-argument blk_mq_alloc_disk() is
 	 * believed to report failure with an ERR_PTR - confirm.
 	 */
 	if (zso->zvo_disk == NULL || IS_ERR(zso->zvo_disk)) {
 		zso->zvo_disk = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 	zso->zvo_queue = zso->zvo_disk->queue;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
 	struct queue_limits qlimits;
 	zvol_queue_limits_convert(limits, &qlimits);
 	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 
 	zso->zvo_disk = disk;
 	zso->zvo_queue = zso->zvo_disk->queue;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 #else
 	/*
 	 * Allocate the queue before the disk (the same order used by
 	 * zvol_alloc_non_blk_mq()), so that a failed disk allocation has a
 	 * valid queue to clean up and a failed queue allocation has no
 	 * disk to leak.
 	 */
 	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
 	if (IS_ERR(zso->zvo_queue)) {
 		zso->zvo_queue = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
 		blk_cleanup_queue(zso->zvo_queue);
 		zso->zvo_queue = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 
 	/* Our queue is now created, assign it to our disk */
 	zso->zvo_disk->queue = zso->zvo_queue;
 #endif
 
 	zvol_queue_limits_apply(limits, zso->zvo_queue);
 
 	return (0);
 }
 
 /*
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  *
  * dev		Device number; its minor part becomes first_minor and
  *			part of the disk name.
  * name		Dataset name; used to look up "volmode" and copied
  *			into zv_name.
  * volblocksize	Stored as zv_volblocksize and used to size the queue
  *			limits.
  *
  * Returns the new zvol_state_t, or NULL when the "volmode" property
  * cannot be read, when the effective volmode is ZFS_VOLMODE_NONE, or when
  * the disk/queue allocation fails.
  */
 static zvol_state_t *
 zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
 {
 	zvol_state_t *zv;
 	struct zvol_state_os *zso;
 	uint64_t volmode;
 	int ret;
 
 	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
 		return (NULL);
 
 	/* A "default" volmode defers to the zvol_volmode tunable. */
 	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
 
 	if (volmode == ZFS_VOLMODE_NONE)
 		return (NULL);
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_zso = zso;
 	zv->zv_volmode = volmode;
 	zv->zv_volblocksize = volblocksize;
 
 	list_link_init(&zv->zv_next);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);
 
 	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
 
 	zvol_queue_limits_t limits;
 	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
 
 	/*
 	 * The block layer has 3 interfaces for getting BIOs:
 	 *
 	 * 1. blk-mq request queues (new)
 	 * 2. submit_bio() (oldest)
 	 * 3. regular request queues (old).
 	 *
 	 * Each of those interfaces has two permutations:
 	 *
 	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
 	 *    both the disk and its queue (5.14 kernel or newer)
 	 *
 	 * b) We don't have blk_*alloc_disk(), and have to allocate the
 	 *    disk and the queue separately. (5.13 kernel or older)
 	 */
 	if (zv->zv_zso->use_blk_mq)
 		ret = zvol_alloc_blk_mq(zv, &limits);
 	else
 		ret = zvol_alloc_non_blk_mq(zso, &limits);
 
 	/*
 	 * Check for failure before touching zso->zvo_disk: when the
 	 * allocation fails there is no disk to install the ops table on.
 	 */
 	if (ret != 0)
 		goto out_kmem;
 
 	if (zv->zv_zso->use_blk_mq)
 		zso->zvo_disk->fops = &zvol_ops_blk_mq;
 	else
 		zso->zvo_disk->fops = &zvol_ops;
 
 	/* Limit read-ahead to a single page to prevent over-prefetching. */
 	blk_queue_set_read_ahead(zso->zvo_queue, 1);
 
 	if (!zv->zv_zso->use_blk_mq) {
 		/* Disable write merging in favor of the ZIO pipeline. */
 		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
 	}
 
 	zso->zvo_queue->queuedata = zv;
 	zso->zvo_dev = dev;
 	zv->zv_open_count = 0;
 	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
 
 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 
 	zso->zvo_disk->major = zvol_major;
 	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
 
 	/*
 	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
 	 * This is accomplished by limiting the number of minors for the
 	 * device to one and explicitly disabling partition scanning.
 	 */
 	if (volmode == ZFS_VOLMODE_DEV) {
 		zso->zvo_disk->minors = 1;
 		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
 		zso->zvo_disk->flags |= GENHD_FL_NO_PART;
 	}
 
 	zso->zvo_disk->first_minor = (dev & MINORMASK);
 	zso->zvo_disk->private_data = zv;
 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));
 
 	return (zv);
 
 out_kmem:
 	/* Tear down what was initialized above before freeing the memory. */
 	cv_destroy(&zv->zv_removing_cv);
 	mutex_destroy(&zv->zv_state_lock);
 	kmem_free(zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
 	return (NULL);
 }
 
 /*
  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
  * At this time, the structure is not opened by anyone, is taken off
  * the zvol_state_list, and has its private data set to NULL.
  * The zvol_state_lock is dropped.
  *
  * This function may take many milliseconds to complete (e.g. we've seen
  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
  * "del_gendisk". Thus, consumers need to be careful to account for this
  * latency when calling this function.
  */
 void
 zvol_os_free(zvol_state_t *zv)
 {
 
 	/* Preconditions: unlocked, unopened, and detached from its disk. */
 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT0(zv->zv_open_count);
 	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
 
 	rw_destroy(&zv->zv_suspend_lock);
 	zfs_rangelock_fini(&zv->zv_rangelock);
 
 	/*
 	 * Remove the disk, then release the disk and queue with the calls
 	 * that match how they were allocated on this kernel.
 	 */
 	del_gendisk(zv->zv_zso->zvo_disk);
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
 #if defined(HAVE_BLK_CLEANUP_DISK)
 	blk_cleanup_disk(zv->zv_zso->zvo_disk);
 #else
 	put_disk(zv->zv_zso->zvo_disk);
 #endif
 #else
 	blk_cleanup_queue(zv->zv_zso->zvo_queue);
 	put_disk(zv->zv_zso->zvo_disk);
 #endif
 
 	if (zv->zv_zso->use_blk_mq)
 		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
 
 	/* Return the index taken from zvol_ida for this minor. */
 	ida_simple_remove(&zvol_ida,
 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
 	cv_destroy(&zv->zv_removing_cv);
 	mutex_destroy(&zv->zv_state_lock);
 	dataset_kstats_destroy(&zv->zv_kstat);
 
 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
 }
 
 /* Intentionally empty in this file: nothing is waited for here. */
 void
 zvol_wait_close(zvol_state_t *zv)
 {
 }
 
 /* Work item used by zvol_os_add_disk() to run add_disk() on a workqueue. */
 struct add_disk_work {
 	struct delayed_work work;	/* the queued work itself */
 	struct gendisk *disk;		/* in: disk to add */
 	int error;			/* out: __zvol_os_add_disk() result */
 };
 
 /*
  * Call add_disk() on the given disk.  Returns add_disk()'s result when
  * the kernel's add_disk() returns one (HAVE_ADD_DISK_RET), and 0
  * otherwise.
  */
 static int
 __zvol_os_add_disk(struct gendisk *disk)
 {
 	int error = 0;
 #ifdef HAVE_ADD_DISK_RET
 	error = add_disk(disk);
 #else
 	add_disk(disk);
 #endif
 	return (error);
 }
 
 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
 /*
  * Workqueue callback: recover the enclosing add_disk_work from the work
  * pointer, add its disk, and store the result in its error field.
  */
 static void
 zvol_os_add_disk_work(struct work_struct *work)
 {
 	struct add_disk_work *add_disk_work;
 	add_disk_work = container_of(work, struct add_disk_work, work.work);
 	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
 }
 #endif
 
 /*
  * SPECIAL CASE:
  *
  * This function basically calls add_disk() from a workqueue.   You may be
  * thinking: why not just call add_disk() directly?
  *
  * When you call add_disk(), the zvol appears to the world.  When this happens,
  * the kernel calls disk_scan_partitions() on the zvol, which behaves
  * differently on the 6.9+ kernels:
  *
  * - 6.8 and older kernels -
  * disk_scan_partitions()
  *	handle = bdev_open_by_dev(
  *		zvol_open()
  *	bdev_release(handle);
  *		zvol_release()
  *
  *
  * - 6.9+ kernels -
  * disk_scan_partitions()
  * 	file = bdev_file_open_by_dev()
  *		zvol_open()
  *	fput(file)
  *	< wait for return to userspace >
  *		zvol_release()
  *
  * The difference is that the bdev_release() from the 6.8 kernel is synchronous
  * while the fput() from the 6.9 kernel is async.  Or more specifically it's
  * async that has to wait until we return to userspace (since it adds the fput
  * into the caller's work queue with the TWA_RESUME flag set).  This is not the
  * behavior we want, since we want to do things like create+destroy a zvol
  * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
  * the reference to the zvol while we're in the IOCTL, which can't wait until
  * we return to userspace.
  *
  * We can get around this since fput() has a special codepath for when it's
  * running in a kernel thread or interrupt.  In those cases, it just puts the
  * fput into the system workqueue, which we can force to run with
  * __flush_workqueue().  That is why we call add_disk() from a workqueue - so it
  * runs from a kernel thread and "tricks" the fput() codepaths.
  *
  * Note that __flush_workqueue() is slowly getting deprecated.  This may be ok
  * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
  * fput) to happen, which it eventually, naturally, will from the system_wq
  * without us explicitly calling __flush_workqueue().
  */
 static int
 zvol_os_add_disk(struct gendisk *disk)
 {
 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
 	/* Lives on this stack frame; safe because we flush before returning. */
 	struct add_disk_work add_disk_work;
 
 	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
 	add_disk_work.disk = disk;
 	add_disk_work.error = 0;
 
 	/* Use *_delayed_work functions since they're not GPL'd */
 	schedule_delayed_work(&add_disk_work.work, 0);
 	/* Wait for zvol_os_add_disk_work() to finish. */
 	flush_delayed_work(&add_disk_work.work);
 
 	/* Then flush system_wq as well. */
 	__flush_workqueue(system_wq);
 	return (add_disk_work.error);
 #else	/* <= 6.8 kernel */
 	return (__zvol_os_add_disk(disk));
 #endif
 }
 
 /*
  * Create a block device minor node and setup the linkage between it
  * and the specified volume.  Once this function returns the block
  * device is live and ready for use.
  *
  * Returns 0 on success, and also when zvol_inhibit_dev is set (in which
  * case nothing is created).  Otherwise returns an error: EINVAL if the
  * minor number overflows, EEXIST if a zvol with this name is already
  * registered, EAGAIN if zvol_alloc() fails, or whatever the objset, ZAP,
  * kstat or add-disk step reported.
  */
 int
 zvol_os_create_minor(const char *name)
 {
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
 	uint64_t len;
 	unsigned minor = 0;
 	int error = 0;
 	int idx;
 	uint64_t hash = zvol_name_hash(name);
 	uint64_t volthreading;
 	bool replayed_zil = B_FALSE;
 
 	if (zvol_inhibit_dev)
 		return (0);
 
 	/*
 	 * Reserve an index for this zvol.  It is handed back on every
 	 * failure path that precedes zvol_os_add_disk().
 	 */
 	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	/* The low ZVOL_MINOR_BITS bits of the minor are left free. */
 	minor = idx << ZVOL_MINOR_BITS;
 	if (MINOR(minor) != minor) {
 		/* too many partitions can cause an overflow */
 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
 		    name, minor, MINOR(minor));
 		ida_simple_remove(&zvol_ida, idx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A hit is returned with zv_state_lock held (asserted below), so
 	 * drop the lock before reporting that the name is already taken.
 	 */
 	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
 		ida_simple_remove(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
 	if (error)
 		goto out_doi;
 
 	error = dmu_object_info(os, ZVOL_OBJ, doi);
 	if (error)
 		goto out_dmu_objset_disown;
 
 	/* "size" is fetched as one 8-byte integer. */
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error)
 		goto out_dmu_objset_disown;
 
 	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
 	    doi->doi_data_block_size);
 	if (zv == NULL) {
 		error = SET_ERROR(EAGAIN);
 		goto out_dmu_objset_disown;
 	}
 	zv->zv_hash = hash;
 
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;
 
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
 	/* Default */
 	zv->zv_threading = B_TRUE;
 	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
 	    == 0)
 		zv->zv_threading = volthreading;
 
 	/* The shift by 9 expresses the size in 512-byte units. */
 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
 
 	/* Queue flags are only touched when this kernel defines them. */
 #ifdef QUEUE_FLAG_DISCARD
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
 #endif
 #ifdef QUEUE_FLAG_NONROT
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
 #endif
 #ifdef QUEUE_FLAG_ADD_RANDOM
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
 #endif
 	/* This flag was introduced in kernel version 4.12. */
 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
 	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
 #endif
 
 	/*
 	 * NOTE(review): a failure here jumps to the common cleanup with zv
 	 * already allocated, and nothing on that path releases zv -- confirm
 	 * whether it is freed elsewhere.
 	 */
 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 	if (error)
 		goto out_dmu_objset_disown;
 	/*
 	 * The ZIL is opened here only to replay (or, with zil_replay_disable,
 	 * destroy) outstanding records on a writeable pool; zv_zilog is
 	 * cleared again before the device is announced.
 	 * NOTE(review): zil_close() runs only when replayed_zil is true --
 	 * confirm that zil_open() needs no matching close otherwise.
 	 */
 	ASSERT3P(zv->zv_zilog, ==, NULL);
 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)
 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
 		else
 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
 	}
 	if (replayed_zil)
 		zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
 
 	/*
 	 * When udev detects the addition of the device it will immediately
 	 * invoke blkid(8) to determine the type of content on the device.
 	 * Prefetching the blocks commonly scanned by blkid(8) will speed
 	 * up this process.
 	 *
 	 * NOTE(review): volsize - len is unsigned and wraps if the volume is
 	 * smaller than len -- confirm dmu_prefetch() tolerates that offset.
 	 */
 	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
 	if (len > 0) {
 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/* The objset is disowned just below; don't keep a pointer to it. */
 	zv->zv_objset = NULL;
 out_dmu_objset_disown:
 	dmu_objset_disown(os, B_TRUE, FTAG);
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
 
 	/*
 	 * Keep in mind that once add_disk() is called, the zvol is
 	 * announced to the world, and zvol_open()/zvol_release() can
 	 * be called at any time. Incidentally, add_disk() itself calls
 	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
 	 * directly as well.
 	 */
 	if (error == 0) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		rw_exit(&zvol_state_lock);
 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
 		/* Setup failed before the zvol was published. */
 		ida_simple_remove(&zvol_ida, idx);
 	}
 
 	return (error);
 }
 
 /*
  * Give an existing zvol a new name: update zv_name, rehash the zvol into
  * the bucket for the new name, nudge udev, and rename the dataset kstats.
  * The caller must hold zvol_state_lock and zv->zv_state_lock.
  */
 void
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
 	struct gendisk *disk = zv->zv_zso->zvo_disk;
 	int was_ro = get_disk_ro(disk);
 
 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 
 	/* Unhook from the old hash bucket and link into the new one. */
 	zv->zv_hash = zvol_name_hash(newname);
 	hlist_del(&zv->zv_hlink);
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 
 	/*
 	 * Flip the disk's read-only state and immediately flip it back.
 	 * The toggle makes the kernel issue a KOBJ_CHANGE uevent, which is
 	 * how udev learns about the rename and repairs its symlinks.
 	 * ZVOL_RDONLY in zv->zv_flags is left alone, so the effective
 	 * read-only state never changes.  kobject_uevent() would be the
 	 * direct way to do this, but it is a GPL-only symbol.
 	 */
 	set_disk_ro(disk, !was_ro);
 	set_disk_ro(disk, was_ro);
 
 	dataset_kstats_rename(&zv->zv_kstat, newname);
 }
 
 /*
  * Forward `flags` to set_disk_ro() for the zvol's gendisk.
  */
 void
 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
 {
 
 	set_disk_ro(zv->zv_zso->zvo_disk, flags);
 }
 
 /*
  * Forward `capacity` unchanged to set_capacity() for the zvol's gendisk.
  * No unit conversion happens here; the create path passes zv_volsize >> 9.
  */
 void
 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
 {
 
 	set_capacity(zv->zv_zso->zvo_disk, capacity);
 }
 
 /*
  * Module-load setup for zvols: size and create the I/O taskqs, register the
  * block major, derive the blk-mq tunables, then run the common init.
  *
  * Returns 0 on success, the register_blkdev() error if registration fails,
  * or -ENOMEM if a taskq cannot be created.  On failure everything set up
  * here is torn down again and zvol_taskqs is left empty
  * (tqs_taskq == NULL, tqs_cnt == 0), the state zvol_fini() asserts.
  */
 int
 zvol_init(void)
 {
 	int error;
 
 	/*
 	 * zvol_threads is the module param the user passes in.
 	 *
 	 * zvol_actual_threads is what we use internally, since the user can
 	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
 	 */
 	static unsigned int zvol_actual_threads;
 
 	if (zvol_threads == 0) {
 		/*
 		 * See dde9380a1 for why 32 was chosen here.  This should
 		 * probably be refined to be some multiple of the number
 		 * of CPUs.
 		 */
 		zvol_actual_threads = MAX(num_online_cpus(), 32);
 	} else {
 		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
 	}
 
 	/*
 	 * Use at least 32 zvol_threads but for many core system,
 	 * prefer 6 threads per taskq, but no more taskqs
 	 * than threads in them on large systems.
 	 *
 	 *                 taskq   total
 	 * cpus    taskqs  threads threads
 	 * ------- ------- ------- -------
 	 * 1       1       32       32
 	 * 2       1       32       32
 	 * 4       1       32       32
 	 * 8       2       16       32
 	 * 16      3       11       33
 	 * 32      5       7        35
 	 * 64      8       8        64
 	 * 128     11      12       132
 	 * 256     16      16       256
 	 */
 	zv_taskq_t *ztqs = &zvol_taskqs;
 	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
 	if (num_tqs == 0) {
 		/* zvol_num_taskqs == 0 means "pick a count automatically". */
 		num_tqs = 1 + num_online_cpus() / 6;
 		while (num_tqs * num_tqs > zvol_actual_threads)
 			num_tqs--;
 	}
 	/* Threads per taskq, rounded up so the total is not short. */
 	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
 	if (per_tq_thread * num_tqs < zvol_actual_threads)
 		per_tq_thread++;
 	ztqs->tqs_cnt = num_tqs;
 	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
 		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
 		ztqs->tqs_taskq = NULL;
 		/* Keep the count in step with the (now absent) array. */
 		ztqs->tqs_cnt = 0;
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
 		return (error);
 	}
 
 	if (zvol_blk_mq_queue_depth == 0) {
 		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 	} else {
 		zvol_actual_blk_mq_queue_depth =
 		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
 	}
 
 	if (zvol_blk_mq_threads == 0) {
 		zvol_blk_mq_actual_threads = num_online_cpus();
 	} else {
 		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
 		    1024);
 	}
 
 	for (uint_t i = 0; i < num_tqs; i++) {
 		char name[32];
 		(void) snprintf(name, sizeof (name), "%s_tq-%u",
 		    ZVOL_DRIVER, i);
 		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
 		    maxclsyspri, per_tq_thread, INT_MAX,
 		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 		if (ztqs->tqs_taskq[i] == NULL) {
 			/* Unwind the taskqs created so far, newest first. */
 			for (int j = i - 1; j >= 0; j--)
 				taskq_destroy(ztqs->tqs_taskq[j]);
 			unregister_blkdev(zvol_major, ZVOL_DRIVER);
 			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
 			    sizeof (taskq_t *));
 			ztqs->tqs_taskq = NULL;
 			/* Keep the count in step with the (now absent) array. */
 			ztqs->tqs_cnt = 0;
 			return (-ENOMEM);
 		}
 	}
 
 	zvol_init_impl();
 	ida_init(&zvol_ida);
 	return (0);
 }
 
 /*
  * Module-unload teardown for zvols: run the common fini, unregister the
  * block major, destroy every taskq and release the taskq array, then
  * destroy the minor-number allocator.  zvol_taskqs is left empty
  * (tqs_taskq == NULL, tqs_cnt == 0).
  */
 void
 zvol_fini(void)
 {
 	zv_taskq_t *ztqs = &zvol_taskqs;
 	zvol_fini_impl();
 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
 
 	if (ztqs->tqs_taskq == NULL) {
 		/* No array implies no taskqs. */
 		ASSERT3U(ztqs->tqs_cnt, ==, 0);
 	} else {
 		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
 			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
 			taskq_destroy(ztqs->tqs_taskq[i]);
 		}
 		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
 		    sizeof (taskq_t *));
 		ztqs->tqs_taskq = NULL;
 		/* Restore the invariant asserted in the branch above. */
 		ztqs->tqs_cnt = 0;
 	}
 
 	ida_destroy(&zvol_ida);
 }
 
-/* BEGIN CSTYLED */
 module_param(zvol_inhibit_dev, uint, 0644);
 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
 
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
 module_param(zvol_threads, uint, 0444);
 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
-    "to 0 to use all active CPUs");
+	" to 0 to use all active CPUs");
 
 module_param(zvol_request_sync, uint, 0644);
 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
 
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
 
 module_param(zvol_num_taskqs, uint, 0444);
 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
 
 module_param(zvol_prefetch_bytes, uint, 0644);
 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
 
 module_param(zvol_volmode, uint, 0644);
 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
 
 module_param(zvol_blk_mq_queue_depth, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
 
 module_param(zvol_use_blk_mq, uint, 0644);
 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
 
 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
-    "Process volblocksize blocks per thread");
+	"Process volblocksize blocks per thread");
 
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 module_param(zvol_open_timeout_ms, uint, 0644);
 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
 #endif
-
-/* END CSTYLED */
diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c
index 43bccea14a85..fde8ae28ef36 100644
--- a/module/zcommon/zfs_valstr.c
+++ b/module/zcommon/zfs_valstr.c
@@ -1,280 +1,274 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2024, Klara Inc.
  */
 
 #include <sys/fs/zfs.h>
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/string.h>
 #include <sys/debug.h>
 #include "zfs_valstr.h"
 
 /*
  * Each bit in a bitfield has three possible string representations:
  * - single char
  * - two-char pair
  * - full name
  *
  * Tables of these are indexed by bit number: entry N describes bit N.
  * The fields are filled by positional initializers, so their order here
  * must not change.
  */
 typedef struct {
 	/* Character shown in the one-column-per-bit rendering. */
 	const char	vb_bit;
 	/* Exactly two characters; there is no room for a NUL terminator. */
 	const char	vb_pair[2];
 	/* Full name, as a NUL-terminated string. */
 	const char	*vb_name;
 } valstr_bit_t;
 
 /*
  * Renders `bits` as a fixed-width string with one column per table entry:
  * column N holds table[N].vb_bit when bit N is set and a blank when it is
  * clear, so values line up when printed one under another.  At most
  * `outlen` bytes are written, and a NUL terminator is added only when a
  * byte is left over for it.  Returns the number of bytes stored in `out`,
  * terminator included.
  */
 static size_t
 valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems,
     uint64_t bits, char *out, size_t outlen)
 {
 	ASSERT(out);
 	size_t used = 0;
 	for (int bit = 0; bit < nelems && used != outlen; bit++) {
 		char ch = ' ';
 		if (bits & (1ULL << bit))
 			ch = table[bit].vb_bit;
 		out[used++] = ch;
 	}
 	if (used < outlen)
 		out[used++] = '\0';
 	return (used);
 }
 
 /*
  * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and
  * separated by a `|` character. This gives a concise representation of the
  * whole value.
  *
  * Output stops at the first pair that does not fit in the space that is
  * left; a pair and its separator are never written partially.  A NUL
  * terminator is appended only if there is still room for it.  Returns the
  * number of bytes written to `out`, terminator included.
  */
 static size_t
 valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems,
     uint64_t bits, char *out, size_t outlen)
 {
 	ASSERT(out);
 	size_t n = 0;
 	for (int b = 0; b < nelems; b++) {
 		ASSERT3U(n, <=, outlen);
 		if (n == outlen)
 			break;
 		uint64_t mask = (1ULL << b);
 		if (bits & mask) {
 			/* Two chars, plus a separator after the first pair. */
 			size_t len = (n > 0) ? 3 : 2;
 			/*
 			 * Compare against the space that is left.  The
 			 * previous form, n > outlen - len, wraps around when
 			 * outlen < len (e.g. outlen == 1) and then lets the
 			 * stores below run past the end of the buffer.
 			 * n < outlen here, so outlen - n cannot wrap.
 			 */
 			if (len > outlen - n)
 				break;
 			if (n > 0)
 				out[n++] = '|';
 			out[n++] = table[b].vb_pair[0];
 			out[n++] = table[b].vb_pair[1];
 		}
 	}
 	if (n < outlen)
 		out[n++] = '\0';
 	return (n);
 }
 
 /*
  * Emits the full name for each bit set in `bits`, taken from vb_name, and
  * separated by a space. This unambiguously shows the entire set of bits, but
  * can get very long.
  *
  * Output stops at the first name that does not fit in the space that is
  * left; a name and its separator are never written partially.  A NUL
  * terminator is appended only if there is still room for it.  Returns the
  * number of bytes written to `out`, terminator included.
  */
 static size_t
 valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems,
     uint64_t bits, char *out, size_t outlen)
 {
 	ASSERT(out);
 	size_t n = 0;
 	for (int b = 0; b < nelems; b++) {
 		ASSERT3U(n, <=, outlen);
 		if (n == outlen)
 			break;
 		uint64_t mask = (1ULL << b);
 		if (bits & mask) {
 			const char *name = table[b].vb_name;
 			size_t namelen = strlen(name);
 			/* A separator is needed after the first name. */
 			size_t need = namelen + ((n > 0) ? 1 : 0);
 			/*
 			 * Compare against the space that is left.  The
 			 * previous form, n > outlen - len, wraps around
 			 * whenever the name is longer than the whole buffer
 			 * and then lets memcpy() overrun it.  n < outlen
 			 * here, so outlen - n cannot wrap.
 			 */
 			if (need > outlen - n)
 				break;
 			if (n > 0)
 				out[n++] = ' ';
 			memcpy(&out[n], name, namelen);
 			n += namelen;
 		}
 	}
 	if (n < outlen)
 		out[n++] = '\0';
 	return (n);
 }
 
 /*
  * Emits the name of the given enum value in the table.
  *
  * Returns 0 without touching `out` when `v` is not a valid table index.
  * Otherwise the name is copied with strlcpy() and the return value is the
  * name's length, clamped to `outlen` when the name had to be truncated.
  */
 static size_t
 valstr_enum_str(const char **table, const size_t nelems,
     int v, char *out, size_t outlen)
 {
 	ASSERT(out);
 	ASSERT3U(v, <, nelems);
 	if (v >= nelems)
 		return (0);
 	/*
 	 * Do the copy once, outside MIN().  A MIN() that expands its
 	 * arguments more than once would otherwise repeat the strlcpy().
 	 */
 	size_t srclen = strlcpy(out, table[v], outlen);
 	return (MIN(srclen, outlen));
 }
 
 /*
  * These macros create the string tables for the given name, and implement
  * the public functions described in zfs_valstr.h.
  *
  * _VALSTR_BITFIELD_IMPL(name, ...) defines valstr_<name>_table[] from the
  * variadic initializers, plus zfs_valstr_<name>_bits(),
  * zfs_valstr_<name>_pairs() and zfs_valstr_<name>(), which pass that table
  * to valstr_bitfield_bits(), valstr_bitfield_pairs() and
  * valstr_bitfield_str() respectively.
  *
  * _VALSTR_ENUM_IMPL(name, ...) defines valstr_<name>_table[] and
  * zfs_valstr_<name>(), which passes that table to valstr_enum_str().
  *
  * Both macros are #undef'd at the end of this file.
  */
 #define	_VALSTR_BITFIELD_IMPL(name, ...)				\
 static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\
 size_t									\
 zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen)	\
 {									\
 	return (valstr_bitfield_bits(valstr_ ## name ## _table,		\
 	    ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen));	\
 }									\
 									\
 size_t									\
 zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen)	\
 {									\
 	return (valstr_bitfield_pairs(valstr_ ## name ## _table,	\
 	    ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen));	\
 }									\
 									\
 size_t									\
 zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen)		\
 {									\
 	return (valstr_bitfield_str(valstr_ ## name ## _table,		\
 	    ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen));	\
 }									\
 
 #define	_VALSTR_ENUM_IMPL(name, ...)					\
 static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ };	\
 size_t									\
 zfs_valstr_ ## name(int v, char *out, size_t outlen)			\
 {									\
 	return (valstr_enum_str(valstr_ ## name ## _table,		\
 	    ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen));	\
 }									\
 
 
 /* String tables */
 
 /* ZIO flags: zio_flag_t, typically zio->io_flags */
-/* BEGIN CSTYLED */
 _VALSTR_BITFIELD_IMPL(zio_flag,
 	/* Order matters: entry N describes bit N (1ULL << N). */
 	{ '.', "DA", "DONT_AGGREGATE" },
 	{ '.', "RP", "IO_REPAIR" },
 	{ '.', "SH", "SELF_HEAL" },
 	{ '.', "RS", "RESILVER" },
 	{ '.', "SC", "SCRUB" },
 	{ '.', "ST", "SCAN_THREAD" },
 	{ '.', "PH", "PHYSICAL" },
 	{ '.', "CF", "CANFAIL" },
 	{ '.', "SP", "SPECULATIVE" },
 	{ '.', "CW", "CONFIG_WRITER" },
 	{ '.', "DR", "DONT_RETRY" },
 	{ '?', "??", "[UNUSED 11]" },
 	{ '.', "ND", "NODATA" },
 	{ '.', "ID", "INDUCE_DAMAGE" },
 	{ '.', "AL", "IO_ALLOCATING" },
 	{ '.', "RE", "IO_RETRY" },
 	{ '.', "PR", "PROBE" },
 	{ '.', "TH", "TRYHARD" },
 	{ '.', "OP", "OPTIONAL" },
 	{ '.', "RD", "DIO_READ" },
 	{ '.', "DQ", "DONT_QUEUE" },
 	{ '.', "DP", "DONT_PROPAGATE" },
 	{ '.', "BY", "IO_BYPASS" },
 	{ '.', "RW", "IO_REWRITE" },
 	{ '.', "CM", "RAW_COMPRESS" },
 	{ '.', "EN", "RAW_ENCRYPT" },
 	{ '.', "GG", "GANG_CHILD" },
 	{ '.', "DD", "DDT_CHILD" },
 	{ '.', "GF", "GODFATHER" },
 	{ '.', "NP", "NOPWRITE" },
 	{ '.', "EX", "REEXECUTED" },
 	{ '.', "DG", "DELEGATED" },
 	{ '.', "DC", "DIO_CHKSUM_ERR" },
 )
-/* END CSTYLED */
 
 /*
  * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or
  *                        zio->io_pipeline.
  */
-/* BEGIN CSTYLED */
 _VALSTR_BITFIELD_IMPL(zio_stage,
 	/*
 	 * Order matters: entry N describes bit N (1ULL << N).
 	 * NOTE(review): the pair code "DC" appears twice (DVA_CLAIM and
 	 * DIO_CHECKSUM_VERIFY), so the pairs rendering cannot tell those
 	 * two stages apart -- confirm whether that is intended.
 	 */
 	{ 'O', "O ", "OPEN" },
 	{ 'I', "RI", "READ_BP_INIT" },
 	{ 'I', "WI", "WRITE_BP_INIT" },
 	{ 'I', "FI", "FREE_BP_INIT" },
 	{ 'A', "IA", "ISSUE_ASYNC" },
 	{ 'W', "WC", "WRITE_COMPRESS" },
 	{ 'E', "EN", "ENCRYPT" },
 	{ 'C', "CG", "CHECKSUM_GENERATE" },
 	{ 'N', "NW", "NOP_WRITE" },
 	{ 'B', "BF", "BRT_FREE" },
 	{ 'd', "dS", "DDT_READ_START" },
 	{ 'd', "dD", "DDT_READ_DONE" },
 	{ 'd', "dW", "DDT_WRITE" },
 	{ 'd', "dF", "DDT_FREE" },
 	{ 'G', "GA", "GANG_ASSEMBLE" },
 	{ 'G', "GI", "GANG_ISSUE" },
 	{ 'D', "DT", "DVA_THROTTLE" },
 	{ 'D', "DA", "DVA_ALLOCATE" },
 	{ 'D', "DF", "DVA_FREE" },
 	{ 'D', "DC", "DVA_CLAIM" },
 	{ 'R', "R ", "READY" },
 	{ 'V', "VS", "VDEV_IO_START" },
 	{ 'V', "VD", "VDEV_IO_DONE" },
 	{ 'V', "VA", "VDEV_IO_ASSESS" },
 	{ 'C', "CV", "CHECKSUM_VERIFY" },
 	{ 'C', "DC", "DIO_CHECKSUM_VERIFY" },
 	{ 'X', "X ", "DONE" },
 )
-/* END CSTYLED */
 
 /* ZIO priority: zio_priority_t, typically zio->io_priority */
-/* BEGIN CSTYLED */
 _VALSTR_ENUM_IMPL(zio_priority,
 	/* Order matters: the enum value is used directly as the index. */
 	"SYNC_READ",
 	"SYNC_WRITE",
 	"ASYNC_READ",
 	"ASYNC_WRITE",
 	"SCRUB",
 	"REMOVAL",
 	"INITIALIZING",
 	"TRIM",
 	"REBUILD",
 	"[NUM_QUEUEABLE]",
 	"NOW",
 )
-/* END CSTYLED */
 
 #undef _VALSTR_BITFIELD_IMPL
 #undef _VALSTR_ENUM_IMPL
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index 9afee4e208ec..7d94214143ea 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -1,1483 +1,1481 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/brt.h>
 #include <sys/brt_impl.h>
 #include <sys/ddt.h>
 #include <sys/bitmap.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/kstat.h>
 #include <sys/wmsum.h>
 
 /*
  * Block Cloning design.
  *
  * Block Cloning allows one to manually clone a file (or a subset of its
  * blocks) into another (or the same) file by just creating additional
  * references to the data blocks without copying the data itself. Those
  * references are kept in the Block Reference Tables (BRTs).
  *
  * In many ways this is similar to the existing deduplication, but there are
  * some important differences:
  *
  * - Deduplication is automatic and Block Cloning is not - one has to use a
  *   dedicated system call(s) to clone the given file/blocks.
  * - Deduplication keeps all data blocks in its table, even those referenced
  *   just once. Block Cloning creates an entry in its tables only when there
  *   are at least two references to the given data block. If the block was
  *   never explicitly cloned or the second to last reference was dropped,
  *   there will be neither space nor performance overhead.
  * - Deduplication needs data to work - one needs to pass real data to the
  *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
  *   data, just block pointers to the data, so it is extremely fast, as we pay
  *   neither the cost of reading the data, nor the cost of writing the data -
  *   we operate exclusively on metadata.
  * - If the D (dedup) bit is not set in the block pointer, it means that
  *   the block is not in the dedup table (DDT) and we won't consult the DDT
  *   when we need to free the block. Block Cloning must be consulted on every
  *   free, because we cannot modify the source BP (eg. by setting something
  *   similar to the D bit), thus we have no hint if the block is in the
  *   Block Reference Table (BRT), so we need to look into the BRT. There is
  *   an optimization in place that allows us to eliminate the majority of BRT
  *   lookups which is described below in the "Minimizing free penalty" section.
  * - The BRT entry is much smaller than the DDT entry - for BRT we only store
  *   64bit offset and 64bit reference counter.
  * - Dedup keys are cryptographic hashes, so two blocks that are close to each
  *   other on disk are most likely in totally different parts of the DDT.
  *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
  *   from one file should have BRT entries close to each other.
  * - Scrub will only do a single pass over a block that is referenced multiple
  *   times in the DDT. Unfortunately this is not currently (if at all) possible
  *   with Block Cloning, so a block referenced multiple times will be scrubbed
  *   multiple times. The new, sorted scrub should be able to eliminate
  *   duplicated reads given enough memory.
  * - Deduplication requires cryptographically strong hash as a checksum or
  *   additional data verification. Block Cloning works with any checksum
  *   algorithm or even with checksumming disabled.
  *
  * As mentioned above, the BRT entries are much smaller than the DDT entries.
  * To uniquely identify a block we just need its vdev id and offset. We also
  * need to maintain a reference counter. The vdev id will often repeat, as there
  * is a small number of top-level VDEVs and a large number of blocks stored in
  * each VDEV. We take advantage of that to reduce the BRT entry size further by
  * maintaining one BRT for each top-level VDEV, so we can then have only offset
  * and counter as the BRT entry.
  *
  * Minimizing free penalty.
  *
  * Block Cloning allows creating additional references to any existing block.
  * When we free a block there is no hint in the block pointer whether the block
  * was cloned or not, so on each free we have to check if there is a
  * corresponding entry in the BRT or not. If there is, we need to decrease
  * the reference counter. Doing BRT lookup on every free can potentially be
  * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
  * This is the main problem with deduplication, so we've learned our lesson and
  * try not to repeat the same mistake here. How do we do that? We divide each
  * top-level VDEV into 16MB regions. For each region we maintain a counter that
  * is a sum of all the BRT entries that have offsets within the region. This
  * creates the entries count array of 16bit numbers for each top-level VDEV.
  * The entries count array is always kept in memory and updated on disk in the
  * same transaction group as the BRT updates to keep everything in-sync. We can
  * keep the array in memory, because it is very small. With 16MB regions and
  * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
  * the region size even further in the future). Now, when we want to free
  * a block, we first consult the array. If the counter for the whole region is
  * zero, there is no need to look for the BRT entry, as there isn't one for
  * sure. If the counter for the region is greater than zero, only then we will
  * do a BRT lookup and if an entry is found we will decrease the reference
  * counter in the BRT entry and in the entry counters array.
  *
  * The entry counters array is small, but can potentially be larger for very
  * large VDEVs or smaller regions. In this case we don't want to rewrite entire
  * array on every change. We then divide the array into 32kB blocks and keep
  * a bitmap of dirty blocks within a transaction group. When we sync the
  * transaction group we can update only the parts of the entry counters array
  * that were modified. Note: Keeping track of the dirty parts of the entry
  * counters array is implemented, but updating only parts of the array on disk
  * is not yet implemented - for now we will update entire array if there was
  * any change.
  *
  * The implementation tries to be economic: if BRT is not used, or no longer
  * used, there will be no entries in the MOS and no additional memory used (eg.
  * the entry counters array is only allocated if needed).
  *
  * Interaction between Deduplication and Block Cloning.
  *
  * If both functionalities are in use, we could end up with a block that is
  * referenced multiple times in both DDT and BRT. When we free one of the
  * references we couldn't tell where it belongs, so we would have to decide
  * what table takes the precedence: do we first clear DDT references or BRT
  * references? To avoid this dilemma BRT cooperates with DDT - if a given block
  * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
  * lookup DDT entry instead and increase the counter there. No BRT entry
  * will be created for a block which has the D (dedup) bit set.
  * BRT may be more efficient for manual deduplication, but if the block is
  * already in the DDT, then creating additional BRT entry would be less
  * efficient. This clever idea was proposed by Allan Jude.
  *
  * Block Cloning across datasets.
  *
  * Block Cloning is not limited to cloning blocks within the same dataset.
  * It is possible (and very useful) to clone blocks between different datasets.
  * One use case is recovering files from snapshots. By cloning the files into
  * dataset we need no additional storage. Without Block Cloning we would need
  * additional space for those files.
  * Another interesting use case is moving the files between datasets
  * (copying the file content to the new dataset and removing the source file).
  * In that case Block Cloning will only be used briefly, because the BRT entries
  * will be removed when the source is removed.
  * Block Cloning across encrypted datasets is supported as long as both
  * datasets share the same master key (e.g. snapshots and clones).
  *
  * Block Cloning flow through ZFS layers.
  *
  * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
  * blocks. As of this writing no interface is implemented that allows for block
  * cloning within a ZVOL.
  * FreeBSD and Linux provide the copy_file_range(2) system call and we use it
  * for block cloning.
  *
  *	ssize_t
  *	copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
  *	                size_t len, unsigned int flags);
  *
  * Even though offsets and length represent bytes, they have to be
  * block-aligned or we will return an error so the upper layer can
  * fallback to the generic mechanism that will just copy the data.
  * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
  * This function was implemented based on zfs_write(), but instead of writing
  * the given data we first read block pointers using the new dmu_read_l0_bps()
  * function from the source file. Once we have BPs from the source file we call
  * the dmu_brt_clone() function on the destination file. This function
  * allocates BPs for us. We iterate over all source BPs. If the given BP is
  * a hole or an embedded block, we just copy BP as-is. If it points to a real
  * data we place this BP on a BRT pending list using the brt_pending_add()
  * function.
  *
  * We use this pending list to keep track of all BPs that got new references
  * within this transaction group.
  *
  * Some special cases to consider and how we address them:
  * - The block we want to clone may have been created within the same
  *   transaction group that we are trying to clone. Such block has no BP
  *   allocated yet, so cannot be immediately cloned. We return EAGAIN.
  * - The block we want to clone may have been modified within the same
  *   transaction group. We return EAGAIN.
  * - A block may be cloned multiple times during one transaction group (that's
  *   why pending list is actually a tree and not an append-only list - this
  *   way we can figure out faster if this block is cloned for the first time
  *   in this txg or consecutive time).
  * - A block may be cloned and freed within the same transaction group
  *   (see dbuf_undirty()).
  * - A block may be cloned and within the same transaction group the clone
  *   can be cloned again (see dmu_read_l0_bps()).
  * - A file might have been deleted, but the caller still has a file descriptor
  *   open to this file and clones it.
  *
  * When we free a block we have an additional step in the ZIO pipeline where we
  * call the zio_brt_free() function. We then call the brt_entry_decref()
  * that loads the corresponding BRT entry (if one exists) and decreases
  * reference counter. If this is not the last reference we will stop ZIO
  * pipeline here. If this is the last reference or the block is not in the
  * BRT, we continue the pipeline and free the block as usual.
  *
  * At the beginning of spa_sync() where there can be no more block cloning,
  * but before issuing frees we call brt_pending_apply(). This function applies
  * all the new clones to the BRT table - we load BRT entries and update
  * reference counters. To sync new BRT entries to disk, we use brt_sync()
  * function. This function will sync all dirty per-top-level-vdev BRTs,
  * the entry counters arrays, etc.
  *
  * Block Cloning and ZIL.
  *
  * Every clone operation is divided into chunks (similar to write) and each
  * chunk is cloned in a separate transaction. The chunk size is determined by
  * how many BPs we can fit into a single ZIL entry.
  * Replaying clone operation is different from the regular clone operation,
  * as when we log clone operations we cannot use the source object - it may
  * reside on a different dataset, so we log BPs we want to clone.
  * The ZIL is replayed when we mount the given dataset, not when the pool is
  * imported. Taking this into account it is possible that the pool is imported
  * without mounting datasets and the source dataset is destroyed before the
  * destination dataset is mounted and its ZIL replayed.
  * To address this situation we leverage zil_claim() mechanism where ZFS will
  * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
  * entries, we will bump reference counters for their BPs in the BRT.  Then
  * on mount and ZIL replay we bump the reference counters once more, while the
  * first references are dropped during ZIL destroy by zil_free_clone_range().
  * It is possible that after zil_claim() we never mount the destination, so
  * we never replay its ZIL and just destroy it.  In this case the only taken
  * references will be dropped by zil_free_clone_range(), since the cloning is
  * not going to ever take place.
  */
 
 /* kmem cache handle; its users are outside this part of the file. */
 static kmem_cache_t *brt_entry_cache;
 
 /*
  * Enable/disable prefetching of BRT entries that we are going to modify.
  */
 static int brt_zap_prefetch = 1;
 
 /*
  * Debug logging.  In ZFS_DEBUG builds BRT_DEBUG() forwards its arguments to
  * __dprintf() when ZFS_DEBUG_BRT is set in zfs_flags; in other builds it
  * expands to an empty statement.
  */
 #ifdef ZFS_DEBUG
 #define	BRT_DEBUG(...)	do {						\
 	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
 		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
 	}								\
 } while (0)
 #else
 #define	BRT_DEBUG(...)	do { } while (0)
 #endif
 
 /*
  * NOTE(review): presumably the log2 data and indirect block sizes used when
  * creating the BRT ZAP objects (12 would be 4 KiB) -- confirm at the users.
  */
 static int brt_zap_default_bs = 12;
 static int brt_zap_default_ibs = 12;
 
 /* kstat handle; presumably for exporting brt_stats -- confirm. */
 static kstat_t	*brt_ksp;
 
 /*
  * Named kstat entries for BRT.  The initializer of brt_stats below is
  * positional, so it must list its entries in this same field order.
  */
 typedef struct brt_stats {
 	kstat_named_t brt_addref_entry_not_on_disk;
 	kstat_named_t brt_addref_entry_on_disk;
 	kstat_named_t brt_decref_entry_in_memory;
 	kstat_named_t brt_decref_entry_loaded_from_disk;
 	kstat_named_t brt_decref_entry_not_in_memory;
 	kstat_named_t brt_decref_entry_read_lost_race;
 	kstat_named_t brt_decref_entry_still_referenced;
 	kstat_named_t brt_decref_free_data_later;
 	kstat_named_t brt_decref_free_data_now;
 	kstat_named_t brt_decref_no_entry;
 } brt_stats_t;
 
 /* Exported name and data type of each counter, all unsigned 64-bit. */
 static brt_stats_t brt_stats = {
 	{ "addref_entry_not_on_disk",		KSTAT_DATA_UINT64 },
 	{ "addref_entry_on_disk",		KSTAT_DATA_UINT64 },
 	{ "decref_entry_in_memory",		KSTAT_DATA_UINT64 },
 	{ "decref_entry_loaded_from_disk",	KSTAT_DATA_UINT64 },
 	{ "decref_entry_not_in_memory",		KSTAT_DATA_UINT64 },
 	{ "decref_entry_read_lost_race",	KSTAT_DATA_UINT64 },
 	{ "decref_entry_still_referenced",	KSTAT_DATA_UINT64 },
 	{ "decref_free_data_later",		KSTAT_DATA_UINT64 },
 	{ "decref_free_data_now",		KSTAT_DATA_UINT64 },
 	{ "decref_no_entry",			KSTAT_DATA_UINT64 }
 };
 
 /*
  * One wmsum accumulator per brt_stats_t field of the same name; these are
  * what BRTSTAT_BUMP() increments.
  * NOTE(review): presumably copied into brt_stats by a kstat update callback
  * that is outside this part of the file -- confirm.
  */
 struct {
 	wmsum_t brt_addref_entry_not_on_disk;
 	wmsum_t brt_addref_entry_on_disk;
 	wmsum_t brt_decref_entry_in_memory;
 	wmsum_t brt_decref_entry_loaded_from_disk;
 	wmsum_t brt_decref_entry_not_in_memory;
 	wmsum_t brt_decref_entry_read_lost_race;
 	wmsum_t brt_decref_entry_still_referenced;
 	wmsum_t brt_decref_free_data_later;
 	wmsum_t brt_decref_free_data_now;
 	wmsum_t brt_decref_no_entry;
 } brt_sums;
 
 /* Add 1 to the brt_sums counter named by `stat`. */
 #define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)
 
 /* Forward declarations for helpers that are used before their definition. */
 static int brt_entry_compare(const void *x1, const void *x2);
 static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs);
 
 /* Acquire the spa's BRT lock (spa_brt_lock) as a reader. */
 static void
 brt_rlock(spa_t *spa)
 {
 	rw_enter(&spa->spa_brt_lock, RW_READER);
 }
 
 /* Acquire the spa's BRT lock (spa_brt_lock) as a writer. */
 static void
 brt_wlock(spa_t *spa)
 {
 	rw_enter(&spa->spa_brt_lock, RW_WRITER);
 }
 
 /* Release the spa's BRT lock taken by brt_rlock() or brt_wlock(). */
 static void
 brt_unlock(spa_t *spa)
 {
 	rw_exit(&spa->spa_brt_lock);
 }
 
 /*
  * Read the reference counter of range idx, converting it to native byte
  * order when the array is kept in the opposite byte order.
  */
 static uint16_t
 brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
 {
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	uint16_t value = brtvd->bv_entcount[idx];
 
 	if (unlikely(brtvd->bv_need_byteswap))
 		value = BSWAP_16(value);
 
 	return (value);
 }
 
 /*
  * Store entcnt as the reference counter of range idx, byte-swapping it first
  * when the array is kept in non-native byte order.
  */
 static void
 brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
 {
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	uint16_t stored = entcnt;
 
 	if (unlikely(brtvd->bv_need_byteswap))
 		stored = BSWAP_16(entcnt);
 
 	brtvd->bv_entcount[idx] = stored;
 }
 
 /* Bump the reference counter of range idx by one. */
 static void
 brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
 {
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	const uint16_t current = brt_vdev_entcount_get(brtvd, idx);
 
 	/* The counter is 16 bits wide; it must not wrap around. */
 	ASSERT(current < UINT16_MAX);
 
 	brt_vdev_entcount_set(brtvd, idx, current + 1);
 }
 
 /* Drop the reference counter of range idx by one. */
 static void
 brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
 {
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	const uint16_t current = brt_vdev_entcount_get(brtvd, idx);
 
 	/* Decrementing a zero counter would underflow. */
 	ASSERT(current > 0);
 
 	brt_vdev_entcount_set(brtvd, idx, current - 1);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Log the in-memory state of a BRT vdev through zfs_dbgmsg(): the summary
  * fields, every non-zero per-range counter and, when the counters are dirty,
  * a per-block map of the dirty bitmap ('x' = bit set, '.' = bit clear).
  */
 static void
 brt_vdev_dump(brt_vdev_t *brtvd)
 {
 	const uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
 
 	zfs_dbgmsg("  BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d "
 	    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
 	    (u_longlong_t)brtvd->bv_vdevid,
 	    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
 	    (u_longlong_t)brtvd->bv_size,
 	    (u_longlong_t)brtvd->bv_totalcount,
 	    (u_longlong_t)nblocks,
 	    (size_t)BT_SIZEOFMAP(nblocks));
 
 	if (brtvd->bv_totalcount > 0) {
 		zfs_dbgmsg("    entcounts:");
 		for (uint64_t i = 0; i < brtvd->bv_size; i++) {
 			uint16_t entcnt = brt_vdev_entcount_get(brtvd, i);
 
 			/* Only ranges that hold references are printed. */
 			if (entcnt == 0)
 				continue;
 			zfs_dbgmsg("      [%04llu] %hu",
 			    (u_longlong_t)i, entcnt);
 		}
 	}
 
 	if (!brtvd->bv_entcount_dirty)
 		return;
 
 	/* One character per block plus the terminating NUL. */
 	char *map = kmem_alloc(nblocks + 1, KM_SLEEP);
 
 	for (uint64_t i = 0; i < nblocks; i++)
 		map[i] = BT_TEST(brtvd->bv_bitmap, i) ? 'x' : '.';
 	map[nblocks] = '\0';
 
 	zfs_dbgmsg("    dirty: %s", map);
 	kmem_free(map, nblocks + 1);
 }
 #endif
 
 /*
  * Return the brt_vdev_t tracking the vdev with the given id.
  *
  * When vdevid is not yet covered by spa_brt_vdevs[]:
  *  - with alloc set, the array is expanded to include it and the new
  *    element is returned;
  *  - otherwise NULL is returned.
  */
 static brt_vdev_t *
 brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc)
 {
 	brt_vdev_t *brtvd = NULL;
 
 	brt_rlock(spa);
 	if (vdevid < spa->spa_brt_nvdevs) {
 		brtvd = spa->spa_brt_vdevs[vdevid];
 	} else if (alloc) {
 		/* New VDEV was added. */
 		brt_unlock(spa);
 		brt_wlock(spa);
 		/*
 		 * The lock was dropped while switching from reader to
 		 * writer, so re-check the array size before expanding.
 		 */
 		if (vdevid >= spa->spa_brt_nvdevs)
 			brt_vdevs_expand(spa, vdevid + 1);
 		brtvd = spa->spa_brt_vdevs[vdevid];
 	}
 	brt_unlock(spa);
 	return (brtvd);
 }
 
 /*
  * Create the on-disk (MOS) objects for an initiated BRT vdev that has none
  * yet: the entries ZAP, the object holding the bv_entcount[] array, and the
  * pool directory entry that points at the latter.  Also bumps the
  * block_cloning feature reference count.
  */
 static void
 brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 {
 	char name[64];
 
 	ASSERT(brtvd->bv_initiated);
 	ASSERT0(brtvd->bv_mos_brtvdev);
 	ASSERT0(brtvd->bv_mos_entries);
 
 	uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0,
 	    ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
 	    brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
 	VERIFY(mos_entries != 0);
 	/* Keep the ZAP's dnode held; brtvd itself is used as the hold tag. */
 	VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd,
 	    &brtvd->bv_mos_entries_dnode));
 	/* Publish the object id only after the dnode hold is in place. */
 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
 	brtvd->bv_mos_entries = mos_entries;
 	rw_exit(&brtvd->bv_mos_entries_lock);
 	BRT_DEBUG("MOS entries created, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_entries);
 
 	/*
 	 * We allocate DMU buffer to store the bv_entcount[] array.
 	 * We will keep array size (bv_size) and cumulative count for all
 	 * bv_entcount[]s (bv_totalcount) in the bonus buffer.
 	 */
 	brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
 	    DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
 	VERIFY(brtvd->bv_mos_brtvdev != 0);
 	BRT_DEBUG("MOS BRT VDEV created, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
 
 	/* Register the object in the pool directory under a per-vdev name. */
 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
 	    (u_longlong_t)brtvd->bv_vdevid);
 	VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
 	BRT_DEBUG("Pool directory object created, object=%s", name);
 
 	spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
 }
 
 /*
  * (Re)allocate the in-memory bv_entcount[] array and dirty bitmap of a BRT
  * vdev so that they cover the vdev's current size, which is derived from
  * vdev_get_min_asize() divided by spa_brt_rangesize and rounded up.
  *
  * On the first call (vdev not yet initiated) fresh zeroed arrays are
  * installed and the vdev is marked initiated.  On later calls the old
  * contents are copied into the new arrays and the old ones are freed.
  * The caller must hold bv_lock as writer.
  */
 static void
 brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd)
 {
 	vdev_t *vd;
 	uint16_t *entcount;
 	ulong_t *bitmap;
 	uint64_t nblocks, onblocks, size;
 
 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
 
 	/* Number of ranges needed to cover the vdev, rounded up. */
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	vd = vdev_lookup_top(spa, brtvd->bv_vdevid);
 	size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1;
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
 	nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
 	bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
 
 	if (!brtvd->bv_initiated) {
 		ASSERT0(brtvd->bv_size);
 		ASSERT0P(brtvd->bv_entcount);
 		ASSERT0P(brtvd->bv_bitmap);
 	} else {
 		ASSERT(brtvd->bv_size > 0);
 		ASSERT(brtvd->bv_entcount != NULL);
 		ASSERT(brtvd->bv_bitmap != NULL);
 		/*
 		 * TODO: Allow vdev shrinking. We only need to implement
 		 * shrinking the on-disk BRT VDEV object.
 		 * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
 		 *     offset, size, tx);
 		 */
 		ASSERT3U(brtvd->bv_size, <=, size);
 
 		/* Carry the existing counters and dirty bits over. */
 		memcpy(entcount, brtvd->bv_entcount,
 		    sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
 		vmem_free(brtvd->bv_entcount,
 		    sizeof (entcount[0]) * brtvd->bv_size);
 		onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
 		memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
 		    BT_SIZEOFMAP(onblocks)));
 		kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks));
 	}
 
 	brtvd->bv_size = size;
 	brtvd->bv_entcount = entcount;
 	brtvd->bv_bitmap = bitmap;
 	if (!brtvd->bv_initiated) {
 		/* Freshly created counters are kept in native byte order. */
 		brtvd->bv_need_byteswap = FALSE;
 		brtvd->bv_initiated = TRUE;
 		BRT_DEBUG("BRT VDEV %llu initiated.",
 		    (u_longlong_t)brtvd->bv_vdevid);
 	}
 }
 
 /*
  * Load the in-memory state of a BRT vdev from its MOS objects.
  *
  * brtvd->bv_mos_brtvdev must already name the on-disk BRT vdev object and
  * the vdev must not be initiated yet.  The function allocates the in-memory
  * arrays through brt_vdev_realloc() (so the caller must hold bv_lock as
  * writer), reads the bv_entcount[] array, takes a hold on the entries ZAP
  * dnode and copies the counters stored in the bonus buffer.
  *
  * Returns 0 on success, or the error reported by dmu_bonus_hold() or
  * dmu_read().  The bonus buffer hold is released on every return path.
  */
 static int
 brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd)
 {
 	dmu_buf_t *db;
 	brt_vdev_phys_t *bvphys;
 	int error;
 
 	ASSERT(!brtvd->bv_initiated);
 	ASSERT(brtvd->bv_mos_brtvdev != 0);
 
 	error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
 	    FTAG, &db);
 	if (error != 0)
 		return (error);
 
 	bvphys = db->db_data;
 	/* The first vdev loaded defines the pool's range size. */
 	if (spa->spa_brt_rangesize == 0) {
 		spa->spa_brt_rangesize = bvphys->bvp_rangesize;
 	} else {
 		ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize);
 	}
 
 	brt_vdev_realloc(spa, brtvd);
 
 	/* TODO: We don't support VDEV shrinking. */
 	ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
 
 	/*
 	 * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
 	 */
 	error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
 	    MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
 	    brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
 	if (error != 0) {
 		/* Do not leak the bonus buffer hold taken above. */
 		dmu_buf_rele(db, FTAG);
 		return (error);
 	}
 
 	ASSERT(bvphys->bvp_mos_entries != 0);
 	/* Keep the entries ZAP dnode held; brtvd is used as the hold tag. */
 	VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd,
 	    &brtvd->bv_mos_entries_dnode));
 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
 	brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
 	rw_exit(&brtvd->bv_mos_entries_lock);
 	/* Counters written with the other byte order are swapped on access. */
 	brtvd->bv_need_byteswap =
 	    (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
 	brtvd->bv_totalcount = bvphys->bvp_totalcount;
 	brtvd->bv_usedspace = bvphys->bvp_usedspace;
 	brtvd->bv_savedspace = bvphys->bvp_savedspace;
 
 	dmu_buf_rele(db, FTAG);
 
 	BRT_DEBUG("BRT VDEV %llu loaded: mos_brtvdev=%llu, mos_entries=%llu",
 	    (u_longlong_t)brtvd->bv_vdevid,
 	    (u_longlong_t)brtvd->bv_mos_brtvdev,
 	    (u_longlong_t)brtvd->bv_mos_entries);
 	return (0);
 }
 
 /*
  * Free the in-memory bv_entcount[] array and dirty bitmap of a BRT vdev and
  * mark the vdev as not initiated.  The caller must hold bv_lock as writer
  * and bv_tree must be empty.
  */
 static void
 brt_vdev_dealloc(brt_vdev_t *brtvd)
 {
 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
 	ASSERT(brtvd->bv_initiated);
 	ASSERT0(avl_numnodes(&brtvd->bv_tree));
 
 	/* Both allocation sizes derive from bv_size; use it before reset. */
 	const uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
 
 	vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
 	kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks));
 
 	brtvd->bv_entcount = NULL;
 	brtvd->bv_bitmap = NULL;
 	brtvd->bv_size = 0;
 	brtvd->bv_initiated = FALSE;
 
 	BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
 }
 
 /*
  * Destroy the on-disk (MOS) objects of a BRT vdev that no longer tracks any
  * references, then free its in-memory arrays.  This undoes
  * brt_vdev_create(): the entries ZAP, the BRT vdev object and the pool
  * directory entry are removed and the block_cloning feature reference count
  * is decremented.
  */
 static void
 brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 {
 	char name[64];
 	uint64_t count;
 
 	ASSERT(brtvd->bv_initiated);
 	ASSERT(brtvd->bv_mos_brtvdev != 0);
 	ASSERT(brtvd->bv_mos_entries != 0);
 	ASSERT0(brtvd->bv_totalcount);
 	ASSERT0(brtvd->bv_usedspace);
 	ASSERT0(brtvd->bv_savedspace);
 
 	/*
 	 * Clear bv_mos_entries under its lock before dropping the dnode hold;
 	 * brt_prefetch() re-checks the field under the same lock before it
 	 * uses the dnode.
 	 */
 	uint64_t mos_entries = brtvd->bv_mos_entries;
 	rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER);
 	brtvd->bv_mos_entries = 0;
 	rw_exit(&brtvd->bv_mos_entries_lock);
 	dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
 	brtvd->bv_mos_entries_dnode = NULL;
 	/* The ZAP is expected to be empty by now (checked in debug builds). */
 	ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count));
 	ASSERT0(count);
 	VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx));
 	BRT_DEBUG("MOS entries destroyed, object=%llu",
 	    (u_longlong_t)mos_entries);
 
 	VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
 	    tx));
 	BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
 	    (u_longlong_t)brtvd->bv_mos_brtvdev);
 	brtvd->bv_mos_brtvdev = 0;
 	brtvd->bv_entcount_dirty = FALSE;
 
 	snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
 	    (u_longlong_t)brtvd->bv_vdevid);
 	VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, tx));
 	BRT_DEBUG("Pool directory object removed, object=%s", name);
 
 	brtvd->bv_meta_dirty = FALSE;
 
 	rw_enter(&brtvd->bv_lock, RW_WRITER);
 	brt_vdev_dealloc(brtvd);
 	rw_exit(&brtvd->bv_lock);
 
 	spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx);
 }
 
 /*
  * Grow spa_brt_vdevs[] to hold nvdevs elements.  Existing brt_vdev_t
  * pointers are carried over to the new array; every new slot gets a freshly
  * allocated, not yet initiated brt_vdev_t with its locks and AVL trees set
  * up.  A no-op when the array already has nvdevs elements.
  * The caller must hold spa_brt_lock as writer.
  */
 static void
 brt_vdevs_expand(spa_t *spa, uint64_t nvdevs)
 {
 	brt_vdev_t **vdevs;
 
 	ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock));
 	ASSERT3U(nvdevs, >=, spa->spa_brt_nvdevs);
 
 	if (nvdevs == spa->spa_brt_nvdevs)
 		return;
 
 	vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP);
 	if (spa->spa_brt_nvdevs > 0) {
 		ASSERT(spa->spa_brt_vdevs != NULL);
 
 		/* Only the pointer array moves; the elements stay in place. */
 		memcpy(vdevs, spa->spa_brt_vdevs,
 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
 		kmem_free(spa->spa_brt_vdevs,
 		    sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs);
 	}
 	spa->spa_brt_vdevs = vdevs;
 
 	for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP);
 		rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL);
 		brtvd->bv_vdevid = vdevid;
 		brtvd->bv_initiated = FALSE;
 		rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL);
 		avl_create(&brtvd->bv_tree, brt_entry_compare,
 		    sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
 		/* One pending tree per in-flight TXG slot. */
 		for (int i = 0; i < TXG_SIZE; i++) {
 			avl_create(&brtvd->bv_pending_tree[i],
 			    brt_entry_compare, sizeof (brt_entry_t),
 			    offsetof(brt_entry_t, bre_node));
 		}
 		mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL);
 		spa->spa_brt_vdevs[vdevid] = brtvd;
 	}
 
 	BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
 	    (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs);
 	spa->spa_brt_nvdevs = nvdevs;
 }
 
 /*
  * Return TRUE if the range containing the given vdev offset has a non-zero
  * reference counter, i.e. the BRT may hold an entry for that offset.
  * Offsets past the end of bv_entcount[] always yield FALSE.
  */
 static boolean_t
 brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, uint64_t offset)
 {
 	const uint64_t range = offset / spa->spa_brt_rangesize;
 
 	/* The array does not cover this range, so nothing is tracked. */
 	if (range >= brtvd->bv_size)
 		return (FALSE);
 
 	/* VDEV wasn't expanded. */
 	return (brt_vdev_entcount_get(brtvd, range) > 0);
 }
 
 /*
  * Account for count new references to the block described by bre on this
  * vdev.
  *
  * bv_savedspace always grows by dsize for each new reference.  When the
  * entry had no references before (bre_count == 0), the block is also added
  * to bv_usedspace, the counter of its range is incremented and the range is
  * marked dirty so it gets written out by brt_vdev_sync().
  */
 static void
 brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
     uint64_t dsize, uint64_t count)
 {
 	uint64_t idx;
 
 	ASSERT(brtvd->bv_initiated);
 
 	brtvd->bv_savedspace += dsize * count;
 	brtvd->bv_meta_dirty = TRUE;
 
 	/* The entry was already referenced; its range is already counted. */
 	if (bre->bre_count > 0)
 		return;
 
 	brtvd->bv_usedspace += dsize;
 
 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
 	if (idx >= brtvd->bv_size) {
 		/* VDEV has been expanded. */
 		rw_enter(&brtvd->bv_lock, RW_WRITER);
 		brt_vdev_realloc(spa, brtvd);
 		rw_exit(&brtvd->bv_lock);
 	}
 
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	brtvd->bv_totalcount++;
 	brt_vdev_entcount_inc(brtvd, idx);
 	brtvd->bv_entcount_dirty = TRUE;
 	/*
 	 * NOTE(review): the bitmap index is derived as idx / BRT_BLOCKSIZE / 8;
 	 * confirm this matches the block granularity that
 	 * BRT_RANGESIZE_TO_NBLOCKS() uses to size the bitmap.
 	 */
 	idx = idx / BRT_BLOCKSIZE / 8;
 	BT_SET(brtvd->bv_bitmap, idx);
 }
 
 /*
  * Account for one dropped reference to the block described by bre on this
  * vdev.  The caller has already decremented bre->bre_count and must hold
  * bv_lock as writer.
  *
  * bv_savedspace always shrinks by dsize.  When that was the last reference
  * (bre_count == 0), the block is also removed from bv_usedspace, the counter
  * of its range is decremented and the range is marked dirty.
  */
 static void
 brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre,
     uint64_t dsize)
 {
 	uint64_t idx;
 
 	ASSERT(RW_WRITE_HELD(&brtvd->bv_lock));
 	ASSERT(brtvd->bv_initiated);
 
 	brtvd->bv_savedspace -= dsize;
 	brtvd->bv_meta_dirty = TRUE;
 
 	/* Other references remain; the range counter is left untouched. */
 	if (bre->bre_count > 0)
 		return;
 
 	brtvd->bv_usedspace -= dsize;
 
 	idx = BRE_OFFSET(bre) / spa->spa_brt_rangesize;
 	ASSERT3U(idx, <, brtvd->bv_size);
 
 	ASSERT(brtvd->bv_totalcount > 0);
 	brtvd->bv_totalcount--;
 	brt_vdev_entcount_dec(brtvd, idx);
 	brtvd->bv_entcount_dirty = TRUE;
 	idx = idx / BRT_BLOCKSIZE / 8;
 	BT_SET(brtvd->bv_bitmap, idx);
 }
 
 /*
  * Write the in-memory state of a dirty BRT vdev to its MOS object in
  * syncing context: the whole bv_entcount[] array if any counter changed,
  * and the summary fields kept in the bonus buffer.  Clears the dirty flags
  * and the dirty bitmap afterwards.
  */
 static void
 brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	brt_vdev_phys_t *bvphys;
 
 	ASSERT(brtvd->bv_meta_dirty);
 	ASSERT(brtvd->bv_mos_brtvdev != 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev,
 	    FTAG, &db));
 
 	if (brtvd->bv_entcount_dirty) {
 		/*
 		 * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
 		 */
 		dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0,
 		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
 		    brtvd->bv_entcount, tx);
 		uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size);
 		memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks));
 		brtvd->bv_entcount_dirty = FALSE;
 	}
 
 	dmu_buf_will_dirty(db, tx);
 	bvphys = db->db_data;
 	bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
 	bvphys->bvp_size = brtvd->bv_size;
 	/*
 	 * The counters are written back in the byte order they were loaded
 	 * in, so record which order that is.
 	 */
 	if (brtvd->bv_need_byteswap) {
 		bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
 	} else {
 		bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
 	}
 	bvphys->bvp_totalcount = brtvd->bv_totalcount;
 	bvphys->bvp_rangesize = spa->spa_brt_rangesize;
 	bvphys->bvp_usedspace = brtvd->bv_usedspace;
 	bvphys->bvp_savedspace = brtvd->bv_savedspace;
 	dmu_buf_rele(db, FTAG);
 
 	brtvd->bv_meta_dirty = FALSE;
 }
 
 /*
  * Free every brt_vdev_t in spa_brt_vdevs[] together with its in-memory
  * arrays, dnode hold, locks and (empty) AVL trees, then free the pointer
  * array itself.  A no-op when the array was never allocated.
  */
 static void
 brt_vdevs_free(spa_t *spa)
 {
 	if (spa->spa_brt_vdevs == 0)
 		return;
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		/* brt_vdev_dealloc() requires bv_lock held as writer. */
 		rw_enter(&brtvd->bv_lock, RW_WRITER);
 		if (brtvd->bv_initiated)
 			brt_vdev_dealloc(brtvd);
 		rw_exit(&brtvd->bv_lock);
 		rw_destroy(&brtvd->bv_lock);
 		/* Drop the hold taken in brt_vdev_create()/brt_vdev_load(). */
 		if (brtvd->bv_mos_entries != 0)
 			dnode_rele(brtvd->bv_mos_entries_dnode, brtvd);
 		rw_destroy(&brtvd->bv_mos_entries_lock);
 		avl_destroy(&brtvd->bv_tree);
 		for (int i = 0; i < TXG_SIZE; i++)
 			avl_destroy(&brtvd->bv_pending_tree[i]);
 		mutex_destroy(&brtvd->bv_pending_lock);
 		kmem_free(brtvd, sizeof (*brtvd));
 	}
 	kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) *
 	    spa->spa_brt_nvdevs);
 }
 
 /*
  * Initialize bre as a search key for bp: the block pointer is copied and
  * both reference counts are zeroed.  The vdev id of bp's first DVA is
  * returned through vdevidp.
  */
 static void
 brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
 {
 	const uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
 
 	bre->bre_bp = *bp;
 	bre->bre_pcount = 0;
 	bre->bre_count = 0;
 
 	*vdevidp = vdevid;
 }
 
 /*
  * Look up the on-disk reference count of bre in the vdev's entries ZAP,
  * keyed by the entry's offset.  On success bre->bre_count holds the stored
  * count; otherwise the ZAP lookup error is returned.
  */
 static int
 brt_entry_lookup(brt_vdev_t *brtvd, brt_entry_t *bre)
 {
 	uint64_t key = BRE_OFFSET(bre);
 	int error;
 
 	error = zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
 	    &key, BRT_KEY_WORDS, 1, sizeof (bre->bre_count), &bre->bre_count);
 
 	return (error);
 }
 
 /*
  * Cheap pre-check: return TRUE if the BRT _may_ hold an entry for this bp.
  * A TRUE result can be a false positive, but a FALSE result lets the caller
  * skip the real BRT lookup, which may require reads and is therefore more
  * expensive.
  */
 boolean_t
 brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
 {
 	/* No BRT vdevs at all means no entries anywhere. */
 	if (spa->spa_brt_nvdevs == 0)
 		return (B_FALSE);
 
 	brt_vdev_t *brtvd = brt_vdev(spa, DVA_GET_VDEV(&bp->blk_dva[0]),
 	    B_FALSE);
 	if (brtvd == NULL)
 		return (FALSE);
 	if (!brtvd->bv_initiated)
 		return (FALSE);
 
 	/*
 	 * We don't need locks here, since bv_entcount pointer must be
 	 * stable at this point, and we don't care about false positive
 	 * races here, while false negative should be impossible, since
 	 * all brt_vdev_addref() have already completed by this point.
 	 */
 	return (brt_vdev_lookup(spa, brtvd,
 	    DVA_GET_OFFSET(&bp->blk_dva[0])));
 }
 
 /* Return the sum of bv_savedspace over all BRT vdevs of the pool. */
 uint64_t
 brt_get_dspace(spa_t *spa)
 {
 	uint64_t total = 0;
 
 	if (spa->spa_brt_nvdevs == 0)
 		return (0);
 
 	brt_rlock(spa);
 	for (uint64_t i = 0; i < spa->spa_brt_nvdevs; i++) {
 		total += spa->spa_brt_vdevs[i]->bv_savedspace;
 	}
 	brt_unlock(spa);
 
 	return (total);
 }
 
 /* Return the sum of bv_usedspace over all BRT vdevs of the pool. */
 uint64_t
 brt_get_used(spa_t *spa)
 {
 	uint64_t total = 0;
 
 	if (spa->spa_brt_nvdevs == 0)
 		return (0);
 
 	brt_rlock(spa);
 	for (uint64_t i = 0; i < spa->spa_brt_nvdevs; i++) {
 		total += spa->spa_brt_vdevs[i]->bv_usedspace;
 	}
 	brt_unlock(spa);
 
 	return (total);
 }
 
 /* Return the space saved by block cloning; same value as brt_get_dspace(). */
 uint64_t
 brt_get_saved(spa_t *spa)
 {
 	return (brt_get_dspace(spa));
 }
 
 /*
  * Return the cloning ratio in percent: (used + saved) * 100 / used.
  * With nothing used the ratio is reported as 100.
  */
 uint64_t
 brt_get_ratio(spa_t *spa)
 {
 	const uint64_t used = brt_get_used(spa);
 
 	/* Guard against division by zero; saved is only needed otherwise. */
 	return (used == 0 ? 100 : (used + brt_get_saved(spa)) * 100 / used);
 }
 
 /*
  * kstat update callback: copy the current value of every counter in
  * brt_sums into the matching field of the kstat data.  The kstat is
  * read-only, so write requests fail with EACCES.
  */
 static int
 brt_kstats_update(kstat_t *ksp, int rw)
 {
 	brt_stats_t *bs = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 /* The kstat fields and the brt_sums members share their names. */
 #define	BRT_KSTAT_COPY(stat)	\
 	bs->stat.value.ui64 = wmsum_value(&brt_sums.stat)
 
 	BRT_KSTAT_COPY(brt_addref_entry_not_on_disk);
 	BRT_KSTAT_COPY(brt_addref_entry_on_disk);
 	BRT_KSTAT_COPY(brt_decref_entry_in_memory);
 	BRT_KSTAT_COPY(brt_decref_entry_loaded_from_disk);
 	BRT_KSTAT_COPY(brt_decref_entry_not_in_memory);
 	BRT_KSTAT_COPY(brt_decref_entry_read_lost_race);
 	BRT_KSTAT_COPY(brt_decref_entry_still_referenced);
 	BRT_KSTAT_COPY(brt_decref_free_data_later);
 	BRT_KSTAT_COPY(brt_decref_free_data_now);
 	BRT_KSTAT_COPY(brt_decref_no_entry);
 
 #undef	BRT_KSTAT_COPY
 
 	return (0);
 }
 
 /*
  * Initialize all BRT counters and register the "brtstats" kstat.  If the
  * kstat cannot be created the counters still work; they are just not
  * exported.
  */
 static void
 brt_stat_init(void)
 {
 
 	wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
 	wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
 	wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
 	wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
 	wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
 	wmsum_init(&brt_sums.brt_decref_no_entry, 0);
 
 	/* The entry count is derived from the size of brt_stats. */
 	brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (brt_ksp != NULL) {
 		brt_ksp->ks_data = &brt_stats;
 		brt_ksp->ks_update = brt_kstats_update;
 		kstat_install(brt_ksp);
 	}
 }
 
 /*
  * Undo brt_stat_init(): remove the kstat (if it was created) and then tear
  * down the counters it reads from.
  */
 static void
 brt_stat_fini(void)
 {
 	if (brt_ksp != NULL) {
 		kstat_delete(brt_ksp);
 		brt_ksp = NULL;
 	}
 
 	wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
 	wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
 	wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
 	wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
 	wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
 	wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
 	wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
 	wmsum_fini(&brt_sums.brt_decref_free_data_later);
 	wmsum_fini(&brt_sums.brt_decref_free_data_now);
 	wmsum_fini(&brt_sums.brt_decref_no_entry);
 }
 
 /* Module-wide BRT setup: create the entry cache and the statistics. */
 void
 brt_init(void)
 {
 	brt_entry_cache = kmem_cache_create("brt_entry_cache",
 	    sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	brt_stat_init();
 }
 
 /* Module-wide BRT teardown; reverses brt_init(). */
 void
 brt_fini(void)
 {
 	brt_stat_fini();
 
 	kmem_cache_destroy(brt_entry_cache);
 }
 
 /*
  * Drop one BRT reference on the block described by bp.
  *
  * Return TRUE if block should be freed immediately, i.e. the BRT holds no
  * reference for it (no entry at all, or an entry whose count is already
  * zero).  Return FALSE when a reference was dropped here; the in-memory
  * entry stays in bv_tree until the next brt_sync_table().
  */
 boolean_t
 brt_entry_decref(spa_t *spa, const blkptr_t *bp)
 {
 	brt_entry_t *bre, *racebre;
 	brt_entry_t bre_search;
 	avl_index_t where;
 	uint64_t vdevid;
 	int error;
 
 	brt_entry_fill(bp, &bre_search, &vdevid);
 
 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
 	ASSERT(brtvd != NULL);
 
 	/* Fast path: the entry is already cached in memory. */
 	rw_enter(&brtvd->bv_lock, RW_WRITER);
 	ASSERT(brtvd->bv_initiated);
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre != NULL) {
 		BRTSTAT_BUMP(brt_decref_entry_in_memory);
 		goto out;
 	} else {
 		BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
 	}
 	/* Drop the lock across the ZAP lookup. */
 	rw_exit(&brtvd->bv_lock);
 
 	error = brt_entry_lookup(brtvd, &bre_search);
 	/* bre_search now contains correct bre_count */
 	if (error == ENOENT) {
 		BRTSTAT_BUMP(brt_decref_no_entry);
 		return (B_TRUE);
 	}
 	ASSERT0(error);
 
 	rw_enter(&brtvd->bv_lock, RW_WRITER);
 	racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
 	if (racebre != NULL) {
 		/* The entry was added when the lock was dropped. */
 		BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
 		bre = racebre;
 		goto out;
 	}
 
 	/* Cache the on-disk entry in memory so it can be modified. */
 	BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
 	bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
 	bre->bre_bp = bre_search.bre_bp;
 	bre->bre_count = bre_search.bre_count;
 	bre->bre_pcount = 0;
 	avl_insert(&brtvd->bv_tree, bre, where);
 
 out:
 	/* bv_lock is held as writer on every path reaching this label. */
 	if (bre->bre_count == 0) {
 		rw_exit(&brtvd->bv_lock);
 		BRTSTAT_BUMP(brt_decref_free_data_now);
 		return (B_TRUE);
 	}
 
 	/*
 	 * bre_pcount tracks the net change of the entry; brt_sync_entry()
 	 * skips the ZAP update when it is zero.
 	 */
 	bre->bre_pcount--;
 	ASSERT(bre->bre_count > 0);
 	bre->bre_count--;
 	if (bre->bre_count == 0)
 		BRTSTAT_BUMP(brt_decref_free_data_later);
 	else
 		BRTSTAT_BUMP(brt_decref_entry_still_referenced);
 	brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp));
 
 	rw_exit(&brtvd->bv_lock);
 
 	return (B_FALSE);
 }
 
 /*
  * Return the BRT reference count of the block described by bp.  The
  * in-memory tree is consulted first and the on-disk ZAP second; a block
  * with no entry in either has a count of zero.
  */
 uint64_t
 brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
 {
 	brt_entry_t bre_search, *bre;
 	uint64_t vdevid, refcnt;
 	int error;
 
 	brt_entry_fill(bp, &bre_search, &vdevid);
 
 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
 	ASSERT(brtvd != NULL);
 
 	rw_enter(&brtvd->bv_lock, RW_READER);
 	ASSERT(brtvd->bv_initiated);
 	bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
 	if (bre != NULL) {
 		/* Read the count while the entry is protected by the lock. */
 		refcnt = bre->bre_count;
 		rw_exit(&brtvd->bv_lock);
 		return (refcnt);
 	}
 	rw_exit(&brtvd->bv_lock);
 
 	/* Not in memory: fall back to the on-disk entry. */
 	error = brt_entry_lookup(brtvd, &bre_search);
 	if (error == ENOENT)
 		return (0);
 
 	ASSERT0(error);
 	refcnt = bre_search.bre_count;
 	return (refcnt);
 }
 
 /*
  * Issue a ZAP prefetch for the BRT entry of bp, unless prefetching is
  * disabled by brt_zap_prefetch or the vdev has no entries ZAP.
  */
 static void
 brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp)
 {
 	/* Unlocked check to skip the lock in the common negative cases. */
 	if (brt_zap_prefetch == 0 || brtvd->bv_mos_entries == 0)
 		return;
 
 	uint64_t key = DVA_GET_OFFSET(&bp->blk_dva[0]);
 
 	rw_enter(&brtvd->bv_mos_entries_lock, RW_READER);
 	/* Re-check under the lock: the ZAP may have been destroyed. */
 	if (brtvd->bv_mos_entries != 0) {
 		(void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode,
 		    &key, BRT_KEY_WORDS);
 	}
 	rw_exit(&brtvd->bv_mos_entries_lock);
 }
 
 static int
 brt_entry_compare(const void *x1, const void *x2)
 {
 	const brt_entry_t *bre1 = x1, *bre2 = x2;
 	const blkptr_t *bp1 = &bre1->bre_bp, *bp2 = &bre2->bre_bp;
 
 	return (TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
 	    DVA_GET_OFFSET(&bp2->blk_dva[0])));
 }
 
 /*
  * Record a pending BRT reference for bp in the pending tree of tx's TXG.
  * A new entry starts with bre_pcount of one; if the block already has an
  * entry in that tree, its bre_pcount is incremented instead.  The BRT vdev
  * is allocated on demand.
  */
 void
 brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	brt_entry_t *bre, *newbre;
 	avl_index_t where;
 	uint64_t txg;
 
 	txg = dmu_tx_get_txg(tx);
 	ASSERT3U(txg, !=, 0);
 
 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE);
 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
 
 	/* Allocate before taking the mutex; freed below if not needed. */
 	newbre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
 	newbre->bre_bp = *bp;
 	newbre->bre_count = 0;
 	newbre->bre_pcount = 1;
 
 	mutex_enter(&brtvd->bv_pending_lock);
 	bre = avl_find(pending_tree, newbre, &where);
 	if (bre == NULL) {
 		/* The tree now owns newbre. */
 		avl_insert(pending_tree, newbre, where);
 		newbre = NULL;
 	} else {
 		bre->bre_pcount++;
 	}
 	mutex_exit(&brtvd->bv_pending_lock);
 
 	if (newbre != NULL) {
 		/* An entry already existed; the spare one is not needed. */
 		ASSERT(bre != NULL);
 		ASSERT(bre != newbre);
 		kmem_cache_free(brt_entry_cache, newbre);
 	} else {
 		ASSERT0P(bre);
 
 		/* Prefetch BRT entry for the syncing context. */
 		brt_prefetch(brtvd, bp);
 	}
 }
 
 /*
  * Undo one brt_pending_add() for bp in the pending tree of tx's TXG.  The
  * entry's bre_pcount is decremented; when it reaches zero the entry is
  * removed from the tree and freed.
  */
 void
 brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	brt_entry_t *bre, bre_search;
 	uint64_t txg;
 
 	txg = dmu_tx_get_txg(tx);
 	ASSERT3U(txg, !=, 0);
 
 	uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]);
 	brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE);
 	ASSERT(brtvd != NULL);
 	avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK];
 
 	/* Only the block pointer is needed as the search key. */
 	bre_search.bre_bp = *bp;
 
 	mutex_enter(&brtvd->bv_pending_lock);
 	bre = avl_find(pending_tree, &bre_search, NULL);
 	ASSERT(bre != NULL);
 	ASSERT(bre->bre_pcount > 0);
 	bre->bre_pcount--;
 	if (bre->bre_pcount != 0) {
 		/* Still pending references left; keep the entry. */
 		bre = NULL;
 	} else {
 		avl_remove(pending_tree, bre);
 	}
 	mutex_exit(&brtvd->bv_pending_lock);
 
 	/* Free outside of the mutex. */
 	if (bre != NULL)
 		kmem_cache_free(brt_entry_cache, bre);
 }
 
 /*
  * Apply the pending references collected for txg on one BRT vdev.
  *
  * The pending tree of the TXG is swapped into bv_tree.  Entries for blocks
  * with the DEDUP bit are handed to the DDT where possible; for the rest the
  * current on-disk reference count is looked up.  Finally the pending counts
  * are converted into proper references.
  */
 static void
 brt_pending_apply_vdev(spa_t *spa, brt_vdev_t *brtvd, uint64_t txg)
 {
 	brt_entry_t *bre, *nbre;
 
 	/*
 	 * We are in syncing context, so no other bv_pending_tree accesses
 	 * are possible for the TXG.  So we don't need bv_pending_lock.
 	 */
 	ASSERT(avl_is_empty(&brtvd->bv_tree));
 	avl_swap(&brtvd->bv_tree, &brtvd->bv_pending_tree[txg & TXG_MASK]);
 
 	/* Fetch the successor first: bre may be removed from the tree. */
 	for (bre = avl_first(&brtvd->bv_tree); bre; bre = nbre) {
 		nbre = AVL_NEXT(&brtvd->bv_tree, bre);
 
 		/*
 		 * If the block has DEDUP bit set, it means that it
 		 * already exists in the DEDUP table, so we can just
 		 * use that instead of creating new entry in the BRT.
 		 */
 		if (BP_GET_DEDUP(&bre->bre_bp)) {
 			while (bre->bre_pcount > 0) {
 				if (!ddt_addref(spa, &bre->bre_bp))
 					break;
 				bre->bre_pcount--;
 			}
 			/* All references went to the DDT; drop the entry. */
 			if (bre->bre_pcount == 0) {
 				avl_remove(&brtvd->bv_tree, bre);
 				kmem_cache_free(brt_entry_cache, bre);
 				continue;
 			}
 		}
 
 		/*
 		 * Unless we know that the block is definitely not in ZAP,
 		 * try to get its reference count from there.
 		 */
 		uint64_t off = BRE_OFFSET(bre);
 		if (brtvd->bv_mos_entries != 0 &&
 		    brt_vdev_lookup(spa, brtvd, off)) {
 			int error = zap_lookup_uint64_by_dnode(
 			    brtvd->bv_mos_entries_dnode, &off,
 			    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
 			    &bre->bre_count);
 			if (error == 0) {
 				BRTSTAT_BUMP(brt_addref_entry_on_disk);
 			} else {
 				ASSERT3U(error, ==, ENOENT);
 				BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
 			}
 		}
 	}
 
 	/*
 	 * If all the cloned blocks we had were handled by DDT, we don't need
 	 * to initiate the vdev.
 	 */
 	if (avl_is_empty(&brtvd->bv_tree))
 		return;
 
 	if (!brtvd->bv_initiated) {
 		rw_enter(&brtvd->bv_lock, RW_WRITER);
 		brt_vdev_realloc(spa, brtvd);
 		rw_exit(&brtvd->bv_lock);
 	}
 
 	/*
 	 * Convert pending references into proper ones.  This has to be a
 	 * separate loop, since entcount modifications would cause false
 	 * positives for brt_vdev_lookup() on following iterations.
 	 */
 	for (bre = avl_first(&brtvd->bv_tree); bre;
 	    bre = AVL_NEXT(&brtvd->bv_tree, bre)) {
 		/* brt_vdev_addref() must see bre_count before it is bumped. */
 		brt_vdev_addref(spa, brtvd, bre,
 		    bp_get_dsize(spa, &bre->bre_bp), bre->bre_pcount);
 		bre->bre_count += bre->bre_pcount;
 	}
 }
 
 /*
  * Apply the pending references of txg on every BRT vdev of the pool.
  * spa_brt_lock is held only while the vdev array is accessed and is dropped
  * around the per-vdev work.
  */
 void
 brt_pending_apply(spa_t *spa, uint64_t txg)
 {
 
 	brt_rlock(spa);
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		brt_unlock(spa);
 
 		brt_pending_apply_vdev(spa, brtvd, txg);
 
 		/* Re-take the lock before the loop condition reads nvdevs. */
 		brt_rlock(spa);
 	}
 	brt_unlock(spa);
 }
 
 /*
  * Write one in-memory BRT entry to the entries ZAP behind dn:
  *  - nothing is written when the entry saw no net change (bre_pcount == 0);
  *  - the ZAP entry is removed when the reference count dropped to zero;
  *  - otherwise the ZAP entry is created or updated with the new count.
  */
 static void
 brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
 {
 	uint64_t key = BRE_OFFSET(bre);
 
 	/* The net change is zero, nothing to do in ZAP. */
 	if (bre->bre_pcount == 0)
 		return;
 
 	if (bre->bre_count == 0) {
 		/* The entry may never have reached the disk, hence ENOENT. */
 		int error = zap_remove_uint64_by_dnode(dn, &key,
 		    BRT_KEY_WORDS, tx);
 		VERIFY(error == 0 || error == ENOENT);
 		return;
 	}
 
 	VERIFY0(zap_update_uint64_by_dnode(dn, &key,
 	    BRT_KEY_WORDS, 1, sizeof (bre->bre_count),
 	    &bre->bre_count, tx));
 }
 
 /*
  * Write out every dirty BRT vdev of the pool.  For each one the MOS objects
  * are created if missing, all in-memory entries are synced to the ZAP and
  * freed, and then the vdev is either destroyed (no references left) or its
  * counters are written by brt_vdev_sync().  spa_brt_lock is held only while
  * the vdev array is accessed.
  */
 static void
 brt_sync_table(spa_t *spa, dmu_tx_t *tx)
 {
 	brt_entry_t *bre;
 
 	brt_rlock(spa);
 	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		brt_unlock(spa);
 
 		/* Clean vdevs must have nothing cached to write. */
 		if (!brtvd->bv_meta_dirty) {
 			ASSERT(!brtvd->bv_entcount_dirty);
 			ASSERT0(avl_numnodes(&brtvd->bv_tree));
 			brt_rlock(spa);
 			continue;
 		}
 
 		ASSERT(!brtvd->bv_entcount_dirty ||
 		    avl_numnodes(&brtvd->bv_tree) != 0);
 
 		/* First references on this vdev: create its MOS objects. */
 		if (brtvd->bv_mos_brtvdev == 0)
 			brt_vdev_create(spa, brtvd, tx);
 
 		/* Empty the tree, syncing and freeing each entry. */
 		void *c = NULL;
 		while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
 			brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx);
 			kmem_cache_free(brt_entry_cache, bre);
 		}
 
 #ifdef ZFS_DEBUG
 		if (zfs_flags & ZFS_DEBUG_BRT)
 			brt_vdev_dump(brtvd);
 #endif
 		if (brtvd->bv_totalcount == 0)
 			brt_vdev_destroy(spa, brtvd, tx);
 		else
 			brt_vdev_sync(spa, brtvd, tx);
 		brt_rlock(spa);
 	}
 	brt_unlock(spa);
 }
 
 /*
  * Sync the BRT for the given (currently syncing) txg.  A transaction is
  * created only when at least one BRT vdev is dirty; otherwise the function
  * returns without touching the MOS.
  */
 void
 brt_sync(spa_t *spa, uint64_t txg)
 {
 	dmu_tx_t *tx;
 	uint64_t vdevid;
 
 	ASSERT3U(spa_syncing_txg(spa), ==, txg);
 
 	/* Find the first dirty vdev, if there is one. */
 	brt_rlock(spa);
 	vdevid = 0;
 	while (vdevid < spa->spa_brt_nvdevs &&
 	    !spa->spa_brt_vdevs[vdevid]->bv_meta_dirty)
 		vdevid++;
 	const int all_clean = (vdevid >= spa->spa_brt_nvdevs);
 	brt_unlock(spa);
 
 	if (all_clean)
 		return;
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	brt_sync_table(spa, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Initialize the per-pool BRT state: the lock, an empty vdev array and an
  * unset (zero) range size.
  */
 static void
 brt_alloc(spa_t *spa)
 {
 	rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL);
 	spa->spa_brt_vdevs = NULL;
 	spa->spa_brt_nvdevs = 0;
 	spa->spa_brt_rangesize = 0;
 }
 
 /*
  * Set up the BRT state of a pool that has no on-disk BRT to load, using the
  * default range size.
  */
 void
 brt_create(spa_t *spa)
 {
 	brt_alloc(spa);
 	spa->spa_brt_rangesize = BRT_RANGESIZE;
 }
 
 /*
  * Set up the BRT state of a pool and load every BRT vdev that has an entry
  * in the pool directory.  The vdev array is allocated only when at least
  * one such entry is found.  If no vdev supplied a range size, the default
  * is used.
  *
  * Returns 0 on success, or the first error from zap_lookup() or
  * brt_vdev_load(), in which case loading stops at that vdev.
  */
 int
 brt_load(spa_t *spa)
 {
 	int error = 0;
 
 	brt_alloc(spa);
 	brt_wlock(spa);
 	for (uint64_t vdevid = 0; vdevid < spa->spa_root_vdev->vdev_children;
 	    vdevid++) {
 		char name[64];
 		uint64_t mos_brtvdev;
 
 		/* Look if this vdev had active block cloning. */
 		snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
 		    (u_longlong_t)vdevid);
 		error = zap_lookup(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1,
 		    &mos_brtvdev);
 		/* No directory entry: this vdev has no BRT, not an error. */
 		if (error == ENOENT) {
 			error = 0;
 			continue;
 		}
 		if (error != 0)
 			break;
 
 		/* If it did, then allocate them all and load this one. */
 		brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children);
 		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		rw_enter(&brtvd->bv_lock, RW_WRITER);
 		brtvd->bv_mos_brtvdev = mos_brtvdev;
 		error = brt_vdev_load(spa, brtvd);
 		rw_exit(&brtvd->bv_lock);
 		if (error != 0)
 			break;
 	}
 
 	if (spa->spa_brt_rangesize == 0)
 		spa->spa_brt_rangesize = BRT_RANGESIZE;
 	brt_unlock(spa);
 	return (error);
 }
 
 /*
  * Tear down the per-pool BRT state set up by brt_create() or brt_load().
  * A zero range size means the state was never set up (or was already torn
  * down), in which case this is a no-op.
  */
 void
 brt_unload(spa_t *spa)
 {
 	if (spa->spa_brt_rangesize == 0)
 		return;
 	brt_vdevs_free(spa);
 	rw_destroy(&spa->spa_brt_lock);
 	/* Mark the state as torn down so a second call does nothing. */
 	spa->spa_brt_rangesize = 0;
 }
 
-/* BEGIN CSTYLED */
 /* Read-write (ZMOD_RW) module parameters for the BRT tunables above. */
 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
 	"Enable prefetching of BRT ZAP entries");
 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
 	"BRT ZAP leaf blockshift");
 ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
 	"BRT ZAP indirect blockshift");
-/* END CSTYLED */
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
index 9c52083603f1..bff2b6c21f44 100644
--- a/module/zfs/btree.c
+++ b/module/zfs/btree.c
@@ -1,2215 +1,2213 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2019 by Delphix. All rights reserved.
  */
 
 #include	<sys/btree.h>
 #include	<sys/bitops.h>
 #include	<sys/zfs_context.h>
 
 /*
  * Cache for leaf nodes of the default size (BTREE_LEAF_SIZE).  Created in
  * zfs_btree_init() and destroyed in zfs_btree_fini().
  */
 kmem_cache_t *zfs_btree_leaf_cache;
 
 /*
  * Control the extent of the verification that occurs when zfs_btree_verify is
  * called. Primarily used for debugging when extending the btree logic and
  * functionality. As the intensity is increased, new verification steps are
  * added. These steps are cumulative; intensity = 3 includes the intensity = 1
  * and intensity = 2 steps as well.
  *
  * Intensity 1: Verify that the tree's height is consistent throughout.
  * Intensity 2: Verify that a core node's children's parent pointers point
  * to the core node.
  * Intensity 3: Verify that the total number of elements in the tree matches the
  * sum of the number of elements in each node. Also verifies that each node's
  * count obeys the invariants (less than or equal to maximum value, greater than
  * or equal to half the maximum minus one).
  * Intensity 4: Verify that each element compares less than the element
  * immediately after it and greater than the one immediately before it using the
  * comparator function. For core nodes, also checks that each element is greater
  * than the last element in the first of the two nodes it separates, and less
  * than the first element in the second of the two nodes.
  * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
  * of each node is poisoned appropriately. Note that poisoning always occurs if
  * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
  * operation.
  *
  * Intensity 4 and 5 are particularly expensive to perform; the previous levels
  * are a few memory operations per node, while these levels require multiple
  * operations per element. In addition, when creating large btrees, these
  * operations are called at every step, resulting in extremely slow operation
  * (while the asymptotic complexity of the other steps is the same, the
  * importance of the constant factors cannot be denied).
  */
 uint_t zfs_btree_verify_intensity = 0;
 
 /*
  * Convenience functions to silence warnings from memcpy/memmove's
  * return values and change argument order to src, dest.
  */
 /* Copy size bytes from src to dest using memcpy(); note the argument order. */
 static void
 bcpy(const void *src, void *dest, size_t size)
 {
 	(void) memcpy(dest, src, size);
 }
 
 /*
  * Move size bytes from src to dest using memmove(), so the two regions may
  * overlap; note the argument order.
  */
 static void
 bmov(const void *src, void *dest, size_t size)
 {
 	(void) memmove(dest, src, size);
 }
 
 static boolean_t
 zfs_btree_is_core(struct zfs_btree_hdr *hdr)
 {
 	return (hdr->bth_first == -1);
 }
 
 /*
  * Pointer-sized pattern written into unused child slots of core nodes when
  * poisoning; the narrower value is used when _ILP32 is defined.
  */
 #ifdef _ILP32
 #define	BTREE_POISON 0xabadb10c
 #else
 #define	BTREE_POISON 0xabadb10cdeadbeef
 #endif
 
 /*
  * Poison every unused slot of the node described by hdr.  This does nothing
  * unless ZFS_DEBUG is defined.
  *
  * Core node: child slots bth_count + 1 through BTREE_CORE_ELEMS are set to
  * BTREE_POISON, and the element slots from bth_count onward are filled with
  * 0x0f bytes.
  * Leaf node: the bytes before the first used element and the bytes after
  * the last used element, up to the end of the leaf, are filled with 0x0f.
  */
 static void
 zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 #ifdef ZFS_DEBUG
 	size_t size = tree->bt_elem_size;
 	if (zfs_btree_is_core(hdr)) {
 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
 		    i++) {
 			node->btc_children[i] =
 			    (zfs_btree_hdr_t *)BTREE_POISON;
 		}
 		(void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
 		    (BTREE_CORE_ELEMS - hdr->bth_count) * size);
 	} else {
 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 		/* Unused space in front of the first element. */
 		(void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size);
 		/* Unused space from the last element to the end of the leaf. */
 		(void) memset(leaf->btl_elems +
 		    (hdr->bth_first + hdr->bth_count) * size, 0x0f,
 		    tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) -
 		    (hdr->bth_first + hdr->bth_count) * size);
 	}
 #endif
 }
 
 /*
  * Poison count slots of the node starting at index idx.  This does nothing
  * unless ZFS_DEBUG is defined.
  *
  * Core node: element slots idx .. idx + count - 1 are filled with 0x0f and
  * child slots idx + 1 .. idx + count are set to BTREE_POISON.
  * Leaf node: idx is relative to bth_first; the count element slots starting
  * there are filled with 0x0f.
  */
 static inline void
 zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
     uint32_t idx, uint32_t count)
 {
 #ifdef ZFS_DEBUG
 	size_t size = tree->bt_elem_size;
 	if (zfs_btree_is_core(hdr)) {
 		ASSERT3U(idx, >=, hdr->bth_count);
 		ASSERT3U(idx, <=, BTREE_CORE_ELEMS);
 		ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS);
 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 		/* The child to the right of element idx + i - 1. */
 		for (uint32_t i = 1; i <= count; i++) {
 			node->btc_children[idx + i] =
 			    (zfs_btree_hdr_t *)BTREE_POISON;
 		}
 		(void) memset(node->btc_elems + idx * size, 0x0f, count * size);
 	} else {
 		ASSERT3U(idx, <=, tree->bt_leaf_cap);
 		ASSERT3U(idx + count, <=, tree->bt_leaf_cap);
 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 		(void) memset(leaf->btl_elems +
 		    (hdr->bth_first + idx) * size, 0x0f, count * size);
 	}
 #endif
 }
 
 /*
  * Verify that slot idx of the node still holds the poison pattern written
  * by the poisoning routines.  This does nothing unless ZFS_DEBUG is defined.
  *
  * Core node: child slot idx + 1 must equal BTREE_POISON and every byte of
  * element slot idx must be 0x0f.
  * Leaf node: idx is relative to bth_first; every byte of that element slot
  * must be 0x0f.  Slots that would lie at or beyond the leaf capacity are
  * not checked.
  */
 static inline void
 zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
     uint32_t idx)
 {
 #ifdef ZFS_DEBUG
 	size_t size = tree->bt_elem_size;
 	if (zfs_btree_is_core(hdr)) {
 		ASSERT3U(idx, <, BTREE_CORE_ELEMS);
 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 		zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
 		VERIFY3P(node->btc_children[idx + 1], ==, cval);
 		for (size_t i = 0; i < size; i++)
 			VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f);
 	} else  {
 		ASSERT3U(idx, <, tree->bt_leaf_cap);
 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 		/* bth_first + idx is past the last slot; nothing to check. */
 		if (idx >= tree->bt_leaf_cap - hdr->bth_first)
 			return;
 		for (size_t i = 0; i < size; i++) {
 			VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx)
 			    * size + i], ==, 0x0f);
 		}
 	}
 #endif
 }
 
 /*
  * Create the kmem cache used for leaf nodes of the default size
  * (BTREE_LEAF_SIZE).
  */
 void
 zfs_btree_init(void)
 {
 	zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
 	    BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 /* Destroy the leaf-node kmem cache created by zfs_btree_init(). */
 void
 zfs_btree_fini(void)
 {
 	kmem_cache_destroy(zfs_btree_leaf_cache);
 }
 
 /*
  * Allocate memory for one leaf node of the tree.  Leaves of the default
  * size (BTREE_LEAF_SIZE) come from zfs_btree_leaf_cache; any other leaf
  * size is allocated with kmem_alloc().  Both allocations use KM_SLEEP.
  */
 static void *
 zfs_btree_leaf_alloc(zfs_btree_t *tree)
 {
 	if (tree->bt_leaf_size != BTREE_LEAF_SIZE)
 		return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP));
 
 	return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP));
 }
 
 /*
  * Release a leaf node obtained from zfs_btree_leaf_alloc().  Leaves of the
  * default size (BTREE_LEAF_SIZE) go back to zfs_btree_leaf_cache; any other
  * leaf size is released with kmem_free().
  *
  * The calls are plain statements rather than "return (expr)": ISO C does
  * not allow a return statement with an expression in a function returning
  * void.
  */
 static void
 zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
 {
 	if (tree->bt_leaf_size == BTREE_LEAF_SIZE)
 		kmem_cache_free(zfs_btree_leaf_cache, ptr);
 	else
 		kmem_free(ptr, tree->bt_leaf_size);
 }
 
 /*
  * Initialize tree as an empty btree whose elements are size bytes each,
  * using the default leaf size (BTREE_LEAF_SIZE).  See
  * zfs_btree_create_custom() for the meaning of the other arguments.
  */
 void
 zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
     bt_find_in_buf_f bt_find_in_buf, size_t size)
 {
 	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
 	    BTREE_LEAF_SIZE);
 }
 
 /* Forward declaration; used as the default search routine below. */
 static void *
 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
     const void *value, zfs_btree_index_t *where);
 
 /*
  * Initialize tree as an empty btree.
  *
  * compar		comparison function used to order elements
  * bt_find_in_buf	search routine for a node's element array; if NULL,
  *			the generic binary search zfs_btree_find_in_buf() is
  *			used
  * size			size of one element in bytes
  * lsize		size of one leaf node in bytes, header included
  *
  * The leaf capacity is the number of elements that fit after the leaf
  * header, aligned down to a multiple of 2.  An element may occupy at most
  * half of that space (asserted).
  */
 void
 zfs_btree_create_custom(zfs_btree_t *tree,
     int (*compar) (const void *, const void *),
     bt_find_in_buf_f bt_find_in_buf,
     size_t size, size_t lsize)
 {
 	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
 
 	ASSERT3U(size, <=, esize / 2);
 	memset(tree, 0, sizeof (*tree));
 
 	/* A height of -1 marks a tree with no nodes at all. */
 	tree->bt_height = -1;
 	tree->bt_bulk = NULL;
 
 	tree->bt_elem_size = size;
 	tree->bt_leaf_size = lsize;
 	tree->bt_leaf_cap = P2ALIGN_TYPED(esize / size, 2, size_t);
 
 	tree->bt_compar = compar;
 	if (bt_find_in_buf != NULL)
 		tree->bt_find_in_buf = bt_find_in_buf;
 	else
 		tree->bt_find_in_buf = zfs_btree_find_in_buf;
 }
 
 /*
  * Binary search for value in buf, an array of nelems elements ordered by
  * the tree's comparator.
  *
  * On a match, returns a pointer to the matching element and sets
  * where->bti_offset to its index with bti_before = B_FALSE.  Otherwise
  * returns NULL and sets where->bti_offset to the index the value would be
  * inserted at, with bti_before = B_TRUE.  where->bti_node is not touched.
  */
 static void *
 zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
     const void *value, zfs_btree_index_t *where)
 {
 	uint32_t lo = 0;
 	uint32_t hi = nelems;
 
 	/* Invariant: the value, if present, lies in [lo, hi). */
 	while (lo < hi) {
 		uint32_t mid = (lo + hi) / 2;
 		uint8_t *elem = buf + mid * tree->bt_elem_size;
 		int cmp = tree->bt_compar(elem, value);
 
 		if (cmp == 0) {
 			where->bti_offset = mid;
 			where->bti_before = B_FALSE;
 			return (elem);
 		}
 		if (cmp < 0)
 			lo = mid + 1;
 		else
 			hi = mid;
 	}
 
 	where->bti_offset = hi;
 	where->bti_before = B_TRUE;
 	return (NULL);
 }
 
 /*
  * Find the given value in the tree. where may be passed as null to use as a
  * membership test or if the btree is being used as a map.
  *
  * Returns a pointer to the matching element, or NULL if there is none.  When
  * where is non-NULL it is filled in with the node and offset of the match
  * (bti_before == B_FALSE), or with the node and offset the value would be
  * inserted before (bti_before == B_TRUE).  For an empty tree bti_node is
  * NULL, bti_offset is 0 and bti_before is B_TRUE, so that every path leaves
  * *where fully initialized.
  */
 void *
 zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
 {
 	if (tree->bt_height == -1) {
 		if (where != NULL) {
 			where->bti_node = NULL;
 			where->bti_offset = 0;
 			/* Not found: report an insertion point like below. */
 			where->bti_before = B_TRUE;
 		}
 		ASSERT0(tree->bt_num_elems);
 		return (NULL);
 	}
 
 	/*
 	 * If we're in bulk-insert mode, we check the last spot in the tree
 	 * and the last leaf in the tree before doing the normal search,
 	 * because for most workloads the vast majority of finds in
 	 * bulk-insert mode are to insert new elements.
 	 */
 	zfs_btree_index_t idx;
 	size_t size = tree->bt_elem_size;
 	if (tree->bt_bulk != NULL) {
 		zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
 		/* Compare against the last element of the last leaf. */
 		int comp = tree->bt_compar(last_leaf->btl_elems +
 		    (last_leaf->btl_hdr.bth_first +
 		    last_leaf->btl_hdr.bth_count - 1) * size, value);
 		if (comp < 0) {
 			/*
 			 * If what they're looking for is after the last
 			 * element, it's not in the tree.
 			 */
 			if (where != NULL) {
 				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
 				where->bti_offset =
 				    last_leaf->btl_hdr.bth_count;
 				where->bti_before = B_TRUE;
 			}
 			return (NULL);
 		} else if (comp == 0) {
 			if (where != NULL) {
 				where->bti_node = (zfs_btree_hdr_t *)last_leaf;
 				where->bti_offset =
 				    last_leaf->btl_hdr.bth_count - 1;
 				where->bti_before = B_FALSE;
 			}
 			return (last_leaf->btl_elems +
 			    (last_leaf->btl_hdr.bth_first +
 			    last_leaf->btl_hdr.bth_count - 1) * size);
 		}
 		if (tree->bt_compar(last_leaf->btl_elems +
 		    last_leaf->btl_hdr.bth_first * size, value) <= 0) {
 			/*
 			 * If what they're looking for is after the first
 			 * element in the last leaf, it's in the last leaf or
 			 * it's not in the tree.
 			 */
 			void *d = tree->bt_find_in_buf(tree,
 			    last_leaf->btl_elems +
 			    last_leaf->btl_hdr.bth_first * size,
 			    last_leaf->btl_hdr.bth_count, value, &idx);
 
 			if (where != NULL) {
 				idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
 				*where = idx;
 			}
 			return (d);
 		}
 	}
 
 	zfs_btree_core_t *node = NULL;
 	uint32_t child = 0;
 	uint32_t depth = 0;
 
 	/*
 	 * Iterate down the tree, finding which child the value should be in
 	 * by comparing with the separators.
 	 */
 	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
 	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
 		ASSERT3P(node, !=, NULL);
 		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
 		    node->btc_hdr.bth_count, value, &idx);
 		EQUIV(d != NULL, !idx.bti_before);
 		if (d != NULL) {
 			/* The value is one of this core node's separators. */
 			if (where != NULL) {
 				idx.bti_node = (zfs_btree_hdr_t *)node;
 				*where = idx;
 			}
 			return (d);
 		}
 		ASSERT(idx.bti_before);
 		child = idx.bti_offset;
 	}
 
 	/*
 	 * The value is in this leaf, or it would be if it were in the
 	 * tree. Find its proper location and return it.
 	 */
 	zfs_btree_leaf_t *leaf = (depth == 0 ?
 	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
 	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
 	    leaf->btl_hdr.bth_first * size,
 	    leaf->btl_hdr.bth_count, value, &idx);
 
 	if (where != NULL) {
 		idx.bti_node = (zfs_btree_hdr_t *)leaf;
 		*where = idx;
 	}
 
 	return (d);
 }
 
 /*
  * To explain the following functions, it is useful to understand the four
  * kinds of shifts used in btree operation. First, a shift is a movement of
  * elements within a node. It is used to create gaps for inserting new
  * elements and children, or cover gaps created when things are removed. A
  * shift has two fundamental properties, each of which can be one of two
  * values, making four types of shifts.  There is the direction of the shift
  * (left or right) and the shape of the shift (parallelogram or isosceles
  * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
  * applies to shifts of core nodes.
  *
  * The names derive from the following imagining of the layout of a node:
  *
  *  Elements:       *   *   *   *   *   *   *   ...   *   *   *
  *  Children:     *   *   *   *   *   *   *   *   ...   *   *   *
  *
  * This layout follows from the fact that the elements act as separators
  * between pairs of children, and that children root subtrees "below" the
  * current node. A left and right shift are fairly self-explanatory; a left
  * shift moves things to the left, while a right shift moves things to the
  * right. A parallelogram shift is a shift with the same number of elements
  * and children being moved, while a trapezoid shift is a shift that moves one
  * more child than elements. An example follows:
  *
  * A parallelogram shift could contain the following:
  *      _______________
  *      \*   *   *   * \ *   *   *   ...   *   *   *
  *     * \ *   *   *   *\  *   *   *   ...   *   *   *
  *        ---------------
  * A trapezoid shift could contain the following:
  *          ___________
  *       * / *   *   * \ *   *   *   ...   *   *   *
  *     *  / *  *   *   *\  *   *   *   ...   *   *   *
  *        ---------------
  *
  * Note that a parallelogram shift is always shaped like a "left-leaning"
  * parallelogram, where the starting index of the children being moved is
  * always one higher than the starting index of the elements being moved. No
  * "right-leaning" parallelogram shifts are needed (shifts where the starting
  * element index and starting child index being moved are the same) to achieve
  * any btree operations, so we ignore them.
  */
 
 /* Shape of a core-node shift or transfer; see the comment above. */
 enum bt_shift_shape {
 	/* One more child than elements; children start at the same index. */
 	BSS_TRAPEZOID,
 	/* As many children as elements; children start one index higher. */
 	BSS_PARALLELOGRAM
 };
 
 /* Direction of a shift within a node. */
 enum bt_shift_direction {
 	/* Toward lower indices. */
 	BSD_LEFT,
 	/* Toward higher indices. */
 	BSD_RIGHT
 };
 
 /*
  * Shift elements and children in the provided core node by off spots.  The
  * first element moved is idx, and count elements are moved.  dir selects
  * whether they move toward lower (BSD_LEFT) or higher (BSD_RIGHT) indices.
  *
  * shape selects which children accompany the elements: a trapezoid moves
  * count + 1 children starting at child idx, a parallelogram moves count
  * children starting at child idx + 1.
  */
 static inline void
 bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
     uint32_t count, uint32_t off, enum bt_shift_shape shape,
     enum bt_shift_direction dir)
 {
 	size_t size = tree->bt_elem_size;
 	ASSERT(zfs_btree_is_core(&node->btc_hdr));
 
 	/* Move the elements; source and destination may overlap. */
 	uint8_t *e_src = node->btc_elems + idx * size;
 	uint8_t *e_dst;
 	if (dir == BSD_LEFT)
 		e_dst = e_src - off * size;
 	else
 		e_dst = e_src + off * size;
 	bmov(e_src, e_dst, count * size);
 
 	/* Work out which children go along with them. */
 	zfs_btree_hdr_t **c_src = node->btc_children + idx;
 	uint32_t c_count = count;
 	if (shape == BSS_TRAPEZOID)
 		c_count++;
 	else
 		c_src++;
 
 	zfs_btree_hdr_t **c_dst;
 	if (dir == BSD_LEFT)
 		c_dst = c_src - off;
 	else
 		c_dst = c_src + off;
 	bmov(c_src, c_dst, c_count * sizeof (*c_src));
 }
 
 /*
  * Shift elements and children in the provided core node left by one spot.
  * The first element moved is idx, and count elements are moved. The
  * shape of the shift is determined by shape (see bt_shift_core()).
  */
 static inline void
 bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
     uint32_t count, enum bt_shift_shape shape)
 {
 	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
 }
 
 /*
  * Shift elements and children in the provided core node right by one spot.
  * The first element moved is idx, and count elements are moved. The
  * shape of the shift is determined by shape (see bt_shift_core()).
  */
 static inline void
 bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx,
     uint32_t count, enum bt_shift_shape shape)
 {
 	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
 }
 
 /*
  * Shift elements in the provided leaf node by off spots.  The first element
  * moved is idx (relative to the leaf's bth_first), and count elements are
  * moved.  The direction is determined by dir.  Neither bth_first nor
  * bth_count is modified here.
  */
 static inline void
 bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx,
     uint32_t count, uint32_t off, enum bt_shift_direction dir)
 {
 	size_t size = tree->bt_elem_size;
 	zfs_btree_hdr_t *hdr = &node->btl_hdr;
 	ASSERT(!zfs_btree_is_core(hdr));
 
 	if (count != 0) {
 		uint8_t *src = node->btl_elems +
 		    (hdr->bth_first + idx) * size;
 		uint8_t *dst;
 
 		if (dir == BSD_LEFT)
 			dst = src - off * size;
 		else
 			dst = src + off * size;
 		/* The two ranges may overlap, hence the memmove wrapper. */
 		bmov(src, dst, count * size);
 	}
 }
 
 /*
  * Grow leaf for n new elements before idx.
  *
  * On return bth_count has grown by n and slots idx .. idx + n - 1 (relative
  * to the possibly updated bth_first) are free for the caller to fill.  The
  * gap is opened by moving whichever side has room: the elements before idx
  * to the left, the elements from idx onward to the right, or both.
  */
 static void
 bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
     uint32_t n)
 {
 	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
 	ASSERT(!zfs_btree_is_core(hdr));
 	ASSERT3U(idx, <=, hdr->bth_count);
 	uint32_t capacity = tree->bt_leaf_cap;
 	ASSERT3U(hdr->bth_count + n, <=, capacity);
 	/* cl / cr: is there room for n more slots on the left / right? */
 	boolean_t cl = (hdr->bth_first >= n);
 	boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity);
 
 	/* Prefer the side that requires moving fewer elements. */
 	if (cl && (!cr || idx <= hdr->bth_count / 2)) {
 		/* Grow left. */
 		hdr->bth_first -= n;
 		/* The old head now sits at index n; move it to index 0. */
 		bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT);
 	} else if (cr) {
 		/* Grow right. */
 		bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n,
 		    BSD_RIGHT);
 	} else {
 		/* Grow both ways. */
 		/*
 		 * fn is how far the head moves left; afterwards bth_first is
 		 * half of the slots that remain free once n are added.
 		 */
 		uint32_t fn = hdr->bth_first -
 		    (capacity - (hdr->bth_count + n)) / 2;
 		hdr->bth_first -= fn;
 		bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT);
 		bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx,
 		    n - fn, BSD_RIGHT);
 	}
 	hdr->bth_count += n;
 }
 
 /*
  * Shrink leaf by removing the n elements starting from idx.
  *
  * The gap is closed from whichever side is chosen by the position of idx:
  * either the elements before idx move right (and bth_first advances by n),
  * or the elements after the removed range move left.  The vacated slots are
  * poisoned (a no-op unless ZFS_DEBUG is defined) and bth_count drops by n.
  */
 static void
 bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx,
     uint32_t n)
 {
 	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
 	ASSERT(!zfs_btree_is_core(hdr));
 	ASSERT3U(idx, <=, hdr->bth_count);
 	ASSERT3U(idx + n, <=, hdr->bth_count);
 
 	if (idx <= (hdr->bth_count - n) / 2) {
 		/* Move the head right over the removed range. */
 		bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT);
 		/* Poison before bth_first moves; idx 0 is still the old head. */
 		zfs_btree_poison_node_at(tree, hdr, 0, n);
 		hdr->bth_first += n;
 	} else {
 		/* Move the tail left over the removed range. */
 		bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n,
 		    BSD_LEFT);
 		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n);
 	}
 	hdr->bth_count -= n;
 }
 
 /*
  * Copy count elements, and the children that go with them, from core node
  * source (starting at element sidx) into core node dest (starting at
  * element didx).  shape has the same meaning as in the shift logic: a
  * trapezoid copies count + 1 children starting at the same index as the
  * elements, a parallelogram copies count children starting one index
  * higher.  Neither node's bth_count is updated here.
  */
 static inline void
 bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx,
     uint32_t count, zfs_btree_core_t *dest, uint32_t didx,
     enum bt_shift_shape shape)
 {
 	size_t size = tree->bt_elem_size;
 	ASSERT(zfs_btree_is_core(&source->btc_hdr));
 	ASSERT(zfs_btree_is_core(&dest->btc_hdr));
 
 	/* Elements first. */
 	bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
 	    count * size);
 
 	/* Then the children, offset and counted according to the shape. */
 	uint32_t c_skip;
 	uint32_t c_count;
 	if (shape == BSS_TRAPEZOID) {
 		c_skip = 0;
 		c_count = count + 1;
 	} else {
 		c_skip = 1;
 		c_count = count;
 	}
 	bcpy(source->btc_children + sidx + c_skip,
 	    dest->btc_children + didx + c_skip,
 	    c_count * sizeof (*source->btc_children));
 }
 
 /*
  * Copy count elements from leaf source, starting at index sidx, into leaf
  * dest, starting at index didx.  Both indices are relative to the
  * respective leaf's bth_first.  Neither leaf's header is updated here.
  */
 static inline void
 bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx,
     uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx)
 {
 	size_t size = tree->bt_elem_size;
 	ASSERT(!zfs_btree_is_core(&source->btl_hdr));
 	ASSERT(!zfs_btree_is_core(&dest->btl_hdr));
 
 	uint8_t *from = source->btl_elems +
 	    (source->btl_hdr.bth_first + sidx) * size;
 	uint8_t *to = dest->btl_elems +
 	    (dest->btl_hdr.bth_first + didx) * size;
 
 	bcpy(from, to, count * size);
 }
 
 /*
  * Find the first element in the subtree rooted at hdr by following child 0
  * of every core node down to a leaf.  Returns a pointer to that leaf's
  * first element and, if where is non-null, stores the element's location
  * (the leaf, offset 0, bti_before = B_FALSE) there.
  */
 static void *
 zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
     zfs_btree_index_t *where)
 {
 	zfs_btree_hdr_t *node = hdr;
 
 	/* Descend along the leftmost edge of the subtree. */
 	while (zfs_btree_is_core(node))
 		node = ((zfs_btree_core_t *)node)->btc_children[0];
 
 	ASSERT(!zfs_btree_is_core(node));
 	if (where != NULL) {
 		where->bti_node = node;
 		where->bti_offset = 0;
 		where->bti_before = B_FALSE;
 	}
 
 	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
 	return (leaf->btl_elems + node->bth_first * tree->bt_elem_size);
 }
 
 /*
  * Insert an element and a child into a core node at the given offset.
  *
  * buf holds the element, which becomes element offset of parent; new_node
  * becomes child offset + 1.  The node must not be full (asserted), and
  * new_node's parent pointer must already reference parent (asserted).
  */
 static void
 zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
     uint32_t offset, zfs_btree_hdr_t *new_node, void *buf)
 {
 	size_t size = tree->bt_elem_size;
 	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
 	ASSERT3P(par_hdr, ==, new_node->bth_parent);
 	ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
 
 	/* Optionally check that the slot about to be used is still poisoned. */
 	if (zfs_btree_verify_intensity >= 5) {
 		zfs_btree_verify_poison_at(tree, par_hdr,
 		    par_hdr->bth_count);
 	}
 	/* Shift existing elements and children */
 	uint32_t count = par_hdr->bth_count - offset;
 	bt_shift_core_right(tree, parent, offset, count,
 	    BSS_PARALLELOGRAM);
 
 	/* Insert new values */
 	parent->btc_children[offset + 1] = new_node;
 	bcpy(buf, parent->btc_elems + offset * size, size);
 	par_hdr->bth_count++;
 }
 
 /*
  * Insert new_node into the parent of old_node directly after old_node, with
  * buf as the dividing element between the two.
  *
  * If old_node is the root, a new root is allocated and the tree grows by
  * one level.  If the parent is full, it is split and this function recurses
  * to insert the new core node into the grandparent.  In that case buf is
  * overwritten with the separator that is passed up, so it must point to
  * writable storage of at least one element.
  */
 static void
 zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
     zfs_btree_hdr_t *new_node, void *buf)
 {
 	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
 	size_t size = tree->bt_elem_size;
 	zfs_btree_core_t *parent = old_node->bth_parent;
 
 	/*
 	 * If this is the root node we were splitting, we create a new root
 	 * and increase the height of the tree.
 	 */
 	if (parent == NULL) {
 		ASSERT3P(old_node, ==, tree->bt_root);
 		tree->bt_num_nodes++;
 		zfs_btree_core_t *new_root =
 		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
 		    size, KM_SLEEP);
 		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
 		new_root_hdr->bth_parent = NULL;
 		/* bth_first == -1 is what marks a node as a core node. */
 		new_root_hdr->bth_first = -1;
 		new_root_hdr->bth_count = 1;
 
 		old_node->bth_parent = new_node->bth_parent = new_root;
 		new_root->btc_children[0] = old_node;
 		new_root->btc_children[1] = new_node;
 		bcpy(buf, new_root->btc_elems, size);
 
 		tree->bt_height++;
 		tree->bt_root = new_root_hdr;
 		zfs_btree_poison_node(tree, new_root_hdr);
 		return;
 	}
 
 	/*
 	 * Since we have the new separator, binary search for where to put
 	 * new_node.
 	 */
 	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
 	zfs_btree_index_t idx;
 	ASSERT(zfs_btree_is_core(par_hdr));
 	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
 	    par_hdr->bth_count, buf, &idx), ==, NULL);
 	ASSERT(idx.bti_before);
 	uint32_t offset = idx.bti_offset;
 	ASSERT3U(offset, <=, par_hdr->bth_count);
 	ASSERT3P(parent->btc_children[offset], ==, old_node);
 
 	/*
 	 * If the parent isn't full, shift things to accommodate our insertions
 	 * and return.
 	 */
 	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
 		zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
 		return;
 	}
 
 	/*
 	 * We need to split this core node into two. Currently there are
 	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
 	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
 	 * current node, and the others will be moved to the new core node.
 	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
 	 * will be used as the new separator in our parent, and the others
 	 * will be split among the two core nodes.
 	 *
 	 * Usually we will split the node in half evenly, with
 	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
 	 * instead move only about a quarter of the elements (and children) to
 	 * the new node. Since the average state after a long time is a 3/4
 	 * full node, shortcutting directly to that state improves efficiency.
 	 *
 	 * We do this in two stages: first we split into two nodes, and then we
 	 * reuse our existing logic to insert the new element and child.
 	 */
 	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
 	    2 : 4)) - 1, 2);
 	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
 	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
 	tree->bt_num_nodes++;
 	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
 	    BTREE_CORE_ELEMS * size, KM_SLEEP);
 	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
 	new_par_hdr->bth_parent = par_hdr->bth_parent;
 	new_par_hdr->bth_first = -1;
 	new_par_hdr->bth_count = move_count;
 	zfs_btree_poison_node(tree, new_par_hdr);
 
 	par_hdr->bth_count = keep_count;
 
 	/* Element keep_count is skipped here; it becomes the separator. */
 	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
 	    0, BSS_TRAPEZOID);
 
 	/* Store the new separator in a buffer. */
 	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
 	bcpy(parent->btc_elems + keep_count * size, tmp_buf,
 	    size);
 	zfs_btree_poison_node(tree, par_hdr);
 
 	if (offset < keep_count) {
 		/* Insert the new node into the left half */
 		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
 		    buf);
 
 		/*
 		 * Move the new separator to the existing buffer.
 		 */
 		bcpy(tmp_buf, buf, size);
 	} else if (offset > keep_count) {
 		/* Insert the new node into the right half */
 		new_node->bth_parent = new_parent;
 		zfs_btree_insert_core_impl(tree, new_parent,
 		    offset - keep_count - 1, new_node, buf);
 
 		/*
 		 * Move the new separator to the existing buffer.
 		 */
 		bcpy(tmp_buf, buf, size);
 	} else {
 		/*
 		 * Move the new separator into the right half, and replace it
 		 * with buf. We also need to shift back the elements in the
 		 * right half to accommodate new_node.
 		 */
 		bt_shift_core_right(tree, new_parent, 0, move_count,
 		    BSS_TRAPEZOID);
 		new_parent->btc_children[0] = new_node;
 		bcpy(tmp_buf, new_parent->btc_elems, size);
 		new_par_hdr->bth_count++;
 	}
 	kmem_free(tmp_buf, size);
 	zfs_btree_poison_node(tree, par_hdr);
 
 	/* Every child that ended up in the new node gets its new parent. */
 	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
 		new_parent->btc_children[i]->bth_parent = new_parent;
 
 	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
 		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
 
 	/*
 	 * Now that the node is split, we need to insert the new node into its
 	 * parent. This may cause further splitting.
 	 */
 	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
 	    &new_parent->btc_hdr, buf);
 }
 
 /*
  * Insert an element into a leaf node at the given offset.
  *
  * The leaf must not be full (asserted).  A one-slot gap is opened at idx
  * with bt_grow_leaf(), which may change bth_first, and value is copied into
  * it.
  */
 static void
 zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
     uint32_t idx, const void *value)
 {
 	size_t size = tree->bt_elem_size;
 	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
 	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);
 
 	if (zfs_btree_verify_intensity >= 5) {
 		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
 		    leaf->btl_hdr.bth_count);
 	}
 
 	bt_grow_leaf(tree, leaf, idx, 1);
 	/* Computed after growing, since bth_first may have changed. */
 	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
 	bcpy(value, start, size);
 }
 
 /* Forward declaration; the definition is not in this part of the file. */
 static void
 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);
 
 /*
  * Helper function for inserting a new value into leaf at the given index.
  *
  * If the leaf has room the value is simply inserted.  Otherwise the leaf is
  * split in two, the value is placed in the appropriate half (or used as the
  * separator), and the new leaf is linked into the parent, which may in turn
  * split core nodes.
  */
 static void
 zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
     const void *value, uint32_t idx)
 {
 	size_t size = tree->bt_elem_size;
 	uint32_t capacity = tree->bt_leaf_cap;
 
 	/*
 	 * If the leaf isn't full, shift the elements after idx and insert
 	 * value.
 	 */
 	if (leaf->btl_hdr.bth_count != capacity) {
 		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
 		return;
 	}
 
 	/*
 	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
 	 * inserting, each is of size (capacity / 2).  If we are bulk
 	 * inserting, we move a quarter of the elements to the new node so
 	 * inserts into the old node don't cause immediate splitting but the
 	 * tree stays relatively dense. Since the average state after a long
 	 * time is a 3/4 full node, shortcutting directly to that state
 	 * improves efficiency.  At the end of the bulk insertion process
 	 * we'll need to go through and fix up any nodes (the last leaf and
 	 * its ancestors, potentially) that are below the minimum.
 	 *
 	 * In either case, we're left with one extra element. The leftover
 	 * element will become the new dividing element between the two nodes.
 	 */
 	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
 	uint32_t keep_count = capacity - move_count - 1;
 	ASSERT3U(keep_count, >=, 1);
 	/* If we insert on the left, move one more to keep leaves balanced. */
 	if (idx < keep_count) {
 		keep_count--;
 		move_count++;
 	}
 	tree->bt_num_nodes++;
 	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
 	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
 	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
 	/*
 	 * Start the new leaf's elements at capacity / 4 (or at 0 when bulk
 	 * loading), plus one extra slot in front when the pending insert
 	 * falls into the first half of the new leaf.
 	 */
 	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
 	    (idx >= keep_count && idx <= keep_count + move_count / 2);
 	new_hdr->bth_count = move_count;
 	zfs_btree_poison_node(tree, new_hdr);
 
 	/* The bulk-insert leaf is always the last one; keep it that way. */
 	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
 		tree->bt_bulk = new_leaf;
 
 	/* Copy the back part to the new leaf. */
 	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);
 
 	/* We store the new separator in a buffer we control for simplicity. */
 	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
 	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
 	    buf, size);
 
 	/* Drop the separator and the moved elements from the old leaf. */
 	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);
 
 	if (idx < keep_count) {
 		/* Insert into the existing leaf. */
 		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
 	} else if (idx > keep_count) {
 		/* Insert into the new leaf. */
 		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
 		    1, value);
 	} else {
 		/*
 		 * Insert planned separator into the new leaf, and use
 		 * the new value as the new separator.
 		 */
 		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
 		bcpy(value, buf, size);
 	}
 
 	/*
 	 * Now that the node is split, we need to insert the new node into its
 	 * parent. This may cause further splitting, but only of core nodes.
 	 */
 	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
 	    buf);
 	kmem_free(buf, size);
 }
 
 /*
  * Return the index of hdr in its parent's array of children.  The index is
  * found by searching the parent's separators for the first element of hdr;
  * that element must not itself be a separator (verified).  hdr must have a
  * parent.
  */
 static uint32_t
 zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 	zfs_btree_core_t *parent = hdr->bth_parent;
 	zfs_btree_index_t idx;
 	void *buf;
 
 	/* Locate the node's first element, whichever kind of node it is. */
 	if (!zfs_btree_is_core(hdr)) {
 		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
 		    hdr->bth_first * tree->bt_elem_size;
 	} else {
 		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
 	}
 
 	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
 	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
 	ASSERT(idx.bti_before);
 	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
 	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
 	return (idx.bti_offset);
 }
 
 /*
  * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
  * nodes may violate the invariant that non-root nodes must be at least half
  * full. All nodes violating this invariant should be the last node in their
  * particular level. To correct the invariant, we take values from their left
  * neighbor until they are half full. They must have a left neighbor at their
  * level because the last node at a level is not the first node unless it's
  * the root.
  *
  * The work happens in two phases: first the bulk leaf (tree->bt_bulk) is
  * topped up from the leaf to its left, then every core ancestor below the
  * root is topped up from its left sibling. tree->bt_bulk is NULL on return.
  */
 static void
 zfs_btree_bulk_finish(zfs_btree_t *tree)
 {
 	ASSERT3P(tree->bt_bulk, !=, NULL);
 	ASSERT3P(tree->bt_root, !=, NULL);
 	zfs_btree_leaf_t *leaf = tree->bt_bulk;
 	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
 	zfs_btree_core_t *parent = hdr->bth_parent;
 	size_t size = tree->bt_elem_size;
 	uint32_t capacity = tree->bt_leaf_cap;
 
 	/*
 	 * The invariant doesn't apply to the root node, if that's the only
 	 * node in the tree we're done.
 	 */
 	if (parent == NULL) {
 		tree->bt_bulk = NULL;
 		return;
 	}
 
 	/* First, take elements to rebalance the leaf node. */
 	if (hdr->bth_count < capacity / 2) {
 		/*
 		 * First, find the left neighbor. The simplest way to do this
 		 * is to call zfs_btree_prev twice; the first time finds some
 		 * ancestor of this node, and the second time finds the left
 		 * neighbor. The ancestor found is the lowest common ancestor
 		 * of leaf and the neighbor.
 		 */
 		zfs_btree_index_t idx = {
 			.bti_node = hdr,
 			.bti_offset = 0
 		};
 		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
 		ASSERT(zfs_btree_is_core(idx.bti_node));
 		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
 		uint32_t common_idx = idx.bti_offset;
 
 		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
 		ASSERT(!zfs_btree_is_core(idx.bti_node));
 		zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
 		zfs_btree_hdr_t *l_hdr = idx.bti_node;
 		/* Number of elements leaf needs to reach half capacity. */
 		uint32_t move_count = (capacity / 2) - hdr->bth_count;
 		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
 		    capacity / 2);
 
 		if (zfs_btree_verify_intensity >= 5) {
 			for (uint32_t i = 0; i < move_count; i++) {
 				zfs_btree_verify_poison_at(tree, hdr,
 				    leaf->btl_hdr.bth_count + i);
 			}
 		}
 
 		/* First, shift elements in leaf back. */
 		bt_grow_leaf(tree, leaf, 0, move_count);
 
 		/*
 		 * Next, move the separator from the common ancestor to leaf.
 		 * It lands in the last of the move_count slots just opened,
 		 * directly before leaf's original elements.
 		 */
 		uint8_t *separator = common->btc_elems + common_idx * size;
 		uint8_t *out = leaf->btl_elems +
 		    (hdr->bth_first + move_count - 1) * size;
 		bcpy(separator, out, size);
 
 		/*
 		 * Now we move elements from the tail of the left neighbor to
 		 * fill the remaining spots in leaf.
 		 */
 		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
 		    (move_count - 1), move_count - 1, leaf, 0);
 
 		/*
 		 * Finally, move the new last element in the left neighbor to
 		 * the separator.
 		 */
 		bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
 		    l_hdr->bth_count - move_count) * size, separator, size);
 
 		/* Adjust the node's counts, and we're done. */
 		bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count - move_count,
 		    move_count);
 
 		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
 		ASSERT3U(hdr->bth_count, >=, capacity / 2);
 	}
 
 	/*
 	 * Now we have to rebalance any ancestors of leaf that may also
 	 * violate the invariant.
 	 */
 	capacity = BTREE_CORE_ELEMS;
 	while (parent->btc_hdr.bth_parent != NULL) {
 		zfs_btree_core_t *cur = parent;
 		/* Note: this hdr shadows the leaf's hdr declared above. */
 		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
 		parent = hdr->bth_parent;
 		/*
 		 * If the invariant isn't violated, move on to the next
 		 * ancestor.
 		 */
 		if (hdr->bth_count >= capacity / 2)
 			continue;
 
 		/*
 		 * Because the smallest number of nodes we can move when
 		 * splitting is 2, we never need to worry about not having a
 		 * left sibling (a sibling is a neighbor with the same parent).
 		 */
 		uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
 		ASSERT3U(parent_idx, >, 0);
 		zfs_btree_core_t *l_neighbor =
 		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
 		uint32_t move_count = (capacity / 2) - hdr->bth_count;
 		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
 		    capacity / 2);
 
 		if (zfs_btree_verify_intensity >= 5) {
 			for (uint32_t i = 0; i < move_count; i++) {
 				zfs_btree_verify_poison_at(tree, hdr,
 				    hdr->bth_count + i);
 			}
 		}
 		/* First, shift things in the right node back. */
 		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
 		    BSS_TRAPEZOID, BSD_RIGHT);
 
 		/* Next, move the separator to the right node. */
 		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
 		    size);
 		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
 		bcpy(separator, e_out, size);
 
 		/*
 		 * Now, move elements and children from the left node to the
 		 * right.  We move one more child than elements.
 		 */
 		move_count--;
 		uint32_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
 		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
 		    BSS_TRAPEZOID);
 
 		/*
 		 * Finally, move the last element in the left node to the
 		 * separator's position.
 		 */
 		move_idx--;
 		bcpy(l_neighbor->btc_elems + move_idx * size, separator, size);
 
 		/*
 		 * move_count was decremented above, so "+ 1" accounts for the
 		 * element that travelled through the separator slot.
 		 */
 		l_neighbor->btc_hdr.bth_count -= move_count + 1;
 		hdr->bth_count += move_count + 1;
 
 		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
 		ASSERT3U(hdr->bth_count, >=, capacity / 2);
 
 		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
 
 		/*
 		 * Point every child of cur back at cur; this covers the
 		 * children that were just transferred from l_neighbor.
 		 */
 		for (uint32_t i = 0; i <= hdr->bth_count; i++)
 			cur->btc_children[i]->bth_parent = cur;
 	}
 
 	tree->bt_bulk = NULL;
 	zfs_btree_verify(tree);
 }
 
 /*
  * Insert a copy of value into the tree at the position described by where
  * (for example, the index filled in by an unsuccessful zfs_btree_find()).
  */
 void
 zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
     const zfs_btree_index_t *where)
 {
 	zfs_btree_index_t idx = {0};
 
 	/*
 	 * Bulk insert mode only survives insertions into the bulk leaf. Any
 	 * other target ends it, and because finishing bulk mode rebalances
 	 * nodes, the insertion point is looked up again afterwards.
 	 */
 	if (tree->bt_bulk != NULL &&
 	    where->bti_node != &tree->bt_bulk->btl_hdr) {
 		zfs_btree_bulk_finish(tree);
 		VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
 		where = &idx;
 	}
 
 	tree->bt_num_elems++;
 
 	zfs_btree_hdr_t *target = where->bti_node;
 	if (target == NULL) {
 		/*
 		 * Empty tree: allocate a leaf, make it the root, store the
 		 * value in it and start bulk insert mode on that leaf.
 		 */
 		ASSERT3U(tree->bt_num_elems, ==, 1);
 		ASSERT3S(tree->bt_height, ==, -1);
 		ASSERT3P(tree->bt_root, ==, NULL);
 		ASSERT0(where->bti_offset);
 
 		tree->bt_num_nodes++;
 		zfs_btree_leaf_t *root_leaf = zfs_btree_leaf_alloc(tree);
 		zfs_btree_hdr_t *root_hdr = &root_leaf->btl_hdr;
 		tree->bt_root = root_hdr;
 		tree->bt_height++;
 
 		root_hdr->bth_parent = NULL;
 		root_hdr->bth_first = 0;
 		root_hdr->bth_count = 0;
 		zfs_btree_poison_node(tree, root_hdr);
 
 		zfs_btree_insert_into_leaf(tree, root_leaf, value, 0);
 		tree->bt_bulk = root_leaf;
 	} else if (zfs_btree_is_core(target)) {
 		/*
 		 * Core node target. Shifting separators inside the node
 		 * would break the ordering invariants, so the new value
 		 * replaces the separator at bti_offset (bti_before makes no
 		 * difference here) and the displaced separator becomes the
 		 * first element of the subtree to its right.
 		 */
 		zfs_btree_core_t *core = (zfs_btree_core_t *)target;
 		uint32_t slot = where->bti_offset;
 		size_t elem_size = tree->bt_elem_size;
 		uint8_t *sep = core->btc_elems + slot * elem_size;
 		zfs_btree_hdr_t *right_subtree = core->btc_children[slot + 1];
 
 		/* Stash the old separator, then overwrite it in place. */
 		uint8_t *displaced = kmem_alloc(elem_size, KM_SLEEP);
 		bcpy(sep, displaced, elem_size);
 		bcpy(value, sep, elem_size);
 
 		/* Locate the leftmost leaf slot of the right subtree. */
 		zfs_btree_index_t new_idx;
 		VERIFY3P(zfs_btree_first_helper(tree, right_subtree,
 		    &new_idx), !=, NULL);
 		ASSERT0(new_idx.bti_offset);
 		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
 		zfs_btree_insert_into_leaf(tree,
 		    (zfs_btree_leaf_t *)new_idx.bti_node, displaced, 0);
 		kmem_free(displaced, elem_size);
 	} else {
 		/* Leaf target: the leaf insertion helper does all the work. */
 		zfs_btree_insert_into_leaf(tree, (zfs_btree_leaf_t *)target,
 		    value, where->bti_offset);
 	}
 	zfs_btree_verify(tree);
 }
 
 /*
  * Look up the first element of the tree. Returns NULL when the tree is
  * empty; otherwise returns the element and, if where is non-null, records
  * its location there.
  */
 void *
 zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
 {
 	if (tree->bt_height != -1)
 		return (zfs_btree_first_helper(tree, tree->bt_root, where));
 
 	/* A height of -1 means there are no nodes, hence no elements. */
 	ASSERT0(tree->bt_num_elems);
 	return (NULL);
 }
 
 /*
  * Return the last element of the subtree rooted at hdr. If where is
  * non-null it is filled in with that element's location.
  */
 static void *
 zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
     zfs_btree_index_t *where)
 {
 	zfs_btree_hdr_t *cur = hdr;
 
 	/* Keep following the rightmost child until we reach a leaf. */
 	while (zfs_btree_is_core(cur)) {
 		zfs_btree_core_t *core = (zfs_btree_core_t *)cur;
 		cur = core->btc_children[cur->bth_count];
 	}
 
 	if (where != NULL) {
 		where->bti_node = cur;
 		where->bti_offset = cur->bth_count - 1;
 		where->bti_before = B_FALSE;
 	}
 
 	/* Leaf elements are stored starting at slot bth_first. */
 	zfs_btree_leaf_t *last_leaf = (zfs_btree_leaf_t *)cur;
 	return (last_leaf->btl_elems +
 	    (cur->bth_first + cur->bth_count - 1) * btree->bt_elem_size);
 }
 
 /*
  * Look up the last element of the tree. Returns NULL when the tree is
  * empty; otherwise returns the element and, if where is non-null, records
  * its location there.
  */
 void *
 zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
 {
 	if (tree->bt_height != -1)
 		return (zfs_btree_last_helper(tree, tree->bt_root, where));
 
 	/* A height of -1 means there are no nodes, hence no elements. */
 	ASSERT0(tree->bt_num_elems);
 	return (NULL);
 }
 
 /*
  * This function contains the logic to find the next node in the tree. A
  * helper function is used because there are multiple internal consumers of
  * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
  * node after we've finished with it.
  *
  * Returns the element that follows idx, or NULL if there is none (empty
  * tree, or idx was the last element). When an element is returned, the
  * bti_node, bti_offset and bti_before fields of out_idx describe it (the
  * final case below relies on zfs_btree_first_helper to fill them in). idx
  * and out_idx may point to the same index. If done_func is non-NULL it is
  * called on each node the walk climbs out of while moving up the tree.
  */
 static void *
 zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
     zfs_btree_index_t *out_idx,
     void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
 {
 	if (idx->bti_node == NULL) {
 		ASSERT3S(tree->bt_height, ==, -1);
 		return (NULL);
 	}
 
 	uint32_t offset = idx->bti_offset;
 	if (!zfs_btree_is_core(idx->bti_node)) {
 		/*
 		 * When finding the next element of an element in a leaf,
 		 * there are two cases. If the element isn't the last one in
 		 * the leaf, in which case we just return the next element in
 		 * the leaf. Otherwise, we need to traverse up our parents
 		 * until we find one where our ancestor isn't the last child
 		 * of its parent. Once we do, the next element is the
 		 * separator after our ancestor in its parent.
 		 */
 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
 		/* A "before" index already points at the next element. */
 		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
 		if (leaf->btl_hdr.bth_count > new_off) {
 			out_idx->bti_node = &leaf->btl_hdr;
 			out_idx->bti_offset = new_off;
 			out_idx->bti_before = B_FALSE;
 			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
 			    new_off) * tree->bt_elem_size);
 		}
 
 		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
 		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
 		    node != NULL; node = node->btc_hdr.bth_parent) {
 			zfs_btree_hdr_t *hdr = &node->btc_hdr;
 			ASSERT(zfs_btree_is_core(hdr));
 			/*
 			 * Look up prev's slot before handing prev to
 			 * done_func, which may free it.
 			 */
 			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
 			if (done_func != NULL)
 				done_func(tree, prev);
 			if (i == hdr->bth_count) {
 				prev = hdr;
 				continue;
 			}
 			out_idx->bti_node = hdr;
 			out_idx->bti_offset = i;
 			out_idx->bti_before = B_FALSE;
 			return (node->btc_elems + i * tree->bt_elem_size);
 		}
 		if (done_func != NULL)
 			done_func(tree, prev);
 		/*
 		 * We've traversed all the way up and been at the end of the
 		 * node every time, so this was the last element in the tree.
 		 */
 		return (NULL);
 	}
 
 	/* If we were before an element in a core node, return that element. */
 	ASSERT(zfs_btree_is_core(idx->bti_node));
 	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
 	if (idx->bti_before) {
 		/*
 		 * Fill in the whole output index, not just bti_before, so
 		 * that the result is also valid when the caller passed a
 		 * separate out_idx. These are no-op writes when the two
 		 * indices alias.
 		 */
 		out_idx->bti_node = &node->btc_hdr;
 		out_idx->bti_offset = offset;
 		out_idx->bti_before = B_FALSE;
 		return (node->btc_elems + offset * tree->bt_elem_size);
 	}
 
 	/*
 	 * The next element from one in a core node is the first element in
 	 * the subtree just to the right of the separator.
 	 */
 	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
 	return (zfs_btree_first_helper(tree, child, out_idx));
 }
 
 /*
  * Step from idx to the element that follows it. Returns that element and
  * stores its location in out_idx. idx and out_idx may be the same address.
  */
 void *
 zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
     zfs_btree_index_t *out_idx)
 {
 	/* Plain iteration needs no per-node callback. */
 	void *next_elem = zfs_btree_next_helper(tree, idx, out_idx, NULL);
 
 	return (next_elem);
 }
 
 /*
  * Step from idx to the element that precedes it. Returns that element and
  * stores its location in out_idx, or returns NULL if idx is at the start of
  * the tree (or the tree is empty). idx and out_idx may be the same address.
  */
 void *
 zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
     zfs_btree_index_t *out_idx)
 {
 	zfs_btree_hdr_t *start = idx->bti_node;
 
 	if (start == NULL) {
 		ASSERT3S(tree->bt_height, ==, -1);
 		return (NULL);
 	}
 
 	uint32_t offset = idx->bti_offset;
 	size_t elem_size = tree->bt_elem_size;
 
 	if (zfs_btree_is_core(start)) {
 		/*
 		 * The predecessor of a separator in a core node is the last
 		 * element of the subtree immediately to its left.
 		 */
 		zfs_btree_core_t *core = (zfs_btree_core_t *)start;
 		return (zfs_btree_last_helper(tree,
 		    core->btc_children[offset], out_idx));
 	}
 
 	/* Leaf, easy case: the predecessor is in the same leaf. */
 	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)start;
 	if (offset > 0) {
 		uint32_t target = offset - 1;
 		out_idx->bti_node = &leaf->btl_hdr;
 		out_idx->bti_offset = target;
 		out_idx->bti_before = B_FALSE;
 		return (leaf->btl_elems +
 		    (leaf->btl_hdr.bth_first + target) * elem_size);
 	}
 
 	/*
 	 * Leaf, hard case: we are at the front of the leaf. Climb until the
 	 * node we came from is not its parent's first child; the separator
 	 * just to the left of that child is the predecessor.
 	 */
 	zfs_btree_hdr_t *prev = &leaf->btl_hdr;
 	zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
 	while (node != NULL) {
 		zfs_btree_hdr_t *hdr = &node->btc_hdr;
 		ASSERT(zfs_btree_is_core(hdr));
 		uint32_t child_slot = zfs_btree_find_parent_idx(tree, prev);
 		if (child_slot != 0) {
 			out_idx->bti_node = hdr;
 			out_idx->bti_offset = child_slot - 1;
 			out_idx->bti_before = B_FALSE;
 			return (node->btc_elems +
 			    (child_slot - 1) * elem_size);
 		}
 		prev = hdr;
 		node = hdr->bth_parent;
 	}
 
 	/*
 	 * Every ancestor had us as its first child, so there is nothing
 	 * before the starting position.
 	 */
 	return (NULL);
 }
 
 /*
  * Return a pointer to the element stored at idx.
  *
  * The caller may modify the element through the returned pointer, as long
  * as doing so does not change how it orders relative to any other element
  * that could be in the tree.
  */
 void *
 zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
 {
 	ASSERT(!idx->bti_before);
 
 	zfs_btree_hdr_t *hdr = idx->bti_node;
 	size_t elem_size = tree->bt_elem_size;
 
 	if (zfs_btree_is_core(hdr)) {
 		zfs_btree_core_t *core = (zfs_btree_core_t *)hdr;
 		return (core->btc_elems + idx->bti_offset * elem_size);
 	}
 
 	/* Leaf elements are stored starting at slot bth_first. */
 	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 	return (leaf->btl_elems +
 	    (leaf->btl_hdr.bth_first + idx->bti_offset) * elem_size);
 }
 
 /*
  * Insert the given value into the tree. The value must not already be
  * present; this is enforced with a VERIFY on the lookup.
  */
 void
 zfs_btree_add(zfs_btree_t *tree, const void *node)
 {
 	zfs_btree_index_t where = { .bti_node = NULL };
 
 	/* The failed lookup leaves the insertion point in where. */
 	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
 	zfs_btree_add_idx(tree, node, &where);
 }
 
 /*
  * Free a single tree node (core or leaf) and drop it from the tree's node
  * count. Children of a core node are not touched.
  */
 static void
 zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
 {
 	tree->bt_num_nodes--;
 
 	if (zfs_btree_is_core(node)) {
 		/* Core nodes carry BTREE_CORE_ELEMS element slots. */
 		size_t core_size = sizeof (zfs_btree_core_t) +
 		    BTREE_CORE_ELEMS * tree->bt_elem_size;
 		kmem_free(node, core_size);
 		return;
 	}
 
 	zfs_btree_leaf_free(tree, node);
 }
 
 /*
  * Remove the rm_hdr and the separator to its left from the parent node. The
  * buffer that rm_hdr was stored in may already be freed, so its contents
  * cannot be accessed.
  *
  * Depending on how full node is, this either collapses the root, removes
  * the child in place, borrows an element and child from a sibling, or
  * merges node with a sibling and recurses into node's parent.
  */
 static void
 zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
     zfs_btree_hdr_t *rm_hdr)
 {
 	size_t size = tree->bt_elem_size;
 	/*
 	 * A non-root node holding more than min_count elements can give one
 	 * up without any rebalancing.
 	 */
 	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
 	zfs_btree_hdr_t *hdr = &node->btc_hdr;
 	/*
 	 * If the node is the root node and rm_hdr is one of two children,
 	 * promote the other child to the root.
 	 */
 	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
 		ASSERT3U(hdr->bth_count, ==, 1);
 		ASSERT3P(tree->bt_root, ==, node);
 		ASSERT3P(node->btc_children[1], ==, rm_hdr);
 		tree->bt_root = node->btc_children[0];
 		node->btc_children[0]->bth_parent = NULL;
 		zfs_btree_node_destroy(tree, hdr);
 		tree->bt_height--;
 		return;
 	}
 
 	/* Find rm_hdr's slot by pointer comparison only. */
 	uint32_t idx;
 	for (idx = 0; idx <= hdr->bth_count; idx++) {
 		if (node->btc_children[idx] == rm_hdr)
 			break;
 	}
 	ASSERT3U(idx, <=, hdr->bth_count);
 
 	/*
 	 * If the node is the root or it has more than the minimum number of
 	 * children, just remove the child and separator, and return.
 	 */
 	if (hdr->bth_parent == NULL ||
 	    hdr->bth_count > min_count) {
 		/*
 		 * Shift the element and children to the right of rm_hdr to
 		 * the left by one spot.
 		 */
 		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
 		    BSS_PARALLELOGRAM);
 		hdr->bth_count--;
 		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
 		return;
 	}
 
 	ASSERT3U(hdr->bth_count, ==, min_count);
 
 	/*
 	 * Now we try to take a node from a neighbor. We check left, then
 	 * right. If the neighbor exists and has more than the minimum number
 	 * of elements, we move the separator between us and them to our
 	 * node, move their closest element (last for left, first for right)
 	 * to the separator, and move their closest child to our node. Along
 	 * the way we need to collapse the gap made by idx, and (for our right
 	 * neighbor) the gap made by removing their first element and child.
 	 *
 	 * Note: this logic currently doesn't support taking from a neighbor
 	 * that isn't a sibling (i.e. a neighbor with a different
 	 * parent). This isn't critical functionality, but may be worth
 	 * implementing in the future for completeness' sake.
 	 */
 	zfs_btree_core_t *parent = hdr->bth_parent;
 	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
 
 	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
 	    parent->btc_children[parent_idx - 1]);
 	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
 		/* We can take a node from the left neighbor. */
 		ASSERT(zfs_btree_is_core(l_hdr));
 		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
 
 		/*
 		 * Start by shifting the elements and children in the current
 		 * node to the right by one spot.
 		 */
 		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
 
 		/*
 		 * Move the separator between node and neighbor to the first
 		 * element slot in the current node.
 		 */
 		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
 		    size;
 		bcpy(separator, node->btc_elems, size);
 
 		/* Move the last child of neighbor to our first child slot. */
 		node->btc_children[0] =
 		    neighbor->btc_children[l_hdr->bth_count];
 		node->btc_children[0]->bth_parent = node;
 
 		/* Move the last element of neighbor to the separator spot. */
 		uint8_t *take_elem = neighbor->btc_elems +
 		    (l_hdr->bth_count - 1) * size;
 		bcpy(take_elem, separator, size);
 		/*
 		 * node lost one element and gained one, so only the
 		 * neighbor's count changes.
 		 */
 		l_hdr->bth_count--;
 		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
 		return;
 	}
 
 	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
 	    NULL : parent->btc_children[parent_idx + 1]);
 	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
 		/* We can take a node from the right neighbor. */
 		ASSERT(zfs_btree_is_core(r_hdr));
 		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
 
 		/*
 		 * Shift elements in node left by one spot to overwrite rm_hdr
 		 * and the separator before it.
 		 */
 		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
 		    BSS_PARALLELOGRAM);
 
 		/*
 		 * Move the separator between node and neighbor to the last
 		 * element spot in node.
 		 */
 		uint8_t *separator = parent->btc_elems + parent_idx * size;
 		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) * size,
 		    size);
 
 		/*
 		 * Move the first child of neighbor to the last child spot in
 		 * node.
 		 */
 		node->btc_children[hdr->bth_count] = neighbor->btc_children[0];
 		node->btc_children[hdr->bth_count]->bth_parent = node;
 
 		/* Move the first element of neighbor to the separator spot. */
 		uint8_t *take_elem = neighbor->btc_elems;
 		bcpy(take_elem, separator, size);
 		r_hdr->bth_count--;
 
 		/*
 		 * Shift the elements and children of neighbor to cover the
 		 * stolen elements.
 		 */
 		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
 		    BSS_TRAPEZOID);
 		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
 		return;
 	}
 
 	/*
 	 * In this case, neither of our neighbors can spare an element, so we
 	 * need to merge with one of them. We prefer the left one,
 	 * arbitrarily. Move the separator into the leftmost merging node
 	 * (which may be us or the left neighbor), and then move the right
 	 * merging node's elements. Once that's done, we go back and delete
 	 * the element we're removing. Finally, go into the parent and delete
 	 * the right merging node and the separator. This may cause further
 	 * merging.
 	 */
 	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
 	uint32_t new_idx = idx;
 	if (l_hdr != NULL) {
 		keep_hdr = l_hdr;
 		new_rm_hdr = hdr;
 		/*
 		 * node's children get appended after keep's existing
 		 * bth_count + 1 children, so rm_hdr's slot moves by that much.
 		 */
 		new_idx += keep_hdr->bth_count + 1;
 	} else {
 		ASSERT3P(r_hdr, !=, NULL);
 		keep_hdr = hdr;
 		new_rm_hdr = r_hdr;
 		/*
 		 * Bump parent_idx so that "parent_idx - 1" below names the
 		 * separator between keep and rm in both cases.
 		 */
 		parent_idx++;
 	}
 
 	ASSERT(zfs_btree_is_core(keep_hdr));
 	ASSERT(zfs_btree_is_core(new_rm_hdr));
 
 	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
 	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
 
 	if (zfs_btree_verify_intensity >= 5) {
 		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
 			zfs_btree_verify_poison_at(tree, keep_hdr,
 			    keep_hdr->bth_count + i);
 		}
 	}
 
 	/* Move the separator into the left node. */
 	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
 	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
 	    size;
 	bcpy(separator, e_out, size);
 	keep_hdr->bth_count++;
 
 	/* Move all our elements and children into the left node. */
 	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
 	    keep_hdr->bth_count, BSS_TRAPEZOID);
 
 	/* keep's element count including the separator just added. */
 	uint32_t old_count = keep_hdr->bth_count;
 
 	/* Update bookkeeping */
 	keep_hdr->bth_count += new_rm_hdr->bth_count;
 	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
 
 	/*
 	 * Shift the element and children to the right of rm_hdr to
 	 * the left by one spot.
 	 */
 	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
 	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
 	    BSS_PARALLELOGRAM);
 	keep_hdr->bth_count--;
 
 	/*
 	 * Reparent all our children to point to the left node. The walk
 	 * starts at old_count - 1 rather than old_count because the shift
 	 * above can move the transferred children down by one slot (when
 	 * rm_hdr sat among keep's original children). Otherwise that first
 	 * slot already belongs to keep and rewriting its parent is harmless.
 	 */
 	zfs_btree_hdr_t **new_start = keep->btc_children +
 	    old_count - 1;
 	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
 		new_start[i]->bth_parent = keep;
 	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
 		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
 		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
 	}
 	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);
 
 	/*
 	 * NOTE(review): presumably marks the drained node as empty before it
 	 * is unlinked and freed; confirm whether anything reads this count.
 	 */
 	new_rm_hdr->bth_count = 0;
 	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
 	zfs_btree_node_destroy(tree, new_rm_hdr);
 }
 
 /*
  * Remove the element at the specific location.
  *
  * where must refer to an existing element (not a "before" position). It is
  * overwritten along the way: when bulk insert mode has to be ended first,
  * and when the element lives in a core node. The removal itself always
  * happens in a leaf, which is then rebalanced by borrowing from or merging
  * with a sibling if it would become too empty.
  */
 void
 zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
 {
 	size_t size = tree->bt_elem_size;
 	zfs_btree_hdr_t *hdr = where->bti_node;
 	uint32_t idx = where->bti_offset;
 
 	ASSERT(!where->bti_before);
 	if (tree->bt_bulk != NULL) {
 		/*
 		 * Leave bulk insert mode. Note that our index would be
 		 * invalid after we correct the tree, so we copy the value
 		 * we're planning to remove and find it again after
 		 * bulk_finish.
 		 */
 		uint8_t *value = zfs_btree_get(tree, where);
 		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
 		bcpy(value, tmp, size);
 		zfs_btree_bulk_finish(tree);
 		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
 		kmem_free(tmp, size);
 		hdr = where->bti_node;
 		idx = where->bti_offset;
 	}
 
 	tree->bt_num_elems--;
 	/*
 	 * If the element happens to be in a core node, we move a leaf node's
 	 * element into its place and then remove the leaf node element. This
 	 * makes the rebalance logic not need to be recursive both upwards and
 	 * downwards.
 	 */
 	if (zfs_btree_is_core(hdr)) {
 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 		zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
 		/* The replacement is the element just before the separator. */
 		void *new_value = zfs_btree_last_helper(tree, left_subtree,
 		    where);
 		ASSERT3P(new_value, !=, NULL);
 
 		bcpy(new_value, node->btc_elems + idx * size, size);
 
 		/* From here on we remove the leaf copy of that element. */
 		hdr = where->bti_node;
 		idx = where->bti_offset;
 		ASSERT(!where->bti_before);
 	}
 
 	/*
 	 * First, we'll update the leaf's metadata. Then, we shift any
 	 * elements after the idx to the left. After that, we rebalance if
 	 * needed.
 	 */
 	ASSERT(!zfs_btree_is_core(hdr));
 	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 	ASSERT3U(hdr->bth_count, >, 0);
 
 	/*
 	 * A non-root leaf holding more than min_count elements can give one
 	 * up without any rebalancing.
 	 */
 	uint32_t min_count = (tree->bt_leaf_cap / 2) - 1;
 
 	/*
 	 * If we're over the minimum size or this is the root, just overwrite
 	 * the value and return.
 	 */
 	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
 		bt_shrink_leaf(tree, leaf, idx, 1);
 		if (hdr->bth_parent == NULL) {
 			ASSERT0(tree->bt_height);
 			/* Removing the last element empties the tree. */
 			if (hdr->bth_count == 0) {
 				tree->bt_root = NULL;
 				tree->bt_height--;
 				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
 			}
 		}
 		zfs_btree_verify(tree);
 		return;
 	}
 	ASSERT3U(hdr->bth_count, ==, min_count);
 
 	/*
 	 * Now we try to take a node from a sibling. We check left, then
 	 * right. If they exist and have more than the minimum number of
 	 * elements, we move the separator between us and them to our node
 	 * and move their closest element (last for left, first for right) to
 	 * the separator. Along the way we need to collapse the gap made by
 	 * idx, and (for our right neighbor) the gap made by removing their
 	 * first element.
 	 *
 	 * Note: this logic currently doesn't support taking from a neighbor
 	 * that isn't a sibling. This isn't critical functionality, but may be
 	 * worth implementing in the future for completeness' sake.
 	 */
 	zfs_btree_core_t *parent = hdr->bth_parent;
 	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
 
 	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
 	    parent->btc_children[parent_idx - 1]);
 	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
 		/* We can take a node from the left neighbor. */
 		ASSERT(!zfs_btree_is_core(l_hdr));
 		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr;
 
 		/*
 		 * Move our elements back by one spot to make room for the
 		 * stolen element and overwrite the element being removed.
 		 */
 		bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT);
 
 		/* Move the separator to our first spot. */
 		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
 		    size;
 		bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size);
 
 		/* Move our neighbor's last element to the separator. */
 		uint8_t *take_elem = neighbor->btl_elems +
 		    (l_hdr->bth_first + l_hdr->bth_count - 1) * size;
 		bcpy(take_elem, separator, size);
 
 		/* Delete our neighbor's last element. */
 		bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1);
 		zfs_btree_verify(tree);
 		return;
 	}
 
 	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
 	    NULL : parent->btc_children[parent_idx + 1]);
 	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
 		/* We can take a node from the right neighbor. */
 		ASSERT(!zfs_btree_is_core(r_hdr));
 		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
 
 		/*
 		 * Move our elements after the element being removed forwards
 		 * by one spot to make room for the stolen element and
 		 * overwrite the element being removed.
 		 */
 		bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1,
 		    1, BSD_LEFT);
 
 		/* Move the separator between us to our last spot. */
 		uint8_t *separator = parent->btc_elems + parent_idx * size;
 		bcpy(separator, leaf->btl_elems + (hdr->bth_first +
 		    hdr->bth_count - 1) * size, size);
 
 		/* Move our neighbor's first element to the separator. */
 		uint8_t *take_elem = neighbor->btl_elems +
 		    r_hdr->bth_first * size;
 		bcpy(take_elem, separator, size);
 
 		/* Delete our neighbor's first element. */
 		bt_shrink_leaf(tree, neighbor, 0, 1);
 		zfs_btree_verify(tree);
 		return;
 	}
 
 	/*
 	 * In this case, neither of our neighbors can spare an element, so we
 	 * need to merge with one of them. We prefer the left one, arbitrarily.
 	 * After remove we move the separator into the leftmost merging node
 	 * (which may be us or the left neighbor), and then move the right
 	 * merging node's elements. Once that's done, we go back and delete
 	 * the element we're removing. Finally, go into the parent and delete
 	 * the right merging node and the separator. This may cause further
 	 * merging.
 	 */
 	zfs_btree_hdr_t *rm_hdr, *k_hdr;
 	if (l_hdr != NULL) {
 		k_hdr = l_hdr;
 		rm_hdr = hdr;
 	} else {
 		ASSERT3P(r_hdr, !=, NULL);
 		k_hdr = hdr;
 		rm_hdr = r_hdr;
 		/*
 		 * Bump parent_idx so that "parent_idx - 1" below names the
 		 * separator between keep and rm in both cases.
 		 */
 		parent_idx++;
 	}
 	ASSERT(!zfs_btree_is_core(k_hdr));
 	ASSERT(!zfs_btree_is_core(rm_hdr));
 	ASSERT3U(k_hdr->bth_count, ==, min_count);
 	ASSERT3U(rm_hdr->bth_count, ==, min_count);
 	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr;
 	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
 
 	if (zfs_btree_verify_intensity >= 5) {
 		for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) {
 			zfs_btree_verify_poison_at(tree, k_hdr,
 			    k_hdr->bth_count + i);
 		}
 	}
 
 	/*
 	 * Remove the value from the node.  It will go below the minimum,
 	 * but we'll fix it in no time.
 	 */
 	bt_shrink_leaf(tree, leaf, idx, 1);
 
 	/*
 	 * Prepare space for elements to be moved from the right. k_count is
 	 * sampled after the shrink above, so it already reflects the removal
 	 * when keep is the leaf we removed from.
 	 */
 	uint32_t k_count = k_hdr->bth_count;
 	bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count);
 	ASSERT3U(k_hdr->bth_count, ==, min_count * 2);
 
 	/* Move the separator into the first open spot. */
 	uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size;
 	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
 	bcpy(separator, out, size);
 
 	/* Move our elements to the left neighbor. */
 	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1);
 
 	/* Remove the emptied node from the parent. */
 	zfs_btree_remove_from_node(tree, parent, rm_hdr);
 	zfs_btree_node_destroy(tree, rm_hdr);
 	zfs_btree_verify(tree);
 }
 
 /*
  * Remove the given value from the tree. The value must be present; this is
  * enforced with a VERIFY on the lookup.
  */
 void
 zfs_btree_remove(zfs_btree_t *tree, const void *value)
 {
 	zfs_btree_index_t where = { .bti_node = NULL };
 
 	/* The successful lookup leaves the element's location in where. */
 	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
 	zfs_btree_remove_idx(tree, &where);
 }
 
 /*
  * Return the number of elements stored in the tree. Despite the function's
  * name, this is the element count (bt_num_elems), not the node count.
  */
 ulong_t
 zfs_btree_numnodes(zfs_btree_t *tree)
 {
 	ulong_t num_elems = tree->bt_num_elems;
 
 	return (num_elems);
 }
 
 /*
  * This function is used to visit all the elements in the tree before
  * destroying the tree. This allows the calling code to perform any cleanup it
  * needs to do. This is more efficient than just removing the first element
  * over and over, because it removes all rebalancing. Once the destroy_nodes()
  * function has been called, no other btree operations are valid until it
  * returns NULL, at which point the only valid operation is
  * zfs_btree_destroy().
  *
  * The cookie must start out as NULL. It is allocated on the first call and
  * is freed and reset to NULL by the call that returns NULL, so the caller
  * never frees it.
  *
  * example:
  *
  *      zfs_btree_index_t *cookie = NULL;
  *      my_data_t *node;
  *
  *      while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
  *              free(node->ptr);
  *      zfs_btree_destroy(tree);
  *
  */
 void *
 zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
 {
 	if (*cookie == NULL) {
 		/* First call: nothing to walk if the tree has no nodes. */
 		if (tree->bt_height == -1)
 			return (NULL);
 		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
 		return (zfs_btree_first(tree, *cookie));
 	}
 
 	/*
 	 * Advance the cookie in place; each node the walk climbs out of is
 	 * freed via zfs_btree_node_destroy.
 	 */
 	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
 	    zfs_btree_node_destroy);
 	if (rval == NULL) {
 		tree->bt_root = NULL;
 		tree->bt_height = -1;
 		tree->bt_num_elems = 0;
 		kmem_free(*cookie, sizeof (**cookie));
 		/*
 		 * Don't leave the caller holding a dangling pointer: with the
 		 * cookie reset, an extra call returns NULL through the
 		 * empty-tree check above instead of reading freed memory.
 		 */
 		*cookie = NULL;
 		tree->bt_bulk = NULL;
 	}
 	return (rval);
 }
 
 /*
  * Free the subtree rooted at hdr: recurse into every child of a core node
  * first, then free the node itself.
  */
 static void
 zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 	if (zfs_btree_is_core(hdr)) {
 		zfs_btree_core_t *core = (zfs_btree_core_t *)hdr;
 		uint32_t child = 0;
 
 		/* A core node with bth_count elements has one more child. */
 		while (child <= hdr->bth_count) {
 			zfs_btree_clear_helper(tree,
 			    core->btc_children[child]);
 			child++;
 		}
 	}
 
 	zfs_btree_node_destroy(tree, hdr);
 }
 
 /*
  * Free every node in the tree and reset the tree's bookkeeping to the empty
  * state. Does nothing if the tree has no root.
  */
 void
 zfs_btree_clear(zfs_btree_t *tree)
 {
 	zfs_btree_hdr_t *root = tree->bt_root;
 
 	if (root != NULL) {
 		zfs_btree_clear_helper(tree, root);
 
 		/* Back to the empty-tree state. */
 		tree->bt_root = NULL;
 		tree->bt_bulk = NULL;
 		tree->bt_height = -1;
 		tree->bt_num_nodes = 0;
 		tree->bt_num_elems = 0;
 		return;
 	}
 
 	ASSERT0(tree->bt_num_elems);
 }
 
 /*
  * Final teardown of a tree. This function releases nothing itself; it only
  * asserts that the tree has already been emptied (no elements, no root).
  */
 void
 zfs_btree_destroy(zfs_btree_t *tree)
 {
 	ASSERT0(tree->bt_num_elems);
 	ASSERT3P(tree->bt_root, ==, NULL);
 }
 
 /*
  * Walk the subtree rooted at hdr and check that the bth_parent of every
  * child refers back to the core node that lists it. Leaf nodes have no
  * children, so there is nothing to check for them.
  */
 static void
 zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 	if (zfs_btree_is_core(hdr)) {
 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 		uint32_t i = 0;
 
 		while (i <= hdr->bth_count) {
 			VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
 			zfs_btree_verify_pointers_helper(tree,
 			    node->btc_children[i]);
 			i++;
 		}
 	}
 }
 
 /*
  * Check the parent pointers of the whole tree: the root must have no
  * parent, and every other node must point at the core node above it. An
  * empty tree (bt_height == -1) must have a NULL root.
  */
 static void
 zfs_btree_verify_pointers(zfs_btree_t *tree)
 {
 	if (tree->bt_height != -1) {
 		VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
 		zfs_btree_verify_pointers_helper(tree, tree->bt_root);
 	} else {
 		VERIFY3P(tree->bt_root, ==, NULL);
 	}
 }
 
 /*
  * Verify that the current node and all of its descendants satisfy the count
  * invariants, and return the total number of elements stored in the subtree
  * rooted at this node.
  *
  * The minimum-fill check is never applied to the root. For a leaf it is
  * applied only while bt_bulk is set and the leaf is not the bulk leaf; for
  * a core node it is applied only while bt_bulk is NULL.
  */
 static uint64_t
 zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 	if (!zfs_btree_is_core(hdr)) {
 		if (tree->bt_root != hdr && tree->bt_bulk &&
 		    hdr != &tree->bt_bulk->btl_hdr) {
 			VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1);
 		}
 
 		return (hdr->bth_count);
 	}
 
 	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 	/* Start with this node's own separators, then add each child. */
 	uint64_t ret = hdr->bth_count;
 
 	/*
 	 * These are element counts, not pointers, so use the unsigned
 	 * integer comparison, matching the leaf check above.
 	 */
 	if (tree->bt_root != hdr && tree->bt_bulk == NULL)
 		VERIFY3U(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
 
 	for (uint32_t i = 0; i <= hdr->bth_count; i++) {
 		ret += zfs_btree_verify_counts_helper(tree,
 		    node->btc_children[i]);
 	}
 
 	return (ret);
 }
 
 /*
  * Verify that all nodes satisfy the count invariants and that the number of
  * elements found by walking the tree equals bt_num_elems. The tree must be
  * empty (bt_height == -1) exactly when bt_num_elems is zero.
  */
 static void
 zfs_btree_verify_counts(zfs_btree_t *tree)
 {
 	EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
 	if (tree->bt_height == -1) {
 		return;
 	}
 	/*
 	 * Compare the totals as unsigned integers; they are element counts
 	 * rather than pointers.
 	 */
 	VERIFY3U(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
 	    tree->bt_num_elems);
 }
 
 /*
  * Check that every leaf below hdr sits at the same depth: height is
  * decremented on the way down and must be exactly zero at each leaf.
  * Returns the number of nodes in the subtree, hdr included, so the caller
  * can cross-check bt_num_nodes.
  */
 static uint64_t
 zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
     int32_t height)
 {
 	uint64_t nnodes = 1;	/* this node */
 
 	if (!zfs_btree_is_core(hdr)) {
 		VERIFY0(height);
 	} else {
 		zfs_btree_core_t *core = (zfs_btree_core_t *)hdr;
 
 		for (uint32_t c = 0; c <= hdr->bth_count; c++) {
 			nnodes += zfs_btree_verify_height_helper(tree,
 			    core->btc_children[c], height - 1);
 		}
 	}
 
 	return (nnodes);
 }
 
 /*
  * Check that the whole tree has a uniform height equal to bt_height, and
  * that the number of nodes reached from the root equals bt_num_nodes. An
  * empty tree (bt_height == -1) must have a NULL root and vice versa.
  */
 static void
 zfs_btree_verify_height(zfs_btree_t *tree)
 {
 	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
 
 	if (tree->bt_height != -1) {
 		VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
 		    tree->bt_height), ==, tree->bt_num_nodes);
 	}
 }
 
 /*
  * Check that the elements in this node are sorted, and that if this is a core
  * node, the separators are properly between the subtrees they separate and
  * that the children also satisfy this requirement.
  *
  * Adjacent elements must make bt_compar return exactly -1. A separator
  * must compare greater than the last element of its left child and less
  * than the first element of its right child; anything else panics.
  */
 static void
 zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 	size_t size = tree->bt_elem_size;
 	if (!zfs_btree_is_core(hdr)) {
 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 		/* Leaf elements start bth_first slots into btl_elems. */
 		for (uint32_t i = 1; i < hdr->bth_count; i++) {
 			VERIFY3S(tree->bt_compar(leaf->btl_elems +
 			    (hdr->bth_first + i - 1) * size,
 			    leaf->btl_elems +
 			    (hdr->bth_first + i) * size), ==, -1);
 		}
 		return;
 	}
 
 	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 	/* The separators of a core node must themselves be sorted. */
 	for (uint32_t i = 1; i < hdr->bth_count; i++) {
 		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
 		    node->btc_elems + i * size), ==, -1);
 	}
 	/* Separator i sits between children i (left) and i + 1 (right). */
 	for (uint32_t i = 0; i < hdr->bth_count; i++) {
 		/* Find the last element stored in the left child. */
 		uint8_t *left_child_last = NULL;
 		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
 		if (zfs_btree_is_core(left_child_hdr)) {
 			zfs_btree_core_t *left_child =
 			    (zfs_btree_core_t *)left_child_hdr;
 			left_child_last = left_child->btc_elems +
 			    (left_child_hdr->bth_count - 1) * size;
 		} else {
 			zfs_btree_leaf_t *left_child =
 			    (zfs_btree_leaf_t *)left_child_hdr;
 			left_child_last = left_child->btl_elems +
 			    (left_child_hdr->bth_first +
 			    left_child_hdr->bth_count - 1) * size;
 		}
 		/* The separator must be strictly greater than it. */
 		int comp = tree->bt_compar(node->btc_elems + i * size,
 		    left_child_last);
 		if (comp <= 0) {
 			panic("btree: compar returned %d (expected 1) at "
 			    "%px %d: compar(%px,  %px)", comp, node, i,
 			    node->btc_elems + i * size, left_child_last);
 		}
 
 		/* Find the first element stored in the right child. */
 		uint8_t *right_child_first = NULL;
 		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
 		if (zfs_btree_is_core(right_child_hdr)) {
 			zfs_btree_core_t *right_child =
 			    (zfs_btree_core_t *)right_child_hdr;
 			right_child_first = right_child->btc_elems;
 		} else {
 			zfs_btree_leaf_t *right_child =
 			    (zfs_btree_leaf_t *)right_child_hdr;
 			right_child_first = right_child->btl_elems +
 			    right_child_hdr->bth_first * size;
 		}
 		/* The separator must be strictly less than it. */
 		comp = tree->bt_compar(node->btc_elems + i * size,
 		    right_child_first);
 		if (comp >= 0) {
 			panic("btree: compar returned %d (expected -1) at "
 			    "%px %d: compar(%px,  %px)", comp, node, i,
 			    node->btc_elems + i * size, right_child_first);
 		}
 	}
 	/* Finally apply the same checks to every child subtree. */
 	for (uint32_t i = 0; i <= hdr->bth_count; i++)
 		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
 }
 
 /*
  * Check that all elements in the tree are in sorted order. An empty tree
  * (bt_height == -1) must have a NULL root and needs no further checking.
  */
 static void
 zfs_btree_verify_order(zfs_btree_t *tree)
 {
 	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
 
 	if (tree->bt_height != -1)
 		zfs_btree_verify_order_helper(tree, tree->bt_root);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Check that all unused memory is poisoned correctly.
  *
  * Unused element bytes must hold 0x0f and unused child pointers of a core
  * node must equal BTREE_POISON. Core nodes recurse into their children.
  */
 static void
 zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 {
 	size_t size = tree->bt_elem_size;
 	if (!zfs_btree_is_core(hdr)) {
 		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
 		/* Bytes in front of the first element in use. */
 		for (size_t i = 0; i < hdr->bth_first * size; i++)
 			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
 		/* esize: bytes of element storage in a leaf of bt_leaf_size. */
 		size_t esize = tree->bt_leaf_size -
 		    offsetof(zfs_btree_leaf_t, btl_elems);
 		/* Bytes behind the last element in use. */
 		for (size_t i = (hdr->bth_first + hdr->bth_count) * size;
 		    i < esize; i++)
 			VERIFY3U(leaf->btl_elems[i], ==, 0x0f);
 	} else {
 		zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
 		/* Element slots past bth_count are unused. */
 		for (size_t i = hdr->bth_count * size;
 		    i < BTREE_CORE_ELEMS * size; i++)
 			VERIFY3U(node->btc_elems[i], ==, 0x0f);
 
 		/* Child slots past bth_count + 1 children are unused. */
 		for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS;
 		    i++) {
 			VERIFY3P(node->btc_children[i], ==,
 			    (zfs_btree_hdr_t *)BTREE_POISON);
 		}
 
 		for (uint32_t i = 0; i <= hdr->bth_count; i++) {
 			zfs_btree_verify_poison_helper(tree,
 			    node->btc_children[i]);
 		}
 	}
 }
 #endif
 
 /*
  * Check that unused memory in the tree is still poisoned. The check is only
  * compiled in when ZFS_DEBUG is defined; otherwise this function is empty.
  */
 static void
 zfs_btree_verify_poison(zfs_btree_t *tree)
 {
 #ifdef ZFS_DEBUG
 	if (tree->bt_height != -1)
 		zfs_btree_verify_poison_helper(tree, tree->bt_root);
 #endif
 }
 
 /*
  * Run the consistency checks selected by zfs_btree_verify_intensity. Each
  * level runs its own check plus those of all lower levels:
  *
  *	0	no checking
  *	1	uniform height and node count
  *	2	parent pointers
  *	3	element counts
  *	4	element ordering
  *	5+	poisoning of unused memory (does work only under ZFS_DEBUG)
  *
  * The tunable is re-read before every stage.
  */
 void
 zfs_btree_verify(zfs_btree_t *tree)
 {
 	if (zfs_btree_verify_intensity == 0)
 		return;
 	zfs_btree_verify_height(tree);
 	if (zfs_btree_verify_intensity == 1)
 		return;
 	zfs_btree_verify_pointers(tree);
 	if (zfs_btree_verify_intensity == 2)
 		return;
 	zfs_btree_verify_counts(tree);
 	if (zfs_btree_verify_intensity == 3)
 		return;
 	zfs_btree_verify_order(tree);
 
 	if (zfs_btree_verify_intensity == 4)
 		return;
 	zfs_btree_verify_poison(tree);
 }
 
-/* BEGIN CSTYLED */
 /* Register the verification level as a ZMOD_RW unsigned module parameter. */
 ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW,
 	"Enable btree verification. Levels above 4 require ZFS be built "
 	"with debugging");
-/* END CSTYLED */
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c
index 137fe487a997..64924bc4fa61 100644
--- a/module/zfs/ddt_zap.c
+++ b/module/zfs/ddt_zap.c
@@ -1,266 +1,264 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018 by Delphix. All rights reserved.
  * Copyright (c) 2023, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/ddt.h>
 #include <sys/ddt_impl.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/zio_compress.h>
 
 /* Tunable passed to zap_create_flags(); described as the leaf blockshift. */
 static unsigned int ddt_zap_default_bs = 15;
 /* Tunable passed to zap_create_flags(); described as the indirect blockshift. */
 static unsigned int ddt_zap_default_ibs = 15;
 
 /*
  * Layout of the one-byte header stored in front of every entry: the top bit
  * is the byte-order flag and the low seven bits are the compression function.
  */
 #define	DDT_ZAP_COMPRESS_BYTEORDER_MASK	0x80
 #define	DDT_ZAP_COMPRESS_FUNCTION_MASK	0x7f
 
 /* Size of a ddt_key_t expressed in 64-bit words. */
 #define	DDT_KEY_WORDS	(sizeof (ddt_key_t) / sizeof (uint64_t))
 
 /*
  * Compress s_len bytes from src into dst using ZLE, prefixed by a one-byte
  * header that records the compression function and the host byte-order
  * flag. If the compressor reports a length equal to s_len, the payload is
  * stored uncompressed instead. Returns the total number of bytes written,
  * header included. dst must have room for at least s_len + 1 bytes.
  */
 static size_t
 ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len)
 {
 	uchar_t *version = dst;		/* header byte */
 	uchar_t *payload = dst + 1;	/* data follows the header */
 	int cpfunc = ZIO_COMPRESS_ZLE;
 	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
 	abd_t sabd, dabd;
 	size_t c_len;
 
 	ASSERT3U(d_len, >=, s_len + 1);	/* no compression plus version byte */
 
 	/*
 	 * Call compress function directly to avoid hole detection.
 	 * NOTE(review): dabd is described as d_len bytes although it starts
 	 * one byte into dst; the compressor itself is limited to d_len - 1.
 	 */
 	abd_get_from_buf_struct(&sabd, (void *)src, s_len);
 	abd_get_from_buf_struct(&dabd, payload, d_len);
 	c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level);
 	abd_free(&dabd);
 	abd_free(&sabd);
 
 	/* No gain from compression: store the raw bytes instead. */
 	if (c_len == s_len) {
 		cpfunc = ZIO_COMPRESS_OFF;
 		memcpy(payload, src, s_len);
 	}
 
 	*version = cpfunc;
 	if (ZFS_HOST_BYTEORDER)
 		*version |= DDT_ZAP_COMPRESS_BYTEORDER_MASK;
 
 	return (c_len + 1);
 }
 
 /*
  * Inverse of ddt_zap_compress(): read the header byte at src, then produce
  * d_len bytes in dst from the payload that follows it. When the recorded
  * compression function has no decompressor, the payload is copied verbatim
  * and no byte swapping is done. Otherwise the data is decompressed and, if
  * the recorded byte-order flag differs from the host's, byte-swapped as an
  * array of 64-bit words.
  */
 static void
 ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
 {
 	abd_t sabd, dabd;
 	uchar_t version = src[0];
 	int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK;
 
 	/* Step over the header byte; src now points at the payload. */
 	src++;
 
 	if (zio_compress_table[cpfunc].ci_decompress == NULL) {
 		memcpy(dst, src, d_len);
 		return;
 	}
 
 	abd_get_from_buf_struct(&sabd, src, s_len);
 	abd_get_from_buf_struct(&dabd, dst, d_len);
 	VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL));
 	abd_free(&dabd);
 	abd_free(&sabd);
 
 	int stored_order = (version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0;
 	int host_order = (ZFS_HOST_BYTEORDER != 0);
 
 	if (stored_order != host_order)
 		byteswap_uint64_array(dst, d_len);
 }
 
 /*
  * Create the ZAP object backing a DDT and store its object number in
  * *objectp. The ZAP uses 64-bit hashes and uint64 keys; prehash additionally
  * requests ZAP_FLAG_PRE_HASHED_KEY. Returns 0 on success, or ENOTSUP when
  * zap_create_flags() yields object number 0.
  */
 static int
 ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
 {
 	zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
 	uint64_t obj;
 
 	if (prehash)
 		flags |= ZAP_FLAG_PRE_HASHED_KEY;
 
 	obj = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
 	    ddt_zap_default_bs, ddt_zap_default_ibs,
 	    DMU_OT_NONE, 0, tx);
 	*objectp = obj;
 
 	if (obj == 0)
 		return (SET_ERROR(ENOTSUP));
 
 	return (0);
 }
 
 /* Destroy the ZAP object; returns the result of zap_destroy(). */
 static int
 ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	return (zap_destroy(os, object, tx));
 }
 
 /*
  * Look up the entry stored under ddk and decode it into phys (psize bytes).
  * The stored length is queried first so that a buffer of exactly that size
  * can be fetched and passed to ddt_zap_decompress(). Returns 0 on success
  * or the error from the ZAP length/lookup call.
  */
 static int
 ddt_zap_lookup(objset_t *os, uint64_t object,
     const ddt_key_t *ddk, void *phys, size_t psize)
 {
 	uint64_t *key = (uint64_t *)ddk;
 	uint64_t one, csize;
 	int error;
 
 	error = zap_length_uint64(os, object, key, DDT_KEY_WORDS, &one,
 	    &csize);
 	if (error != 0)
 		return (error);
 
 	ASSERT3U(one, ==, 1);
 	ASSERT3U(csize, <=, psize + 1);
 
 	uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP);
 
 	error = zap_lookup_uint64(os, object, key, DDT_KEY_WORDS, 1, csize,
 	    cbuf);
 	if (error == 0)
 		ddt_zap_decompress(cbuf, phys, csize, psize);
 
 	kmem_free(cbuf, csize);
 
 	return (error);
 }
 
 /*
  * Query the ZAP for ddk without fetching the entry. Neither length is
  * requested (both out-parameters are NULL); the return code of
  * zap_length_uint64() is passed straight back to the caller.
  */
 static int
 ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk)
 {
 	uint64_t *key = (uint64_t *)ddk;
 
 	return (zap_length_uint64(os, object, key, DDT_KEY_WORDS, NULL, NULL));
 }
 
 /* Prefetch the entry for ddk; the result of the prefetch call is ignored. */
 static void
 ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk)
 {
 	(void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS);
 }
 
 /* Prefetch the whole ZAP object; the result of the call is ignored. */
 static void
 ddt_zap_prefetch_all(objset_t *os, uint64_t object)
 {
 	(void) zap_prefetch_object(os, object);
 }
 
 /*
  * Encode phys (psize bytes) with ddt_zap_compress() and store the result
  * under ddk. Returns the result of zap_update_uint64().
  */
 static int
 ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk,
     const void *phys, size_t psize, dmu_tx_t *tx)
 {
 	/* Room for the payload stored uncompressed plus the header byte. */
 	const size_t cbuf_size = psize + 1;
 	uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP);
 	uint64_t csize;
 	int error;
 
 	csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size);
 	error = zap_update_uint64(os, object, (uint64_t *)ddk,
 	    DDT_KEY_WORDS, 1, csize, cbuf, tx);
 
 	kmem_free(cbuf, cbuf_size);
 
 	return (error);
 }
 
 /* Remove the entry for ddk; returns the result of zap_remove_uint64(). */
 static int
 ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk,
     dmu_tx_t *tx)
 {
 	return (zap_remove_uint64(os, object, (uint64_t *)ddk,
 	    DDT_KEY_WORDS, tx));
 }
 
 /*
  * Return the next entry of the ZAP in iteration order. *walk is the
  * serialized cursor position: 0 starts a new walk, and on success it is
  * advanced past the entry returned. The key is copied to *ddk and the
  * decoded value to phys (psize bytes). Returns 0 on success or the error
  * from the cursor retrieval or the lookup.
  */
 static int
 ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk,
     void *phys, size_t psize)
 {
 	zap_attribute_t *za = zap_attribute_alloc();
 	zap_cursor_t zc;
 	uchar_t *cbuf;
 	uint64_t csize;
 	int error;
 
 	if (*walk != 0) {
 		zap_cursor_init_serialized(&zc, os, object, *walk);
 	} else {
 		/*
 		 * We don't want to prefetch the entire ZAP object, because
 		 * it can be enormous.  Also the primary use of DDT iteration
 		 * is for scrubbing, in which case we will be issuing many
 		 * scrub I/Os for each ZAP block that we read in, so
 		 * reading the ZAP is unlikely to be the bottleneck.
 		 */
 		zap_cursor_init_noprefetch(&zc, os, object);
 	}
 
 	error = zap_cursor_retrieve(&zc, za);
 	if (error != 0)
 		goto out;
 
 	csize = za->za_num_integers;
 
 	ASSERT3U(za->za_integer_length, ==, 1);
 	ASSERT3U(csize, <=, psize + 1);
 
 	cbuf = kmem_alloc(csize, KM_SLEEP);
 
 	error = zap_lookup_uint64(os, object, (uint64_t *)za->za_name,
 	    DDT_KEY_WORDS, 1, csize, cbuf);
 	ASSERT0(error);
 	if (error == 0) {
 		ddt_zap_decompress(cbuf, phys, csize, psize);
 		*ddk = *(ddt_key_t *)za->za_name;
 	}
 
 	kmem_free(cbuf, csize);
 
 	/* The cursor moves on even when the lookup above failed. */
 	zap_cursor_advance(&zc);
 	*walk = zap_cursor_serialize(&zc);
 
 out:
 	zap_cursor_fini(&zc);
 	zap_attribute_free(za);
 	return (error);
 }
 
 /* Store the entry count in *count; returns the result of zap_count(). */
 static int
 ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
 {
 	return (zap_count(os, object, count));
 }
 
 /*
  * Operations vector for the "zap" DDT storage backend. The initializer is
  * positional, so the entries must stay in the member order of ddt_ops_t.
  * NOTE(review): ddt_ops_t is declared elsewhere -- confirm the order there
  * before adding or moving an entry.
  */
 const ddt_ops_t ddt_zap_ops = {
 	"zap",
 	ddt_zap_create,
 	ddt_zap_destroy,
 	ddt_zap_lookup,
 	ddt_zap_contains,
 	ddt_zap_prefetch,
 	ddt_zap_prefetch_all,
 	ddt_zap_update,
 	ddt_zap_remove,
 	ddt_zap_walk,
 	ddt_zap_count,
 };
 
-/* BEGIN CSTYLED */
 /* Register both block shifts as ZMOD_RW unsigned module parameters. */
 ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW,
 	"DDT ZAP leaf blockshift");
 ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW,
 	"DDT ZAP indirect blockshift");
-/* END CSTYLED */
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 4830f4850a31..32609399b79e 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1,2951 +1,2949 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2019, 2023, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_racct.h>
 #include <sys/zfs_rlock.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable nopwrite feature.
  */
 static int zfs_nopwrite_enabled = 1;
 
 /*
  * Tunable to control percentage of dirtied L1 blocks from frees allowed into
  * one TXG. After this threshold is crossed, additional dirty blocks from frees
  * will wait until the next TXG.
  * A value of zero will disable this throttle.
  */
 static uint_t zfs_per_txg_dirty_frees_percent = 30;
 
 /*
  * Enable/disable forcing txg sync when dirty checking for holes with lseek().
  * By default this is enabled to ensure accurate hole reporting, it can result
  * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads.
  * Disabling this option will result in holes never being reported in dirty
  * files which is always safe.
  */
 static int zfs_dmu_offset_next_sync = 1;
 
 /*
  * Limit the amount we can prefetch with one call to this amount.  This
  * helps to limit the amount of memory that can be used by prefetching.
  * Larger objects should be prefetched a bit at a time.
  */
 #ifdef _ILP32
 uint_t dmu_prefetch_max = 8 * 1024 * 1024;
 #else
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 #endif
 
 /*
  * Override copies= for dedup state objects. 0 means the traditional behaviour
  * (i.e. the default for the containing objset, which is 3 for the MOS).
  */
 uint_t dmu_ddt_copies = 0;
 
 /*
  * Per-object-type property table with DMU_OT_NUMTYPES slots. The rows are
  * positional, one per object type; each holds a DMU_BSWAP_* selector, three
  * boolean flags and a human-readable type name.
  * NOTE(review): the meaning of the three flags comes from the definition of
  * dmu_object_type_info_t, which is not visible here -- confirm it there
  * before adding or editing a row.
  */
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "object array"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "packed nvlist"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "packed nvlist size"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj"			},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj header"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map header"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, TRUE,  "ZIL intent log"	},
 	{DMU_BSWAP_DNODE,  TRUE,  FALSE, TRUE,  "DMU dnode"		},
 	{DMU_BSWAP_OBJSET, TRUE,  TRUE,  FALSE, "DMU objset"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL directory child map"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset snap map"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL props"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL dataset"		},
 	{DMU_BSWAP_ZNODE,  TRUE,  FALSE, FALSE, "ZFS znode"		},
 	{DMU_BSWAP_OLDACL, TRUE,  FALSE, TRUE,  "ZFS V0 ACL"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "ZFS plain file"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "ZFS master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS delete queue"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "zvol object"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "zvol prop"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "other uint8[]"		},
 	{DMU_BSWAP_UINT64, FALSE, FALSE, TRUE,  "other uint64[]"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "other ZAP"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "persistent error log"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "SPA history"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA history offsets"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "Pool properties"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL permissions"	},
 	{DMU_BSWAP_ACL,    TRUE,  FALSE, TRUE,  "ZFS ACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "ZFS SYSACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "FUID table"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "FUID table size"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset next clones"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan work queue"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project used" },
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project quota"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "snapshot refcount tags"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT ZAP algorithm"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT statistics"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,	"System attributes"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr registration"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr layouts"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan translations"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "deduplicated block"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL deadlist map"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL deadlist map hdr"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dir clones"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj subobj"		}
 };
 
 /*
  * Byteswap routine table with DMU_BSWAP_NUMFUNCS slots; each row pairs a
  * byteswap function with a short name.
  * NOTE(review): presumably indexed by the DMU_BSWAP_* values used in dmu_ot,
  * so the row order would have to match that enumeration -- confirm.
  */
 dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 /*
  * Hold the level-0 dbuf of dn that covers offset, without issuing a read.
  * dn_struct_rwlock is taken as reader around the block lookup and the hold.
  * On success *dbp is set and 0 is returned; if no dbuf could be held, *dbp
  * is set to NULL and EIO is returned.
  */
 int
 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db != NULL) {
 		*dbp = &db->db;
 		return (0);
 	}
 
 	*dbp = NULL;
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Same as dmu_buf_hold_noread_by_dnode(), but the dnode is named by
  * <os, object>. The dnode is held only for the duration of the call.
  * Returns the dnode_hold() error if the dnode cannot be held, EIO (with
  * *dbp set to NULL) if no dbuf could be held, and 0 otherwise.
  */
 int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err != 0)
 		return (err);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	if (db != NULL) {
 		*dbp = &db->db;
 		return (0);
 	}
 
 	*dbp = NULL;
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Hold the dbuf of dn that covers offset and read its contents. The
  * DMU_READ_NO_PREFETCH and DMU_READ_NO_DECRYPT bits of flags are translated
  * to the matching DB_RF_* read flags. If the read fails, the hold is
  * dropped again, *dbp is set to NULL and the error is returned.
  */
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, int flags)
 {
 	int db_flags = DB_RF_CANFAIL;
 	int err;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err != 0)
 		return (err);
 
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 
 	err = dbuf_read(db, NULL, db_flags);
 	if (err != 0) {
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	}
 
 	return (err);
 }
 
 /*
  * Hold the dbuf of <os, object> that covers offset and read its contents.
  * The DMU_READ_NO_PREFETCH and DMU_READ_NO_DECRYPT bits of flags are
  * translated to the matching DB_RF_* read flags. If the read fails, the
  * hold is dropped again, *dbp is set to NULL and the error is returned.
  */
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, int flags)
 {
 	int db_flags = DB_RF_CANFAIL;
 	int err;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err != 0)
 		return (err);
 
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 
 	err = dbuf_read(db, NULL, db_flags);
 	if (err != 0) {
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	}
 
 	return (err);
 }
 
 /* Return the bonus length limit, which is the DN_OLD_MAX_BONUSLEN constant. */
 int
 dmu_bonus_max(void)
 {
 	return (DN_OLD_MAX_BONUSLEN);
 }
 
 /*
  * Set the bonus length of the dnode behind db_fake to newsize. Returns
  * EINVAL if newsize is negative or larger than the buffer's db_size, or if
  * db_fake is not the dnode's bonus buffer; returns 0 otherwise.
  */
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error = 0;
 
 	if (newsize < 0 || newsize > db_fake->db_size)
 		return (SET_ERROR(EINVAL));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (dn->dn_bonus == db)
 		dnode_setbonuslen(dn, newsize, tx);
 	else
 		error = SET_ERROR(EINVAL);
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 /*
  * Set the bonus type of the dnode behind db_fake. Returns EINVAL if type
  * fails DMU_OT_IS_VALID() or if db_fake is not the dnode's bonus buffer;
  * returns 0 otherwise.
  */
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error = 0;
 
 	if (!DMU_OT_IS_VALID(type))
 		return (SET_ERROR(EINVAL));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (dn->dn_bonus == db)
 		dnode_setbonus_type(dn, type, tx);
 	else
 		error = SET_ERROR(EINVAL);
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 /*
  * Return the dn_bonustype of the dnode behind db_fake. The dnode is only
  * accessed between DB_DNODE_ENTER() and DB_DNODE_EXIT().
  */
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_object_type_t type;
 
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_bonustype;
 	DB_DNODE_EXIT(db);
 
 	return (type);
 }
 
 /*
  * Remove the spill block of <os, object>.
  *
  * The dnode is held for the duration of the call. dbuf_rm_spill() is called
  * first, then dnode_rm_spill() with dn_struct_rwlock held as writer.
  * Returns 0 on success, or the error from dnode_hold() if the dnode cannot
  * be held, in which case nothing is modified.
  */
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	/* dn is not valid when the hold fails, so it must not be touched. */
 	if (error != 0)
 		return (error);
 
 	dbuf_rm_spill(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * Lookup and hold the bonus buffer for the provided dnode.  If the dnode
  * does not yet have a bonus dbuf, a new one will be created.
  * Returns ENOENT, EIO, or 0.
  *
  * On success *dbp points at the held bonus buffer. If reading the buffer
  * fails, the hold is dropped, *dbp is set to NULL and the error returned.
  */
 int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
     uint32_t flags)
 {
 	dmu_buf_impl_t *db;
 	int error;
 	uint32_t db_flags = DB_RF_MUST_SUCCEED;
 
 	/* Translate the caller's DMU_READ_* bits into DB_RF_* read flags. */
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 	if (flags & DMU_READ_NO_DECRYPT)
 		db_flags |= DB_RF_NO_DECRYPT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		/*
 		 * Creating the bonus dbuf needs the lock as writer. If the
 		 * upgrade fails the lock is dropped and retaken, so
 		 * dn_bonus has to be checked again afterwards.
 		 */
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (zfs_refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
 		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
 	 * a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	error = dbuf_read(db, NULL, db_flags);
 	if (error) {
 		/* Undo the hold taken above before reporting the failure. */
 		dnode_evict_bonus(dn);
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 		return (error);
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 
 /*
  * Hold the bonus buffer of <os, object>. The dnode is held only for the
  * duration of the call, and the bonus buffer is requested with
  * DMU_READ_NO_PREFETCH. Returns the error from dnode_hold() or from
  * dmu_bonus_hold_by_dnode(), or 0 on success.
  */
 int
 dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dmu_bonus_hold_by_dnode(dn, tag, dbp,
 		    DMU_READ_NO_PREFETCH);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Hold and read the spill dbuf (DMU_SPILL_BLKID) of dn.
  * Returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when a spill block does
  * not already exist on the dnode. To find only an already existing spill
  * dbuf, use dmu_spill_hold_existing() instead.
  *
  * flags is passed to dbuf_read() unchanged. If DB_RF_HAVESTRUCT is set in
  * flags, dn_struct_rwlock is not taken here. On failure *dbp is set to
  * NULL.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	const boolean_t need_lock = ((flags & DB_RF_HAVESTRUCT) == 0);
 	dmu_buf_impl_t *db;
 	int err;
 
 	if (need_lock)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 
 	if (need_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	err = dbuf_read(db, NULL, flags);
 	if (err != 0) {
 		/* The read failed: give the hold back. */
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 	} else {
 		*dbp = &db->db;
 	}
 
 	return (err);
 }
 
 /*
  * Hold the spill dbuf of the dnode that owns the given bonus buffer, but
  * only if the dnode already has a spill block. Returns EINVAL if the pool
  * version is below SPA_VERSION_SA, ENOENT if dn_have_spill is not set, and
  * otherwise the result of dmu_spill_hold_by_dnode().
  */
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_SA) {
 		/* dn_have_spill is examined under dn_struct_rwlock. */
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (dn->dn_have_spill) {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		} else {
 			err = SET_ERROR(ENOENT);
 		}
 
 		rw_exit(&dn->dn_struct_rwlock);
 	} else {
 		err = SET_ERROR(EINVAL);
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 /*
  * Hold the spill dbuf of the dnode that owns the given bonus buffer. The
  * read is issued with DB_RF_CANFAIL, plus DB_RF_NO_DECRYPT when flags
  * contains DMU_READ_NO_DECRYPT. Returns the result of
  * dmu_spill_hold_by_dnode().
  */
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	uint32_t db_flags = DB_RF_CANFAIL;
 	int err;
 
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		db_flags = DB_RF_CANFAIL | DB_RF_NO_DECRYPT;
 
 	DB_DNODE_ENTER(db);
 	err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Hold every dbuf of dnode "dn" that overlaps [offset, offset + length) and
  * hand the array back through *dbpp / *numbufsp.  When "read" is set the
  * buffers are also read in (asynchronously under one root zio, which is then
  * waited on); otherwise they are only held.
  *
  * "flags" is consulted for DMU_READ_NO_DECRYPT (passed on to dbuf_read() as
  * DB_RF_NO_DECRYPT), DMU_READ_NO_PREFETCH (skips the zfetch notification)
  * and DMU_DIRECTIO (skips the write accounting).
  *
  * Returns 0 on success.  On failure the holds taken here are dropped and the
  * array is freed with dmu_buf_rele_array().  The error is EIO when
  * dn_datablkshift is 0 and the range extends past dn_datablksz, when
  * dbuf_hold() fails, or when a buffer is left DB_UNCACHED after the read;
  * otherwise it is the zio_wait() error.
  *
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  */
 int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
     uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
 	boolean_t missed = B_FALSE;
 
 	ASSERT(!read || length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
 	 * we can tell it about the multi-block read.  dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
 	    DB_RF_NOPREFETCH;
 
 	if ((flags & DMU_READ_NO_DECRYPT) != 0)
 		dbuf_flags |= DB_RF_NO_DECRYPT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	/* Work out how many blocks the requested range touches. */
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 		    P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t))
 		    >> blkshift;
 	} else {
 		/* No block shift: the range must fit in dn_datablksz. */
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	/* Zero-filled so a partially built array can be released safely. */
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	/* Reads are issued as children of this root zio. */
 	if (read)
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	if ((flags & DMU_READ_NO_PREFETCH) == 0) {
 		/*
 		 * Prepare the zfetch before initiating the demand reads, so
 		 * that if multiple threads block on same indirect block, we
 		 * base predictions on the original less racy request order.
 		 */
 		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read,
 		    B_TRUE);
 	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			/*
 			 * Unwind: finish the prepared zfetch, drop the lock,
 			 * release what was held so far and let the root zio
 			 * go without waiting for it.
 			 */
 			if (zs) {
 				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
 				    B_TRUE);
 			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
 				zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
 		/*
 		 * Initiate async demand data read.
 		 * We check the db_state after calling dbuf_read() because
 		 * (1) dbuf_read() may change the state to CACHED due to a
 		 * hit in the ARC, and (2) on a cache miss, a child will
 		 * have been added to "zio" but not yet completed, so the
 		 * state will not yet be CACHED.
 		 */
 		if (read) {
 			/*
 			 * Last buffer of the request, not the object's last
 			 * block, and the request ends before the end of the
 			 * buffer: tell dbuf_read() the access is partial.
 			 */
 			if (i == nblks - 1 && blkid + i < dn->dn_maxblkid &&
 			    offset + length < db->db.db_offset +
 			    db->db.db_size) {
 				if (offset <= db->db.db_offset)
 					dbuf_flags |= DB_RF_PARTIAL_FIRST;
 				else
 					dbuf_flags |= DB_RF_PARTIAL_MORE;
 			}
 			(void) dbuf_read(db, zio, dbuf_flags);
 			if (db->db_state != DB_CACHED)
 				missed = B_TRUE;
 		}
 		dbp[i] = &db->db;
 	}
 
 	/*
 	 * If we are doing O_DIRECT we still hold the dbufs, even for reads,
 	 * but we do not issue any reads here. We do not want to account for
 	 * writes in this case.
 	 *
 	 * O_DIRECT write/read accounting takes place in
 	 * dmu_{write/read}_abd().
 	 */
 	if (!read && ((flags & DMU_DIRECTIO) == 0))
 		zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
 
 	if (zs)
 		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
 		/* wait for async read i/o */
 		err = zio_wait(zio);
 		if (err) {
 			dmu_buf_rele_array(dbp, nblks, tag);
 			return (err);
 		}
 
 		/* wait for other io to complete */
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			/* Still uncached after the wait: report EIO. */
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 /*
  * Object-number flavor of dmu_buf_hold_array_by_dnode(): take a temporary
  * hold on the dnode of <os, object>, hold the buffers covering
  * [offset, offset + length) with DMU_READ_PREFETCH, then drop the dnode
  * hold again.  Returns the dnode_hold() error if the object cannot be held,
  * otherwise whatever dmu_buf_hold_array_by_dnode() returns.
  */
 int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dmu_buf_hold_array_by_dnode(dn, offset, length, read,
 		    tag, numbufsp, dbpp, DMU_READ_PREFETCH);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Same as dmu_buf_hold_array_by_dnode(), except that the dnode is obtained
  * from an already-held dbuf (db_fake) rather than passed in.  The dnode is
  * only used between DB_DNODE_ENTER() and DB_DNODE_EXIT(), and the request
  * is always made with DMU_READ_PREFETCH.
  */
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 	error = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(db);
 
 	return (error);
 }
 
 /*
  * Release an array of buffer holds and free the array itself.  NULL slots
  * are skipped, so a partially populated array may be passed.  Nothing is
  * done (and nothing is freed) when numbufs is 0.
  */
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
 {
 	dmu_buf_impl_t **held = (dmu_buf_impl_t **)dbp_fake;
 
 	if (numbufs == 0)
 		return;
 
 	for (int idx = 0; idx < numbufs; idx++) {
 		if (held[idx] != NULL)
 			dbuf_rele(held[idx], tag);
 	}
 
 	kmem_free(held, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Start prefetch I/O for the range [offset, offset + len) of an object.
  * For level > 0 the blocks prefetched are the indirect blocks that point at
  * the data in that range.  A range that is too long is split: the first
  * dmu_prefetch_max bytes are prefetched as requested and the remainder only
  * at a higher level that also fits within dmu_prefetch_max.  This mainly
  * benefits random reads; long sequential reads are served by the speculative
  * prefetcher.
  *
  * Indirect blocks above the requested ones that are not cached are read in
  * asynchronously.  The dnode read done by dnode_hold() is currently
  * synchronous.
  *
  * When dmu_prefetch_max is 0 or the range is empty, only the object's dnode
  * is prefetched.  A dnode_hold() failure is silently ignored.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 
 	if (dmu_prefetch_max == 0 || len == 0) {
 		dmu_prefetch_dnode(os, object, pri);
 		return;
 	}
 
 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
 		dmu_prefetch_by_dnode(dn, level, offset, len, pri);
 		dnode_rele(dn, FTAG);
 	}
 }
 
 /*
  * Issue prefetches for the byte range [offset, offset + len) of dnode "dn".
  * Blocks [start, end) are prefetched at "level"; the part of the range that
  * does not fit is covered by prefetching blocks [start2, end2) at a higher
  * level2 instead.  Both limits are derived from dmu_prefetch_max.
  * dn_struct_rwlock is held as reader while the prefetches are issued.
  *
  * NOTE(review): "offset + len - 1" assumes len != 0.  dmu_prefetch() filters
  * out len == 0 before calling; other callers should be checked.
  */
 void
 dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	int64_t level2 = level;
 	uint64_t start, end, start2, end2;
 
 	/*
 	 * Depending on len we may do two prefetches: blocks [start, end) at
 	 * level, and following blocks [start2, end2) at higher level2.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift != 0) {
 		/*
 		 * The object has multiple blocks.  Calculate the full range
 		 * of blocks [start, end2) and then split it into two parts,
 		 * so that the first [start, end) fits into dmu_prefetch_max.
 		 */
 		start = dbuf_whichblock(dn, level, offset);
 		end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
 		uint8_t ibs = dn->dn_indblkshift;
 		/* Block shift at "level": data shift for L0, else indirect. */
 		uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
 		/* dmu_prefetch_max expressed in blocks of that size. */
 		uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
 		start2 = end = MIN(end2, start + limit);
 
 		/*
 		 * Find level2 where [start2, end2) fits into dmu_prefetch_max.
 		 */
 		uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
 		limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
 		/*
 		 * Each pass moves one level up, converting the block range
 		 * into the range of blocks at the next level.
 		 */
 		do {
 			level2++;
 			start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
 			end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
 		} while (end2 - start2 > limit);
 	} else {
 		/* There is only one block.  Prefetch it or nothing. */
 		start = start2 = end2 = 0;
 		/* end is 1 only for an L0 request that starts in the block. */
 		end = start + (level == 0 && offset < dn->dn_datablksz);
 	}
 
 	for (uint64_t i = start; i < end; i++)
 		dbuf_prefetch(dn, level, i, pri, 0);
 	for (uint64_t i = start2; i < end2; i++)
 		dbuf_prefetch(dn, level2, i, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Completion tracking shared between dmu_prefetch_wait_by_dnode() and its
  * dmu_prefetch_done() callback.
  */
 typedef struct {
 	kmutex_t	dpa_lock;	/* held while counting down/waiting */
 	kcondvar_t	dpa_cv;		/* broadcast when count reaches 0 */
 	uint64_t	dpa_pending_io;	/* prefetch callbacks not yet run */
 } dmu_prefetch_arg_t;
 
 /*
  * Prefetch completion callback used by dmu_prefetch_wait_by_dnode().
  * Counts one outstanding prefetch as finished and wakes the waiter once
  * none remain.  "arg" is the dmu_prefetch_arg_t being waited on; only
  * level 0 prefetches are expected.
  */
 static void
 dmu_prefetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t issued)
 {
 	dmu_prefetch_arg_t *dpa = arg;
 
 	(void) level;
 	(void) blkid;
 	(void) issued;
 
 	ASSERT0(level);
 
 	mutex_enter(&dpa->dpa_lock);
 	ASSERT3U(dpa->dpa_pending_io, >, 0);
 	dpa->dpa_pending_io--;
 	if (dpa->dpa_pending_io == 0)
 		cv_broadcast(&dpa->dpa_cv);
 	mutex_exit(&dpa->dpa_lock);
 }
 
 /*
  * Prefetch every L0 block overlapping [offset, offset + len) of "dn" at
  * ZIO_PRIORITY_ASYNC_READ and block until dmu_prefetch_done() has been
  * called for each of them.
  *
  * NOTE(review): the wait relies on the callback running exactly once per
  * block even though the dbuf_prefetch_impl() return value is ignored --
  * confirm against dbuf_prefetch_impl().
  */
 static void
 dmu_prefetch_wait_by_dnode(dnode_t *dn, uint64_t offset, uint64_t len)
 {
 	dmu_prefetch_arg_t dpa;
 	uint64_t first, limit;
 
 	mutex_init(&dpa.dpa_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dpa.dpa_cv, NULL, CV_DEFAULT, NULL);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/* Block range [first, limit) and the number of callbacks to expect. */
 	first = dbuf_whichblock(dn, 0, offset);
 	limit = dbuf_whichblock(dn, 0, offset + len - 1) + 1;
 	dpa.dpa_pending_io = limit - first;
 
 	for (uint64_t blkid = first; blkid < limit; blkid++) {
 		(void) dbuf_prefetch_impl(dn, 0, blkid,
 		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_prefetch_done, &dpa);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* Sleep until the last callback has counted itself down. */
 	mutex_enter(&dpa.dpa_lock);
 	while (dpa.dpa_pending_io != 0)
 		cv_wait(&dpa.dpa_cv, &dpa.dpa_lock);
 	mutex_exit(&dpa.dpa_lock);
 
 	mutex_destroy(&dpa.dpa_lock);
 	cv_destroy(&dpa.dpa_cv);
 }
 
 /*
  * Prefetch the L0 blocks covering [offset, offset + size) of an object and
  * wait for the I/O to complete.  dmu_prefetch_max is not enforced here; the
  * whole range is prefetched.  The blocks are read from disk into the ARC but
  * no decompression occurs (i.e., the dbuf cache is not required).
  *
  * The range is processed in chunks and issig() is checked after each one.
  * Returns the dnode_hold() error if the object cannot be held, EINTR if a
  * signal was noticed, and 0 otherwise.
  */
 int
 dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size)
 {
 	dnode_t *dn;
 	uint64_t chunksize;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Chunk the requests (16 indirects worth) so that we can be interrupted
 	 */
 	if (dn->dn_indblkshift != 0) {
 		uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
 		chunksize = (nbps * 16) << dn->dn_datablkshift;
 	} else {
 		chunksize = dn->dn_datablksz;
 	}
 
 	while (err == 0 && size > 0) {
 		uint64_t mylen = MIN(size, chunksize);
 
 		dmu_prefetch_wait_by_dnode(dn, offset, mylen);
 
 		offset += mylen;
 		size -= mylen;
 
 		/* A pending signal ends the loop after the current chunk. */
 		if (issig())
 			err = SET_ERROR(EINTR);
 	}
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Prefetch the meta-dnode block that contains the dnode of "object".
  * Object 0 and object numbers at or beyond DN_MAX_OBJECT are ignored.
  */
 void
 dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
 {
 	dnode_t *mdn;
 	uint64_t blkid;
 
 	if (object == 0 || object >= DN_MAX_OBJECT)
 		return;
 
 	mdn = DMU_META_DNODE(os);
 	rw_enter(&mdn->dn_struct_rwlock, RW_READER);
 	/* The dnode sits at byte offset object * sizeof (dnode_phys_t). */
 	blkid = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 	dbuf_prefetch(mdn, 0, blkid, pri, 0);
 	rw_exit(&mdn->dn_struct_rwlock);
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crash in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
  * offset that should be freed and l1blks is set to the number of level 1
  * indirect blocks found within the chunk.
  *
  * "minimum" is the lowest offset the chunk may start at; *start is never
  * returned below it.  Returns 0 on success, or the dnode_next_offset() error
  * (other than ESRCH, which simply ends the search); in the error case
  * *l1blks holds the number of L1 blocks counted so far.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 {
 	uint64_t blks;
 	/* upper bound on the number of L1 blocks in one chunk */
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
 	    EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT3U(minimum, <=, *start);
 
 	/* dn_nlevels == 1 means we don't have any L1 blocks */
 	if (dn->dn_nlevels <= 1) {
 		*l1blks = 0;
 		*start = minimum;
 		return (0);
 	}
 
 	/*
 	 * Check if we can free the entire range assuming that all of the
 	 * L1 blocks in this range have data. If we can, we use this
 	 * worst case value as an estimate so we can avoid having to look
 	 * at the object's actual data.
 	 */
 	uint64_t total_l1blks =
 	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
 	    iblkrange;
 	if (total_l1blks <= maxblks) {
 		*l1blks = total_l1blks;
 		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
 	/* Walk backwards one allocated L1 at a time, up to maxblks of them. */
 	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
 		 * indirect block at or before the input offset.  We must
 		 * decrement *start so that it is at the end of the region
 		 * to search.
 		 */
 		(*start)--;
 
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
 			*start = minimum;
 			break;
 		} else if (err != 0) {
 			*l1blks = blks;
 			return (err);
 		}
 
 		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN_TYPED(*start, iblkrange, uint64_t);
 	}
 	/* Aligning down may have gone below the caller's lower bound. */
 	if (*start < minimum)
 		*start = minimum;
 	*l1blks = blks;
 
 	return (0);
 }
 
 /*
  * Report whether the filesystem backing this objset is being unmounted.
  * In the kernel, an objset of type DMU_OST_ZFS yields the vfs's unmounted
  * flag; any other objset type, and every objset outside the kernel, yields
  * B_FALSE.  Used below in dmu_free_long_range_impl() to enable abort when
  * unmounting.
  */
 static boolean_t
 dmu_objset_zfs_unmounting(objset_t *os)
 {
 #ifdef _KERNEL
 	if (dmu_objset_type(os) != DMU_OST_ZFS)
 		return (B_FALSE);
 	return (zfs_get_vfs_flag_unmounted(os));
 #else
 	(void) os;
 	return (B_FALSE);
 #endif
 }
 
 /*
  * Free [offset, offset + length) of dnode "dn", one chunk and one
  * transaction at a time, working from the end of the range towards "offset".
  * A length of DMU_OBJECT_END, or a range reaching past the object's size,
  * is clamped to the end of the object.
  *
  * Returns 0 when the range has been freed (or starts at/after the end of
  * the object), EINVAL for a NULL dnode, EINTR when the filesystem is being
  * unmounted, or the error from get_next_chunk() / dmu_tx_assign().
  */
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length)
 {
 	uint64_t object_size;
 	int err;
 	uint64_t dirty_frees_threshold;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (dn == NULL)
 		return (SET_ERROR(EINVAL));
 
 	object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	if (offset >= object_size)
 		return (0);
 
 	/*
 	 * Per-txg limit on dirty free data: a percentage of
 	 * zfs_dirty_data_max, or 1/20th of it when the tunable is above 100.
 	 */
 	if (zfs_per_txg_dirty_frees_percent <= 100)
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
 		dirty_frees_threshold = zfs_dirty_data_max / 20;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	/* Each pass frees the chunk at the tail of the remaining range. */
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
 		uint64_t l1blks;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
 			return (SET_ERROR(EINTR));
 
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
 		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);
 
 		chunk_len = chunk_end - chunk_begin;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
 		/*
 		 * Mark this transaction as typically resulting in a net
 		 * reduction in space used.
 		 */
 		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		uint64_t txg = dmu_tx_get_txg(tx);
 
 		/* Snapshot how much free-dirty data this txg already has. */
 		mutex_enter(&dp->dp_lock);
 		uint64_t long_free_dirty =
 		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
 		mutex_exit(&dp->dp_lock);
 
 		/*
 		 * To avoid filling up a TXG with just frees, wait for
 		 * the next TXG to open before freeing more chunks if
 		 * we have reached the threshold of frees.
 		 */
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty >= dirty_frees_threshold) {
 			DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
 			/* Nothing was freed; "length" is unchanged on retry. */
 			dmu_tx_commit(tx);
 			txg_wait_open(dp, 0, B_TRUE);
 			continue;
 		}
 
 		/*
 		 * In order to prevent unnecessary write throttling, for each
 		 * TXG, we track the cumulative size of L1 blocks being dirtied
 		 * in dnode_free_range() below. We compare this number to a
 		 * tunable threshold, past which we prevent new L1 dirty freeing
 		 * blocks from being added into the open TXG. See
 		 * dmu_free_long_range_impl() for details. The threshold
 		 * prevents write throttle activation due to dirty freeing L1
 		 * blocks taking up a large percentage of zfs_dirty_data_max.
 		 */
 		mutex_enter(&dp->dp_lock);
 		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
 		    l1blks << dn->dn_indblkshift;
 		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
 		    uint64_t, long_free_dirty, uint64_t, chunk_len,
 		    uint64_t, txg);
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 
 		dmu_tx_commit(tx);
 
 		length -= chunk_len;
 	}
 	return (0);
 }
 
 /*
  * Free [offset, offset + length) of <os, object> through
  * dmu_free_long_range_impl(), holding the dnode for the duration.  Returns
  * the dnode_hold() error or the result of the free.
  */
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error != 0)
 		return (error);
 
 	error = dmu_free_long_range_impl(os, dn, offset, length);
 
 	/*
 	 * It is important to zero out the maxblkid when freeing the entire
 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 	 * will take the fast path, and (b) dnode_reallocate() can verify
 	 * that the entire file has been freed.
 	 */
 	boolean_t whole_object = (offset == 0 && length == DMU_OBJECT_END);
 	if (error == 0 && whole_object)
 		dn->dn_maxblkid = 0;
 
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * Free all of an object's data with dmu_free_long_range() and then free the
  * object itself in a transaction of its own.  Returns the first error
  * encountered, or the result of dmu_object_free().
  */
 int
 dmu_free_long_object(objset_t *os, uint64_t object)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (err != 0)
 		return (err);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	dmu_tx_mark_netfree(tx);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		/* The transaction was never assigned; abandon it. */
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	err = dmu_object_free(os, object, tx);
 	dmu_tx_commit(tx);
 
 	return (err);
 }
 
 /*
  * Free [offset, offset + size) of <os, object> inside the caller's
  * transaction "tx".  "size" may be DMU_OBJECT_END.  Returns the
  * dnode_hold() error if the object cannot be held, otherwise 0.
  */
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	ASSERT(offset < UINT64_MAX);
 	ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
 
 	dnode_free_range(dn, offset, size, tx);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Copy "size" bytes starting at "offset" of dnode "dn" into "buf".
  *
  * When the object has a single block (dn_maxblkid == 0), the part of the
  * request beyond dn_datablksz is zero-filled and only the rest is read.
  * A request with DMU_DIRECTIO whose buffer, offset and size are page
  * aligned is handed to dmu_read_abd(); everything else is copied out of
  * held dbufs in pieces of at most DMU_MAX_ACCESS / 2 bytes.
  *
  * Returns 0 on success or the first error from dmu_read_abd() /
  * dmu_buf_hold_array_by_dnode().
  */
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 
 	/*
 	 * Deal with odd block sizes, where there can't be data past the first
 	 * block. If we ever do the tail block optimization, we will need to
 	 * handle that here as well.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		uint64_t newsz;
 
 		if (offset > dn->dn_datablksz)
 			newsz = 0;
 		else
 			newsz = MIN(size, dn->dn_datablksz - offset);
 		memset((char *)buf + newsz, 0, size - newsz);
 		size = newsz;
 	}
 
 	if (size == 0)
 		return (0);
 
 	/* Allow Direct I/O when requested and properly aligned */
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
 	    zfs_dio_aligned(offset, size, PAGESIZE)) {
 		abd_t *data = abd_get_from_buf(buf, size);
 
 		err = dmu_read_abd(dn, offset, size, data, flags);
 		abd_free(data);
 		return (err);
 	}
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 
 		/*
 		 * NB: we could do this block-at-a-time, but it's nice
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err != 0)
 			break;
 
 		for (int i = 0; i < numbufs; i++) {
 			dmu_buf_t *db = dbp[i];
 
 			ASSERT(size > 0);
 
 			/* Where in this buffer the copy starts, and its length. */
 			int64_t bufoff = offset - db->db_offset;
 			uint64_t tocpy = MIN(db->db_size - bufoff, size);
 
 			ASSERT(db->db_data != NULL);
 			(void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
 
 			buf = (char *)buf + tocpy;
 			offset += tocpy;
 			size -= tocpy;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Read "size" bytes at "offset" of <os, object> into "buf".  The dnode is
  * held only for the duration of the call.  Returns the dnode_hold() error
  * or the result of dmu_read_impl().
  */
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dmu_read_impl(dn, offset, size, buf, flags);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Read "size" bytes at "offset" of an already-held dnode into "buf".
  * Thin public wrapper around dmu_read_impl(); arguments and return value
  * are passed through unchanged.
  */
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags)
 {
 	return (dmu_read_impl(dn, offset, size, buf, flags));
 }
 
 /*
  * Copy "size" bytes from "buf" into the held buffers dbp[0..numbufs),
  * starting at object offset "offset".  A buffer that is overwritten
  * completely goes through dmu_buf_will_fill() / dmu_buf_fill_done(); a
  * partially written one is dirtied with dmu_buf_will_dirty().
  */
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	const char *src = buf;
 
 	for (int i = 0; i < numbufs; i++) {
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		int64_t bufoff = offset - db->db_offset;
 		uint64_t tocpy = MIN(db->db_size - bufoff, size);
 
 		/* Only the first and last buffers may be partially written. */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy != db->db_size)
 			dmu_buf_will_dirty(db, tx);
 		else
 			dmu_buf_will_fill(db, tx, B_FALSE);
 
 		ASSERT(db->db_data != NULL);
 		(void) memcpy((char *)db->db_data + bufoff, src, tocpy);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx, B_FALSE);
 
 		src += tocpy;
 		offset += tocpy;
 		size -= tocpy;
 	}
 }
 
 /*
  * Write "size" bytes from "buf" to <os, object> at "offset" under "tx".
  * A zero-length write does nothing.  Failure to hold the buffers is fatal
  * (VERIFY0).
  */
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size != 0) {
 		VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 		    FALSE, FTAG, &numbufs, &dbp));
 		dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 }
 
 /*
  * Write "size" bytes from "buf" to an already-held dnode at "offset".
  * With DMU_DIRECTIO set, a page-aligned buffer and a block-aligned range
  * the write is issued through dmu_write_abd(); otherwise the data is copied
  * into held dbufs.  Returns 0, or the dmu_write_abd() error on the Direct
  * I/O path.
  *
  * This interface is not used internally by ZFS but is provided for
  * use by Lustre which is built on the DMU interfaces.
  */
 int
 dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return (0);
 
 	/* Allow Direct I/O when requested and properly aligned */
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
 	    zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
 		abd_t *abd = abd_get_from_buf((void *)buf, size);
 		int err = dmu_write_abd(dn, offset, size, abd, DMU_DIRECTIO,
 		    tx);
 
 		abd_free(abd);
 		return (err);
 	}
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (0);
 }
 
 /*
  * Write "size" bytes from "buf" to an already-held dnode at "offset" under
  * "tx".  Equivalent to dmu_write_by_dnode_flags() with no flags set.
  */
 int
 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0));
 }
 
 /*
  * Hold every buffer overlapping [offset, offset + size) of <os, object>
  * and call dmu_buf_will_not_fill() on each under "tx".  A zero size does
  * nothing.  Failure to hold the buffers is fatal (VERIFY).
  */
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (int idx = 0; idx < numbufs; idx++)
 		dmu_buf_will_not_fill(dbp[idx], tx);
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 /*
  * Write embedded-block data to the buffer at "offset" of <os, object>.
  * The buffer is held without reading it, handed to
  * dmu_buf_write_embedded() together with the payload description, and
  * released again.  Failure to hold the buffer is fatal (VERIFY0).
  */
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 
 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
 
 	/* Convert the raw byte values to the enum types the dbuf API takes. */
 	bp_embedded_type_t embedded_type = (bp_embedded_type_t)etype;
 	enum zio_compress compress = (enum zio_compress)comp;
 
 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
 	    FTAG, &db));
 
 	dmu_buf_write_embedded(db, data, embedded_type, compress,
 	    uncompressed_size, compressed_size, byteorder, tx);
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Call dmu_buf_redact() under "tx" on every buffer overlapping
  * [offset, offset + size) of <os, object>.  Failure to hold the buffers is
  * fatal (VERIFY0).
  */
 void
 dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
 	    &numbufs, &dbp));
 
 	for (int idx = 0; idx < numbufs; idx++) {
 		dmu_buf_redact(dbp[idx], tx);
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 #ifdef _KERNEL
 /*
  * Read "size" bytes from dnode "dn", starting at zfs_uio_offset(uio), into
  * the uio.  A uio flagged UIO_DIRECT is handed to dmu_read_uio_direct();
  * otherwise the data is moved out of held dbufs.  Returns 0 or the first
  * error from holding the buffers or moving the data.
  */
 int
 dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err;
 
 	if (uio->uio_extflg & UIO_DIRECT)
 		return (dmu_read_uio_direct(dn, uio, size));
 
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
 	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (err != 0)
 		return (err);
 
 	/* Stop at the first buffer whose move fails. */
 	for (int i = 0; i < numbufs && err == 0; i++) {
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		int64_t bufoff = zfs_uio_offset(uio) - db->db_offset;
 		uint64_t tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(db->db_data != NULL);
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy,
 		    UIO_READ, uio);
 		if (err == 0)
 			size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer, from the object that owns the
  * held dbuf 'zdb', starting at zfs_uio_offset(uio).
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
  * because we don't have to find the dnode_t for the object.
  *
  * A zero 'size' returns 0 without touching the dnode.
  */
 int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int error = 0;
 
 	if (size != 0) {
 		DB_DNODE_ENTER(db);
 		error = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
 		DB_DNODE_EXIT(db);
 	}
 
 	return (error);
 }
 
 /*
  * Read 'size' bytes into the uio buffer, from the specified object,
  * starting at offset zfs_uio_offset(uio).
  *
  * A zero 'size' returns 0 immediately.  Otherwise the dnode is held for the
  * duration of the read and the result is the dnode_hold() error or the
  * return value of dmu_read_uio_dnode().
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
 {
 	dnode_t *dn;
 	int error;
 
 	if (size == 0)
 		return (0);
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error == 0) {
 		error = dmu_read_uio_dnode(dn, uio, size);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Write "size" bytes from the uio to dnode "dn", starting at
  * zfs_uio_offset(uio), under transaction "tx".
  *
  * For a uio flagged UIO_DIRECT the request is split into passes: block
  * aligned parts go through dmu_write_uio_direct(), the remainder is copied
  * into held dbufs (the ARC path).  Without UIO_DIRECT the whole request is
  * copied into held dbufs in a single pass.
  *
  * Returns 0 on success, or the first error from dmu_write_uio_direct(),
  * dmu_buf_hold_array_by_dnode() or zfs_uio_fault_move().  An error ends the
  * write immediately; no further pass is attempted.
  */
 int
 dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	uint64_t write_size;
 
 top:
 	/* Each pass starts out wanting to write everything that is left. */
 	write_size = size;
 
 	/*
 	 * We only allow Direct I/O writes to happen if we are block
 	 * sized aligned. Otherwise, we pass the write off to the ARC.
 	 */
 	if ((uio->uio_extflg & UIO_DIRECT) &&
 	    (write_size >= dn->dn_datablksz)) {
 		if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
 		    dn->dn_datablksz)) {
 			/* Fully aligned: the rest goes out directly. */
 			return (dmu_write_uio_direct(dn, uio, size, tx));
 		} else if (write_size > dn->dn_datablksz &&
 		    zfs_dio_offset_aligned(zfs_uio_offset(uio),
 		    dn->dn_datablksz)) {
 			/*
 			 * Aligned start, unaligned length: write the whole
 			 * blocks directly and come back for the tail.
 			 */
 			write_size =
 			    dn->dn_datablksz * (write_size / dn->dn_datablksz);
 			err = dmu_write_uio_direct(dn, uio, write_size, tx);
 			if (err == 0) {
 				size -= write_size;
 				goto top;
 			} else {
 				return (err);
 			}
 		} else {
 			/*
 			 * Unaligned start: this pass sends
 			 * P2PHASE(offset, blocksize) bytes through the ARC.
 			 */
 			write_size =
 			    P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
 		}
 	}
 
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
 	if (err)
 		return (err);
 
 	for (int i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(write_size > 0);
 
 		/* Remember the uio position so a reverted fill can rewind. */
 		offset_t off = zfs_uio_offset(uio);
 		bufoff = off - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, write_size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx, B_TRUE);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		ASSERT(db->db_data != NULL);
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff,
 		    tocpy, UIO_WRITE, uio);
 
 		if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
 			/* The fill was reverted.  Undo any uio progress. */
 			zfs_uio_advance(uio, off - zfs_uio_offset(uio));
 		}
 
 		if (err)
 			break;
 
 		write_size -= tocpy;
 		size -= tocpy;
 	}
 
 	IMPLY(err == 0, write_size == 0);
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	/*
 	 * Start another pass only after a successful one.  Retrying after a
 	 * failed copy would discard the error and could spin forever on a
 	 * fault that never clears; the error has to reach the caller.
 	 */
 	if (err == 0 && (uio->uio_extflg & UIO_DIRECT) && size > 0) {
 		goto top;
 	}
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer to the object that owns the held
  * dbuf 'zdb', starting at offset zfs_uio_offset(uio).
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
  * because we don't have to find the dnode_t for the object.
  *
  * A zero 'size' returns 0 without touching the dnode.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	int error = 0;
 
 	if (size != 0) {
 		DB_DNODE_ENTER(db);
 		error = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
 		DB_DNODE_EXIT(db);
 	}
 
 	return (error);
 }
 
 /*
  * Write 'size' bytes from the uio buffer to the specified object, starting
  * at offset zfs_uio_offset(uio).
  *
  * A zero 'size' returns 0 immediately.  Otherwise the dnode is held for the
  * duration of the write and the result is the dnode_hold() error or the
  * return value of dmu_write_uio_dnode().
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	if (size == 0)
 		return (0);
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error == 0) {
 		error = dmu_write_uio_dnode(dn, uio, size, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 #endif /* _KERNEL */
 
 /*
  * Add the logical sizes of the cached blocks among bps[0..nbps) to the
  * caller's totals.  Holes and blocks that arc_cached() reports as not
  * cached are skipped.  A block cached only in L2 is added to *l2sz; every
  * other cached block is added to *l1sz.  A NULL "bps" is a no-op.
  */
 static void
 dmu_cached_bps(spa_t *spa, blkptr_t *bps, uint_t nbps,
     uint64_t *l1sz, uint64_t *l2sz)
 {
 	if (bps == NULL)
 		return;
 
 	for (size_t idx = 0; idx < nbps; idx++) {
 		blkptr_t *bp = &bps[idx];
 		int where;
 
 		if (BP_IS_HOLE(bp))
 			continue;
 
 		where = arc_cached(spa, bp);
 		if (where == 0)
 			continue;
 
 		/* Pick the total this block belongs to, then account it. */
 		boolean_t l2_only =
 		    ((where & (ARC_CACHED_IN_L1 | ARC_CACHED_IN_L2)) ==
 		    ARC_CACHED_IN_L2);
 		uint64_t *total = l2_only ? l2sz : l1sz;
 
 		*total += BP_GET_LSIZE(bp);
 	}
 }
 
 /*
  * Estimate DMU object cached size.
  *
  * *l1sz and *l2sz are zeroed and then receive the summed logical sizes of
  * the object's blocks that dmu_cached_bps() finds cached, determined by
  * reading each level-1 indirect block and asking the ARC about every block
  * pointer in it.
  *
  * This is a best-effort estimate: if the object cannot be held, or it has
  * fewer than two levels, 0 is returned with both sizes left at zero, and
  * L1 blocks that cannot be held or read are skipped.  The only error
  * returned is EINTR, when a signal is noticed while walking the L1 blocks;
  * the sizes then cover the blocks examined so far.
  */
 int
 dmu_object_cached_size(objset_t *os, uint64_t object,
     uint64_t *l1sz, uint64_t *l2sz)
 {
 	dnode_t *dn;
 	dmu_object_info_t doi;
 	int err = 0;
 
 	*l1sz = *l2sz = 0;
 
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return (0);
 
 	if (dn->dn_nlevels < 2) {
 		dnode_rele(dn, FTAG);
 		return (0);
 	}
 
 	dmu_object_info_from_dnode(dn, &doi);
 
 	/*
 	 * dbuf_read doesn't prefetch L1 blocks, so start reading them here,
 	 * dmu_prefetch_max bytes at a time.  dmu_prefetch_max may be 0; the
 	 * loop would then never advance, so the pass is skipped and the L1
 	 * blocks are simply read on demand below.
 	 */
 	if (dmu_prefetch_max != 0) {
 		for (uint64_t off = 0; off < doi.doi_max_offset;
 		    off += dmu_prefetch_max) {
 			dmu_prefetch_by_dnode(dn, 1, off,
 			    dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ);
 		}
 	}
 
 	/*
 	 * Hold all valid L1 blocks, asking ARC the status of each BP
 	 * contained in each such L1 block.
 	 */
 	uint_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
 	uint64_t l1blks = 1 + (dn->dn_maxblkid / nbps);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	for (uint64_t blk = 0; blk < l1blks; blk++) {
 		dmu_buf_impl_t *db = NULL;
 
 		if (issig()) {
 			/*
 			 * On interrupt, get out, and bubble up EINTR
 			 */
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
 		/*
 		 * If we get an i/o error here, the L1 can't be read,
 		 * and nothing under it could be cached, so we just
 		 * continue. Ignoring the error from dbuf_hold_impl
 		 * or from dbuf_read is then a reasonable choice.
 		 */
 		if (dbuf_hold_impl(dn, 1, blk, B_TRUE, B_FALSE, FTAG,
 		    &db) != 0)
 			continue;
 
 		if (dbuf_read(db, NULL, DB_RF_CANFAIL) == 0) {
 			dmu_cached_bps(dmu_objset_spa(os), db->db.db_data,
 			    nbps, l1sz, l2sz);
 		}
 		dbuf_rele(db, FTAG);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Allocate a loaned anonymous arc buffer of "size" bytes from the pool
  * that the dbuf "handle" belongs to.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 	spa_t *spa = db->db_objset->os_spa;
 
 	return (arc_loan_buf(spa, B_FALSE, size));
 }
 
 /*
  * Free a loaned arc buffer: hand the loan back with arc_return_buf() and
  * then destroy the buffer.  "buf" must not be used after this call.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	arc_buf_destroy(buf, FTAG);
 }
 
 /*
  * A "lightweight" write is faster than a regular write (e.g.
  * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the
  * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t.  However, the
  * data can not be read or overwritten until the transaction's txg has been
  * synced.  This makes it appropriate for workloads that are known to be
  * (temporarily) write-only, like "zfs receive".
  *
  * A single block is written, starting at the specified offset in bytes.  If
  * the call is successful, it returns 0 and the provided abd has been
  * consumed (the caller should not free it).  EIO is returned when no dirty
  * record could be created for the block.
  */
 int
 dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
     const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx)
 {
 	uint64_t blkid = dbuf_whichblock(dn, 0, offset);
 	dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, blkid, tx);
 
 	if (dr == NULL)
 		return (SET_ERROR(EIO));
 
 	/* Attach the payload and its write parameters to the dirty record. */
 	dr->dt.dll.dr_abd = abd;
 	dr->dt.dll.dr_props = *zp;
 	dr->dt.dll.dr_flags = flags;
 
 	return (0);
 }
 
 /*
  * When possible directly assign passed loaned arc buffer to a dbuf.
  * If this is not possible copy the contents of passed arc buf via
  * dmu_write() and return the loaned buffer.
  *
  * Returns 0 on success, or EIO if the dbuf at "offset" cannot be held.
  */
 int
 dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	uint64_t object = dn->dn_object;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	dmu_buf_impl_t *db;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 	if (db == NULL)
 		return (SET_ERROR(EIO));
 
 	/*
 	 * We can only assign if the offset is aligned and the arc buf is the
 	 * same size as the dbuf.  Otherwise fall back to copying.
 	 */
 	if (offset != db->db.db_offset || blksz != db->db.db_size) {
 		/* compressed bufs must always be assignable to their dbuf */
 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 
 		dbuf_rele(db, FTAG);
 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		return (0);
 	}
 
 	zfs_racct_write(os->os_spa, blksz, 1, 0);
 	dbuf_assign_arcbuf(db, buf, tx);
 	dbuf_rele(db, FTAG);
 
 	return (0);
 }
 
 /*
  * Variant of dmu_assign_arcbuf_by_dnode() that takes a dbuf handle: the
  * dnode is looked up from the dbuf between DB_DNODE_ENTER()/DB_DNODE_EXIT()
  * and the by-dnode routine's result is returned unchanged.
  */
 int
 dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
 	int error;
 
 	DB_DNODE_ENTER(dbuf);
 	error = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx);
 	DB_DNODE_EXIT(dbuf);
 
 	return (error);
 }
 
 /*
  * "Ready" callback for sync writes.  Does nothing if the zio failed.
  * Otherwise a hole bp gets its logical size set from the dbuf, and a
  * non-embedded bp gets a fill count of 1.  The arc buf argument is unused.
  */
 void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr;
 	blkptr_t *bp;
 
 	if (zio->io_error != 0)
 		return;
 
 	dr = dsa->dsa_dr;
 	bp = zio->io_bp;
 
 	if (BP_IS_HOLE(bp)) {
 		/* Without a dirty record, use the dbuf from the zgd. */
 		dmu_buf_t *db = (dr != NULL) ?
 		    &(dr->dr_dbuf->db) : dsa->dsa_zgd->zgd_db;
 
 		/*
 		 * A block of zeros may compress to a hole, but the
 		 * block size still needs to be known for replay.
 		 */
 		BP_SET_LSIZE(bp, db->db_size);
 	} else if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT(BP_GET_LEVEL(bp) == 0);
 		BP_SET_FILL(bp, 1);
 	}
 }
 
 /*
  * "Ready" callback for late-arrival sync writes: forwards to
  * dmu_sync_ready() with no arc buf and the zio's private data as argument.
  */
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	void *arg = zio->io_private;
 
 	dmu_sync_ready(zio, NULL, arg);
 }
 
 /*
  * Completion callback for the arc_write() issued by dmu_sync().
  *
  * On success the written block is registered with the zgd's lwb (when a
  * zgd is present) and the dirty record is marked DR_OVERRIDDEN with the
  * resulting block pointer; on failure the dirty record goes back to
  * DR_NOT_OVERRIDDEN.  Waiters on db_changed are woken, the caller's done
  * callback (if any) is invoked with the zio's error, and the
  * dmu_sync_arg_t is freed.  The arc buf argument is unused.
  */
 void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	/*
 	 * Record the vdev(s) backing this blkptr so they can be flushed after
 	 * the writes for the lwb have completed.
 	 */
 	if (zgd && zio->io_error == 0) {
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 	}
 
 	/* The override state is only changed while holding db_mtx. */
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		ASSERT0(dr->dt.dl.dr_has_raw_params);
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			/*
 			 * A nopwrite must have left the bp untouched, and it
 			 * must still match the dbuf's current block pointer.
 			 */
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			VERIFY(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
 
 		/*
 		 * Old style holes are filled with all zeros, whereas
 		 * new-style holes maintain their lsize, type, level,
 		 * and birth time (see zio_write_compress). While we
 		 * need to reset the BP_SET_LSIZE() call that happened
 		 * in dmu_sync_ready for old style holes, we do *not*
 		 * want to wipe out the information contained in new
 		 * style holes. Thus, only zero out the block pointer if
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
 		    BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	if (dsa->dsa_done)
 		dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * Completion callback for the zio_write() issued by
  * dmu_sync_late_arrival().
  *
  * On success the block is registered with the zgd's lwb and, unless the
  * resulting bp is a hole, the block is freed in the zio's txg with
  * zio_free().  In all cases the tx is committed, the done callback is
  * invoked with the zio's error, and the abd and dmu_sync_arg_t that were
  * set up by dmu_sync_late_arrival() are released.
  */
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	if (zio->io_error == 0) {
 		/*
 		 * Record the vdev(s) backing this blkptr so they can be
 		 * flushed after the writes for the lwb have completed.
 		 */
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 		if (!BP_IS_HOLE(bp)) {
 			/* bp_orig is only referenced by the ASSERT below. */
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
 			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	/* Unlike dmu_sync_done(), dsa_done is called without a NULL check. */
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	abd_free(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * Write a fresh copy of the zgd's dbuf as its own block, for the cases
  * where dmu_sync() cannot use the dirty record (see its callers).
  *
  * The dbuf is read in, a tx sized to the dbuf is assigned without waiting
  * or throttling, and a zio_write() of the dbuf's data is issued with
  * nopwrite disabled in *zp.  Completion is handled by
  * dmu_sync_late_arrival_done().
  *
  * Returns 0 once the write has been issued, the dbuf_read() error if the
  * read fails, or EIO if the tx could not be assigned.
  */
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 	int error;
 
 	error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
 	    DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 	if (error != 0)
 		return (error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	/*
 	 * This transaction does not produce any dirty data or log blocks, so
 	 * it should not be throttled.  All other cases wait for TXG sync, by
 	 * which time the log block we are writing will be obsolete, so we can
 	 * skip waiting and just return error here instead.
 	 */
 	if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_wait_synced() */
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * In order to prevent the zgd's lwb from being free'd prior to
 	 * dmu_sync_late_arrival_done() being called, we have to ensure
 	 * the lwb's "max txg" takes this tx's txg into account.
 	 */
 	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
 
 	/* Freed by dmu_sync_late_arrival_done(). */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = NULL;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
 	/*
 	 * Since we are currently syncing this txg, it's nontrivial to
 	 * determine what BP to nopwrite against, so we disable nopwrite.
 	 *
 	 * When syncing, the db_blkptr is initially the BP of the previous
 	 * txg.  We can not nopwrite against it because it will be changed
 	 * (this is similar to the non-late-arrival case where the dbuf is
 	 * dirty in a future txg).
 	 *
 	 * Then dbuf_write_ready() sets db_blkptr to the location we will write.
 	 * We can not nopwrite against it because although the BP will not
 	 * (typically) be changed, the data has not yet been persisted to this
 	 * location.
 	 *
 	 * Finally, when dbuf_write_done() is called, it is theoretically
 	 * possible to always nopwrite, because the data that was written in
 	 * this txg is the same data that we are trying to write.  However we
 	 * would need to check that this dbuf is not dirty in any future
 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 	 * don't nopwrite in this case.
 	 */
 	zp->zp_nopwrite = B_FALSE;
 
 	/* The abd created here is freed by dmu_sync_late_arrival_done(). */
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * Arguments:
  *
  *	pio:	parent zio for the write; must not be NULL (asserted).
  *	txg:	txg in which the block was dirtied; must be nonzero (asserted).
  *	done:	callback invoked with (zgd, error) when the write completes.
  *	zgd:	supplies the dbuf (zgd_db) and the bp to fill in (zgd_bp).
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr, *dr_next;
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Compute the write policy up front; both write paths use it. */
 	DB_DNODE_ENTER(db);
 	dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	dr = dbuf_find_dirty_eq(db, txg);
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* Any record after ours in the list must be from an older txg. */
 	dr_next = list_next(&db->db_dirty_records, dr);
 	ASSERT(dr_next == NULL || dr_next->dr_txg < txg);
 
 	if (db->db_blkptr != NULL) {
 		/*
 		 * We need to fill in zgd_bp with the current blkptr so that
 		 * the nopwrite code can check if we're writing the same
 		 * data that's already on disk.  We can only nopwrite if we
 		 * are sure that after making the copy, db_blkptr will not
 		 * change until our i/o completes.  We ensure this by
 		 * holding the db_mtx, and only allowing nopwrite if the
 		 * block is not already dirty (see below).  This is verified
 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 		 * not changed.
 		 */
 		*zgd->zgd_bp = *db->db_blkptr;
 	}
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
 	 * in dmu_sync).
 	 *
 	 * We usually want to perform a nopwrite if X and Z are the
 	 * same.  However, if Y is different (i.e. the BP is going to
 	 * change before this write takes effect), then a nopwrite will
 	 * be incorrect - we would override with X, which could have
 	 * been freed when Y was written.
 	 *
 	 * (Note that this is not a concern when we are nop-writing from
 	 * syncing context, because X and Y must be identical, because
 	 * all previous txgs have been synced.)
 	 *
 	 * Therefore, we disable nopwrite if the current BP could change
 	 * before this TXG.  There are two ways it could change: by
 	 * being dirty (dr_next is non-NULL), or by being freed
 	 * (dnode_block_freed()).  This behavior is verified by
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
 	if (dr_next != NULL) {
 		zp.zp_nopwrite = B_FALSE;
 	} else {
 		DB_DNODE_ENTER(db);
 		if (dnode_block_freed(DB_DNODE(db), db->db_blkid))
 			zp.zp_nopwrite = B_FALSE;
 		DB_DNODE_EXIT(db);
 	}
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EALREADY));
 	}
 
 	/* Claim the dirty record before dropping db_mtx. */
 	ASSERT0(dr->dt.dl.dr_has_raw_params);
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	/* Freed by dmu_sync_done(). */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
 	    dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
 	    dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
 	    dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
 	    &zb));
 
 	return (0);
 }
 
 /*
  * Hold the object's dnode, apply dnode_set_nlevels() to it in tx, and
  * release the hold.  Returns the dnode_hold() error if the hold fails,
  * otherwise the result of dnode_set_nlevels().
  */
 int
 dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
 {
 	dnode_t *dnode;
 	int error = dnode_hold(os, object, FTAG, &dnode);
 
 	if (error != 0)
 		return (error);
 
 	error = dnode_set_nlevels(dnode, nlevels, tx);
 	dnode_rele(dnode, FTAG);
 	return (error);
 }
 
 /*
  * Hold the object's dnode, apply dnode_set_blksz() with the given data
  * block size and indirect block shift in tx, and release the hold.
  * Returns the dnode_hold() error if the hold fails, otherwise the result
  * of dnode_set_blksz().
  */
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
     dmu_tx_t *tx)
 {
 	dnode_t *dnode;
 	int error = dnode_hold(os, object, FTAG, &dnode);
 
 	if (error != 0)
 		return (error);
 
 	error = dnode_set_blksz(dnode, size, ibs, tx);
 	dnode_rele(dnode, FTAG);
 	return (error);
 }
 
 /*
  * Hold the object's dnode and call dnode_new_blkid() for maxblkid in tx
  * while holding dn_struct_rwlock as writer.  Returns the dnode_hold()
  * error if the hold fails, and 0 otherwise.
  */
 int
 dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
     dmu_tx_t *tx)
 {
 	dnode_t *dnode;
 	int error = dnode_hold(os, object, FTAG, &dnode);
 
 	if (error != 0)
 		return (error);
 
 	rw_enter(&dnode->dn_struct_rwlock, RW_WRITER);
 	dnode_new_blkid(dnode, maxblkid, tx, B_FALSE, B_TRUE);
 	rw_exit(&dnode->dn_struct_rwlock);
 
 	dnode_rele(dnode, FTAG);
 	return (0);
 }
 
 /*
  * Set the checksum function of an object's dnode and dirty the dnode in
  * tx.  The dnode hold is VERIFY'd, so a failure to hold the object panics
  * rather than returning an error.
  */
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx)
 {
 	dnode_t *dnode;
 
 	/*
 	 * Each object's checksum function travels in the send stream, so
 	 * make sure the value received is one this system understands.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dnode));
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 
 	dnode->dn_checksum = checksum;
 	dnode_setdirty(dnode, tx);
 	dnode_rele(dnode, FTAG);
 }
 
 /*
  * Set the compression function of an object's dnode and dirty the dnode
  * in tx.  The dnode hold is VERIFY'd, so a failure to hold the object
  * panics rather than returning an error.
  */
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx)
 {
 	dnode_t *dnode;
 
 	/*
 	 * Each object's compression function travels in the send stream, so
 	 * make sure the value received is one this system understands.
 	 */
 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dnode));
 
 	dnode->dn_compress = compress;
 	dnode_setdirty(dnode, tx);
 	dnode_rele(dnode, FTAG);
 }
 
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
  * dmu_write_policy() applies the same level threshold when the
  * dmu_ddt_copies override is in effect for DMU_OT_DDT_ZAP objects.
  */
 static const int zfs_redundant_metadata_most_ditto_level = 2;
 
 /*
  * Fill in *zp with the write properties (compression, checksum, copies,
  * dedup, nopwrite, encryption, ...) for a block at the given level of
  * dnode dn in objset os.
  *
  * wp is a bitmask of WP_* flags; this function examines WP_SPILL,
  * WP_NOFILL, WP_DMU_SYNC and WP_DIRECT_WR.
  *
  * If dn is NULL the object type is taken to be DMU_OT_OBJSET.
  * NOTE(review): the non-metadata path and the WP_SPILL path dereference
  * dn unconditionally, so a NULL dn is presumably only valid for metadata
  * writes without WP_SPILL -- confirm against callers.
  */
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	uint8_t complevel = os->os_complevel;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	boolean_t encrypt = B_FALSE;
 	int copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
 		 * XXX -- we should design a compression algorithm
 		 * that specializes in arrays of bps.
 		 */
 		compress = zio_compress_select(os->os_spa,
 		    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (!(zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_METADATA) ||
 		    (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		/* Decide whether this metadata block gets one extra copy. */
 		switch (os->os_redundant_metadata) {
 		case ZFS_REDUNDANT_METADATA_ALL:
 			copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_MOST:
 			if (level >= zfs_redundant_metadata_most_ditto_level ||
 			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
 				copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_SOME:
 			if (DMU_OT_IS_CRITICAL(type))
 				copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
 		}
 
 		if (dmu_ddt_copies > 0) {
 			/*
 			 * If this tuneable is set, and this is a write for a
 			 * dedup entry store (zap or log), then we treat it
 			 * something like ZFS_REDUNDANT_METADATA_MOST on a
 			 * regular dataset: this many copies, and one more for
 			 * "higher" indirect blocks. This specific exception is
 			 * necessary because dedup objects are stored in the
 			 * MOS, which always has the highest possible copies.
 			 */
 			dmu_object_type_t stype =
 			    dn ? dn->dn_storage_type : DMU_OT_NONE;
 			if (stype == DMU_OT_NONE)
 				stype = type;
 			if (stype == DMU_OT_DDT_ZAP) {
 				copies = dmu_ddt_copies;
 				if (level >=
 				    zfs_redundant_metadata_most_ditto_level)
 					copies++;
 			}
 		}
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
 		    compress);
 		complevel = zio_complevel_select(os->os_spa, compress,
 		    complevel, complevel);
 
 		/* A dedup checksum, when set, takes precedence. */
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checksum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!(zio_checksum_table[checksum].ci_flags &
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
 		 * compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually
 		 * exclusive.
 		 */
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	/*
 	 * All objects in an encrypted objset are protected from modification
 	 * via a MAC. Encrypted objects store their IV and salt in the last DVA
 	 * in the bp, so we cannot use all copies. Encrypted objects are also
 	 * not subject to nopwrite since writing the same data will still
 	 * result in a new ciphertext. Only encrypted blocks can be dedup'd
 	 * to avoid ambiguity in the dedup code since the DDT does not store
 	 * object types.
 	 */
 	if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
 		encrypt = B_TRUE;
 
 		if (DMU_OT_IS_ENCRYPTED(type)) {
 			copies = MIN(copies, SPA_DVAS_PER_BP - 1);
 			nopwrite = B_FALSE;
 		} else {
 			dedup = B_FALSE;
 		}
 
 		if (level <= 0 &&
 		    (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
 			compress = ZIO_COMPRESS_EMPTY;
 		}
 	}
 
 	/* Publish the computed policy; the crypto parameters start zeroed. */
 	zp->zp_compress = compress;
 	zp->zp_complevel = complevel;
 	zp->zp_checksum = checksum;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
 	memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
 	memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
 	zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
 	    os->os_zpl_special_smallblock : 0;
 	zp->zp_storage_type = dn ? dn->dn_storage_type : DMU_OT_NONE;
 
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 }
 
 /*
  * Report where the next data or hole begins in an object.
  *
  * Holes can only be reported accurately once all dirty data has reached
  * disk, and forcing that for every seek on a dirty file performs very
  * poorly.  The compromise: hole information is only provided for a clean
  * dnode.  For a dirty dnode we return EBUSY, i.e. "no holes known", which
  * is always a safe answer.
  */
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
 	int err;
 	int restarted = 0;
 
 	for (;;) {
 		err = dnode_hold(os, object, FTAG, &dn);
 		if (err)
 			return (err);
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (!dnode_is_dirty(dn)) {
 			err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
 			    (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 			break;
 		}
 
 		/*
 		 * The dnode is dirty.  Unless the zfs_dmu_offset_next_sync
 		 * module option asks for hole reporting, give up now.
 		 */
 		if (!zfs_dmu_offset_next_sync) {
 			err = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/*
 		 * Hole reporting was requested, so the dirty dnode has to be
 		 * synced to disk first.  A caller holding an RL_READER
 		 * rangelock over 0-UINT64_MAX needs only one restart; callers
 		 * without the rangelock are tolerated by returning EBUSY
 		 * (no holes reported) after a single restart.
 		 */
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 
 		if (restarted)
 			return (SET_ERROR(EBUSY));
 
 		txg_wait_synced(dmu_objset_pool(os), 0);
 		restarted = 1;
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Copy the block pointers of the dbufs covering [offset, offset + length)
  * of an object into bps[].
  *
  * On entry *nbpsp must be at least the number of dbufs in the range
  * (asserted); on success it is set to the number of entries stored.
  *
  * Return values:
  *
  *	ENXIO:	dmu_buf_hold_array() failed with ESRCH.
  *
  *	EAGAIN:	a block is dirty (and is not a pending clone), or it was
  *		born after the last synced txg.
  *
  *	EINVAL:	a non-hole block is metadata, or it was born after the
  *		pool's freeze txg.
  *
  *	0:	success.  Other dmu_buf_hold_array() errors are returned
  *		unchanged.
  */
 int
 dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     blkptr_t *bps, size_t *nbpsp)
 {
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	blkptr_t *bp;
 	int error, numbufs;
 
 	error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp);
 	if (error != 0) {
 		if (error == ESRCH) {
 			error = SET_ERROR(ENXIO);
 		}
 		return (error);
 	}
 
 	ASSERT3U(numbufs, <=, *nbpsp);
 
 	for (int i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 
 		/* db_mtx is held only while choosing which bp to report. */
 		mutex_enter(&db->db_mtx);
 
 		if (!list_is_empty(&db->db_dirty_records)) {
 			dbuf_dirty_record_t *dr;
 
 			dr = list_head(&db->db_dirty_records);
 			if (dr->dt.dl.dr_brtwrite) {
 				/*
 				 * This is very special case where we clone a
 				 * block and in the same transaction group we
 				 * read its BP (most likely to clone the clone).
 				 */
 				bp = &dr->dt.dl.dr_overridden_by;
 			} else {
 				/*
 				 * The block was modified in the same
 				 * transaction group.
 				 */
 				mutex_exit(&db->db_mtx);
 				error = SET_ERROR(EAGAIN);
 				goto out;
 			}
 		} else {
 			bp = db->db_blkptr;
 		}
 
 		mutex_exit(&db->db_mtx);
 
 		if (bp == NULL) {
 			/*
 			 * The file size was increased, but the block was never
 			 * written, otherwise we would either have the block
 			 * pointer or the dirty record and would not get here.
 			 * It is effectively a hole, so report it as such.
 			 */
 			BP_ZERO(&bps[i]);
 			continue;
 		}
 		/*
 		 * Make sure we clone only data blocks.
 		 */
 		if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * If the block was allocated in transaction group that is not
 		 * yet synced, we could clone it, but we couldn't write this
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
 		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
 
 		bps[i] = *bp;
 	}
 
 	/* *nbpsp is only updated when every block was collected. */
 	*nbpsp = numbufs;
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Clone the block pointers bps[0 .. nbps) onto the dbufs covering
  * [offset, offset + length) of an object, in transaction tx.
  *
  * Each dbuf's dirty record for tx is marked DR_OVERRIDDEN with a copy of
  * the corresponding bp, and non-hole, non-embedded bps are registered with
  * brt_pending_add().  nbps must equal the number of dbufs in the range
  * (asserted), and the dbuf hold itself is VERIFY'd.
  *
  * Returns EXDEV, before any dbuf is modified, if a non-hole bp's logical
  * size differs from its dbuf's size; returns 0 otherwise.
  */
 int
 dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
 {
 	spa_t *spa;
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	struct dirty_leaf *dl;
 	dbuf_dirty_record_t *dr;
 	const blkptr_t *bp;
 	int error = 0, i, numbufs;
 
 	spa = os->os_spa;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp));
 	ASSERT3U(nbps, ==, numbufs);
 
 	/*
 	 * Before we start cloning make sure that the dbufs sizes match new BPs
 	 * sizes. If they don't, that's a no-go, as we are not able to shrink
 	 * dbufs.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
 		ASSERT0(db->db_level);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
 
 		if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) {
 			error = SET_ERROR(EXDEV);
 			goto out;
 		}
 	}
 
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		dmu_buf_will_clone_or_dio(dbuf, tx);
 
 		mutex_enter(&db->db_mtx);
 
 		/* The newest dirty record must be the one for this tx. */
 		dr = list_head(&db->db_dirty_records);
 		VERIFY(dr != NULL);
 		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 		dl = &dr->dt.dl;
 		ASSERT0(dl->dr_has_raw_params);
 		dl->dr_overridden_by = *bp;
 		/*
 		 * Restamp the copied bp with this txg as its logical birth;
 		 * holes with a zero logical birth are left untouched.
 		 */
 		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
 				BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
 				    BP_GET_BIRTH(bp));
 			} else {
 				BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
 				    dr->dr_txg);
 			}
 		}
 		dl->dr_brtwrite = B_TRUE;
 		dl->dr_override_state = DR_OVERRIDDEN;
 
 		mutex_exit(&db->db_mtx);
 
 		/*
 		 * When data is embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 */
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Fill in *doi from the in-core dnode and its dn_phys.  This helper takes
  * no locks itself; dmu_object_info_from_dnode() is the locked wrapper.
  */
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *phys = dn->dn_phys;
 	int i;
 
 	doi->doi_data_block_size = dn->dn_datablksz;
 	/* A zero indirect block shift is reported as size 0. */
 	if (dn->dn_indblkshift)
 		doi->doi_metadata_block_size = 1ULL << dn->dn_indblkshift;
 	else
 		doi->doi_metadata_block_size = 0;
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 	doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 	doi->doi_nblkptr = dn->dn_nblkptr;
 	/* Used bytes rounded to the nearest 512-byte unit. */
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(phys) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 
 	/* The fill count is the sum over the dnode's top-level bps. */
 	doi->doi_fill_count = 0;
 	for (i = 0; i < phys->dn_nblkptr; i++)
 		doi->doi_fill_count += BP_GET_FILL(&phys->dn_blkptr[i]);
 }
 
 /*
  * Locked wrapper around __dmu_object_info_from_dnode(): the dnode is read
  * with dn_struct_rwlock held as reader and dn_mtx held.
  */
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	__dmu_object_info_from_dnode(dn, doi);
 
 	/* Drop the locks in the reverse order of acquisition. */
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Get information on a DMU object.
  * If doi is NULL, just indicates whether the object exists.
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dnode;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dnode);
 	if (error != 0)
 		return (error);
 
 	/* With a NULL doi the successful hold is the whole answer. */
 	if (doi != NULL)
 		dmu_object_info_from_dnode(dnode, doi);
 
 	dnode_rele(dnode, FTAG);
 	return (0);
 }
 
 /*
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	/* The dbuf's dnode is only used between ENTER and EXIT. */
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dmu_object_info_from_dnode(dn, doi);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Faster still when you only care about the size.
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 
 	DB_DNODE_ENTER(dbi);
 	dnode = DB_DNODE(dbi);
 
 	*blksize = dnode->dn_datablksz;
 
 	/*
 	 * Used bytes rounded to the nearest SPA_MINBLOCKSIZE unit, plus the
 	 * number of slots used for the dnode itself.
 	 */
 	*nblk512 = ((DN_USED_BYTES(dnode->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + dnode->dn_num_slots;
 
 	DB_DNODE_EXIT(dbi);
 }
 
 /*
  * Report the size of the dbuf's dnode, computed as its slot count shifted
  * left by DNODE_SHIFT.
  */
 void
 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 
 	DB_DNODE_ENTER(dbi);
 	dnode = DB_DNODE(dbi);
 	*dnsize = dnode->dn_num_slots << DNODE_SHIFT;
 	DB_DNODE_EXIT(dbi);
 }
 
 /*
  * Byteswap, in place, a buffer treated as an array of 64-bit words.
  *
  *	vbuf:	buffer to swap; accessed as uint64_t elements.
  *	size:	buffer length in bytes; must be a multiple of 8 (asserted).
  */
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 
 	ASSERT((size & 7) == 0);
 
 	/*
 	 * The index is a size_t to match count.  A signed int index is
 	 * converted to unsigned for the comparison and would overflow
 	 * before reaching an element count above INT_MAX.
 	 */
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 /*
  * Byteswap, in place, a buffer treated as an array of 32-bit words.
  *
  *	vbuf:	buffer to swap; accessed as uint32_t elements.
  *	size:	buffer length in bytes; must be a multiple of 4 (asserted).
  */
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 
 	ASSERT((size & 3) == 0);
 
 	/*
 	 * The index is a size_t to match count.  A signed int index is
 	 * converted to unsigned for the comparison and would overflow
 	 * before reaching an element count above INT_MAX.
 	 */
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 /*
  * Byteswap, in place, a buffer treated as an array of 16-bit words.
  *
  *	vbuf:	buffer to swap; accessed as uint16_t elements.
  *	size:	buffer length in bytes; must be a multiple of 2 (asserted).
  */
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 
 	ASSERT((size & 1) == 0);
 
 	/*
 	 * The index is a size_t to match count.  A signed int index is
 	 * converted to unsigned for the comparison and would overflow
 	 * before reaching an element count above INT_MAX.
 	 */
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 /*
  * Single bytes have no byte order, so this member of the byteswap family
  * leaves the buffer untouched.  Both arguments are intentionally unused.
  */
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 	(void) vbuf;
 	(void) size;
 }
 
 /*
  * Initialize the DMU and the subsystems it is built on by calling each
  * one's init routine in turn.  dmu_fini() is the matching teardown.
  * NOTE(review): the call order presumably reflects dependencies between
  * the subsystems -- confirm before reordering.
  */
 void
 dmu_init(void)
 {
 	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
 	dmu_tx_init();
 	l2arc_init();
 	arc_init();
 	dbuf_init();
 }
 
 /*
  * Tear down everything set up by dmu_init() by calling each subsystem's
  * fini routine.  The order is not an exact reverse of dmu_init(): the arc
  * is torn down first (see below) and dbuf_fini() runs after zfetch_fini().
  */
 void
 dmu_fini(void)
 {
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 	abd_fini();
 }
 
 /* DMU symbols made available outside this file via EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(dmu_bonus_hold);
 EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
 EXPORT_SYMBOL(dmu_prefetch_by_dnode);
 EXPORT_SYMBOL(dmu_prefetch_dnode);
 EXPORT_SYMBOL(dmu_free_range);
 EXPORT_SYMBOL(dmu_free_long_range);
 EXPORT_SYMBOL(dmu_free_long_object);
 EXPORT_SYMBOL(dmu_read);
 EXPORT_SYMBOL(dmu_read_by_dnode);
 EXPORT_SYMBOL(dmu_read_uio);
 EXPORT_SYMBOL(dmu_read_uio_dbuf);
 EXPORT_SYMBOL(dmu_read_uio_dnode);
 EXPORT_SYMBOL(dmu_write);
 EXPORT_SYMBOL(dmu_write_by_dnode);
 EXPORT_SYMBOL(dmu_write_by_dnode_flags);
 EXPORT_SYMBOL(dmu_write_uio);
 EXPORT_SYMBOL(dmu_write_uio_dbuf);
 EXPORT_SYMBOL(dmu_write_uio_dnode);
 EXPORT_SYMBOL(dmu_prealloc);
 EXPORT_SYMBOL(dmu_object_info);
 EXPORT_SYMBOL(dmu_object_info_from_dnode);
 EXPORT_SYMBOL(dmu_object_info_from_db);
 EXPORT_SYMBOL(dmu_object_size_from_db);
 EXPORT_SYMBOL(dmu_object_dnsize_from_db);
 EXPORT_SYMBOL(dmu_object_set_nlevels);
 EXPORT_SYMBOL(dmu_object_set_blocksize);
 EXPORT_SYMBOL(dmu_object_set_maxblkid);
 EXPORT_SYMBOL(dmu_object_set_checksum);
 EXPORT_SYMBOL(dmu_object_set_compress);
 EXPORT_SYMBOL(dmu_offset_next);
 EXPORT_SYMBOL(dmu_write_policy);
 EXPORT_SYMBOL(dmu_sync);
 EXPORT_SYMBOL(dmu_request_arcbuf);
 EXPORT_SYMBOL(dmu_return_arcbuf);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
 EXPORT_SYMBOL(dmu_buf_hold);
 EXPORT_SYMBOL(dmu_ot);
 
 /*
  * Module parameters; the final argument of each is its description string.
  * NOTE(review): the (prefix, name) pairs appear to name the variables
  * zfs_nopwrite_enabled (read by dmu_write_policy()) and
  * zfs_dmu_offset_next_sync (read by dmu_offset_next()) -- confirm against
  * the ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
 	"Enable NOP writes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
 	"Percentage of dirtied blocks from frees in one TXG");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
 	"Enable forcing txg sync to find holes");
 
-/* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
 	"Limit one prefetch call to this size");
 
-/* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
 	"Override copies= for dedup objects");
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c
index 56986ea43446..344b0e3750e9 100644
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -1,525 +1,523 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  */
 
 #include <sys/dbuf.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dnode.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_dataset.h>
 
 /*
  * Each of the concurrent object allocators will grab
  * 2^dmu_object_alloc_chunk_shift dnode slots at a time.  The default is to
  * grab 128 slots, which is 4 blocks worth.  This was experimentally
  * determined to be the lowest value that eliminates the measurable effect
  * of lock contention from this code path.
  *
  * dmu_object_alloc_impl() clamps the resulting chunk size to at least
  * DNODES_PER_BLOCK and at most one L1 block's worth of dnodes.  The
  * value is exported as a read-write module parameter at the end of this
  * file.
  */
 uint_t dmu_object_alloc_chunk_shift = 7;
 
 /*
  * Common implementation behind the dmu_object_alloc*() entry points:
  * find a currently free object in 'os', initialize it with
  * dnode_allocate() and return its object number.
  *
  * A dnodesize that yields zero slots (dnodesize >> DNODE_SHIFT == 0)
  * selects DNODE_MIN_SLOTS.  If allocated_dnode is non-NULL the dnode is
  * handed back through it, still held with 'tag', and the caller must
  * release that hold; otherwise 'tag' must be NULL and the hold taken
  * here is dropped before returning.
  *
  * The function loops until an allocation succeeds; there is no error
  * return path.
  */
 static uint64_t
 dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
 {
 	uint64_t object;
 	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
 	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
 	dnode_t *dn = NULL;
 	int dn_slots = dnodesize >> DNODE_SHIFT;
 	boolean_t restarted = B_FALSE;
 	uint64_t *cpuobj = NULL;
 	/*
 	 * NOTE(review): the shift is applied to an int constant before the
 	 * clamping below; a tunable value of 31 or more would not be a
 	 * valid shift.  Confirm the parameter is range-limited elsewhere.
 	 */
 	uint_t dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
 	int error;
 
 	/*
 	 * Select an allocation cursor by (unstable) CPU sequence id.  All
 	 * updates to the cursor below use atomic operations.
 	 */
 	cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
 	    os->os_obj_next_percpu_len];
 
 	if (dn_slots == 0) {
 		dn_slots = DNODE_MIN_SLOTS;
 	} else {
 		ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
 		ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
 	}
 
 	/*
 	 * The "chunk" of dnodes that is assigned to a CPU-specific
 	 * allocator needs to be at least one block's worth, to avoid
 	 * lock contention on the dbuf.  It can be at most one L1 block's
 	 * worth, so that the "rescan after polishing off a L1's worth"
 	 * logic below will be sure to kick in.
 	 */
 	if (dnodes_per_chunk < DNODES_PER_BLOCK)
 		dnodes_per_chunk = DNODES_PER_BLOCK;
 	if (dnodes_per_chunk > L1_dnode_count)
 		dnodes_per_chunk = L1_dnode_count;
 
 	/*
 	 * The caller requested the dnode be returned as a performance
 	 * optimization in order to avoid releasing the hold only to
 	 * immediately reacquire it.  Since the caller is responsible
 	 * for releasing the hold they must provide the tag.
 	 */
 	if (allocated_dnode != NULL) {
 		ASSERT3P(tag, !=, NULL);
 	} else {
 		ASSERT3P(tag, ==, NULL);
 		tag = FTAG;
 	}
 
 	object = *cpuobj;
 	for (;;) {
 		/*
 		 * If we finished a chunk of dnodes, get a new one from
 		 * the global allocator.
 		 */
 		if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
 		    (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
 		    dn_slots)) {
 			DNODE_STAT_BUMP(dnode_alloc_next_chunk);
 			mutex_enter(&os->os_obj_lock);
 			ASSERT0(P2PHASE(os->os_obj_next_chunk,
 			    dnodes_per_chunk));
 			object = os->os_obj_next_chunk;
 
 			/*
 			 * Each time we polish off a L1 bp worth of dnodes
 			 * (2^12 objects), move to another L1 bp that's
 			 * still reasonably sparse (at most 1/4 full). Look
 			 * from the beginning at most once per txg. If we
 			 * still can't allocate from that L1 block, search
 			 * for an empty L0 block, which will quickly skip
 			 * to the end of the metadnode if no nearby L0
 			 * blocks are empty. This fallback avoids a
 			 * pathology where full dnode blocks containing
 			 * large dnodes appear sparse because they have a
 			 * low blk_fill, leading to many failed allocation
 			 * attempts. In the long term a better mechanism to
 			 * search for sparse metadnode regions, such as
 			 * spacemaps, could be implemented.
 			 *
 			 * os_scan_dnodes is set during txg sync if enough
 			 * objects have been freed since the previous
 			 * rescan to justify backfilling again.
 			 *
 			 * Note that dmu_traverse depends on the behavior
 			 * that we use multiple blocks of the dnode object
 			 * before going back to reuse objects.  Any change
 			 * to this algorithm should preserve that property
 			 * or find another solution to the issues described
 			 * in traverse_visitbp.
 			 */
 			if (P2PHASE(object, L1_dnode_count) == 0) {
 				uint64_t offset;
 				uint64_t blkfill;
 				int minlvl;
 				if (os->os_rescan_dnodes) {
 					offset = 0;
 					os->os_rescan_dnodes = B_FALSE;
 				} else {
 					offset = object << DNODE_SHIFT;
 				}
 				/*
 				 * First pass: look for a sparse L1 (level 2
 				 * search, 1/4 fill).  After a restart: look
 				 * for an empty L0 instead.
 				 */
 				blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
 				minlvl = restarted ? 1 : 2;
 				restarted = B_TRUE;
 				error = dnode_next_offset(DMU_META_DNODE(os),
 				    DNODE_FIND_HOLE, &offset, minlvl,
 				    blkfill, 0);
 				/* On failure keep the chunk we already had. */
 				if (error == 0) {
 					object = offset >> DNODE_SHIFT;
 				}
 			}
 			/*
 			 * Note: if "restarted", we may find a L0 that
 			 * is not suitably aligned.
 			 */
 			os->os_obj_next_chunk =
 			    P2ALIGN_TYPED(object, dnodes_per_chunk, uint64_t) +
 			    dnodes_per_chunk;
 			(void) atomic_swap_64(cpuobj, object);
 			mutex_exit(&os->os_obj_lock);
 		}
 
 		/*
 		 * The value of (*cpuobj) before adding dn_slots is the object
 		 * ID assigned to us.  The value afterwards is the object ID
 		 * assigned to whoever wants to do an allocation next.
 		 */
 		object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
 
 		/*
 		 * XXX We should check for an i/o error here and return
 		 * up to our caller.  Actually we should pre-read it in
 		 * dmu_tx_assign(), but there is currently no mechanism
 		 * to do so.
 		 */
 		error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
 		    dn_slots, tag, &dn);
 		if (error == 0) {
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 			/*
 			 * Another thread could have allocated it; check
 			 * again now that we have the struct lock.
 			 */
 			if (dn->dn_type == DMU_OT_NONE) {
 				dnode_allocate(dn, ot, blocksize,
 				    indirect_blockshift, bonustype,
 				    bonuslen, dn_slots, tx);
 				rw_exit(&dn->dn_struct_rwlock);
 				dmu_tx_add_new_object(tx, dn);
 
 				/*
 				 * Caller requested the allocated dnode be
 				 * returned and is responsible for the hold.
 				 */
 				if (allocated_dnode != NULL)
 					*allocated_dnode = dn;
 				else
 					dnode_rele(dn, tag);
 
 				return (object);
 			}
 			/* Lost the race: drop the lock and hold, then retry. */
 			rw_exit(&dn->dn_struct_rwlock);
 			dnode_rele(dn, tag);
 			DNODE_STAT_BUMP(dnode_alloc_race);
 		}
 
 		/*
 		 * Skip to next known valid starting point on error.  This
 		 * is the start of the next block of dnodes.
 		 */
 		if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
 			object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
 			DNODE_STAT_BUMP(dnode_alloc_next_block);
 		}
 		(void) atomic_swap_64(cpuobj, object);
 	}
 }
 
 /*
  * Allocate a new object of type 'ot' in 'os' and return its object
  * number.  Passes 0 for both the indirect block shift and the dnode size
  * (dmu_object_alloc_impl() maps a zero dnode size to DNODE_MIN_SLOTS),
  * and does not ask for the dnode to be returned held.
  */
 uint64_t
 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
 	    bonuslen, 0, NULL, NULL, tx));
 }
 
 /*
  * Like dmu_object_alloc(), but lets the caller pass an explicit
  * indirect_blockshift through to dmu_object_alloc_impl().  The dnode
  * size argument is 0, which the implementation maps to DNODE_MIN_SLOTS,
  * and the dnode is not returned held.
  */
 uint64_t
 dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
     dmu_tx_t *tx)
 {
 	return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
 	    bonustype, bonuslen, 0, NULL, NULL, tx));
 }
 
 /*
  * Allocate a new object with an explicit dnode size in bytes.  The
  * indirect block shift is passed as 0 and the dnode is not returned held.
  * Returns the new object number.
  */
 uint64_t
 dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
 	uint64_t new_object;
 
 	new_object = dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
 	    bonuslen, dnodesize, NULL, NULL, tx);
 
 	return (new_object);
 }
 
 /*
  * Allocate a new object and hand the freshly allocated dnode back through
  * allocated_dnode.  That dnode is still held with 'tag' on return, so the
  * caller must drop the hold with dnode_rele() when finished.  The return
  * value is the new object number.
  */
 uint64_t
 dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
     int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
 {
 	uint64_t new_object;
 
 	new_object = dmu_object_alloc_impl(os, ot, blocksize,
 	    indirect_blockshift, bonustype, bonuslen, dnodesize,
 	    allocated_dnode, tag, tx);
 
 	return (new_object);
 }
 
 /*
  * Claim the specific object number 'object' using a dnode size argument
  * of 0, which dmu_object_claim_dnsize() maps to DNODE_MIN_SLOTS.
  * Returns 0 on success or the errno from dmu_object_claim_dnsize().
  */
 int
 dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int error;
 
 	error = dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
 	    bonuslen, 0, tx);
 
 	return (error);
 }
 
 /*
  * Claim the specific object number 'object', which must currently be
  * free, and allocate it with the given type, block size, bonus
  * parameters and dnode size.
  *
  * Returns EBADF when asked for the meta dnode object without a private
  * tx, the error from dnode_hold_impl() if the hold fails, and 0
  * otherwise.
  */
 int
 dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dmu_tx_t *tx)
 {
 	int dn_slots = dnodesize >> DNODE_SHIFT;
 	dnode_t *dn;
 	int error;
 
 	/* A size below one slot selects the minimum dnode size. */
 	if (dn_slots == 0)
 		dn_slots = DNODE_MIN_SLOTS;
 	ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
 	ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
 
 	/* The meta dnode may only be claimed from a private tx. */
 	if (object == DMU_META_DNODE_OBJECT) {
 		if (!dmu_tx_private_ok(tx))
 			return (SET_ERROR(EBADF));
 	}
 
 	error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
 	    FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
 	dmu_tx_add_new_object(tx, dn);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Reallocate an existing object with DNODE_MIN_SIZE as the dnode size
  * and without keeping its spill block (keep_spill is B_FALSE).
  * Returns whatever dmu_object_reclaim_dnsize() returns.
  */
 int
 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	int error;
 
 	error = dmu_object_reclaim_dnsize(os, object, ot, blocksize,
 	    bonustype, bonuslen, DNODE_MIN_SIZE, B_FALSE, tx);
 
 	return (error);
 }
 
 /*
  * Reallocate the already allocated object 'object' with a new type,
  * block size, bonus parameters and dnode size via dnode_reallocate().
  * keep_spill is forwarded to dnode_reallocate() unchanged.
  *
  * Returns EBADF for the meta dnode object, the error from
  * dnode_hold_impl() if the hold fails, and 0 otherwise.
  */
 int
 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
     boolean_t keep_spill, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int dn_slots;
 	int error;
 
 	/* The meta dnode can never be reclaimed. */
 	if (object == DMU_META_DNODE_OBJECT)
 		return (SET_ERROR(EBADF));
 
 	/* A size below one slot selects the minimum dnode size. */
 	dn_slots = dnodesize >> DNODE_SHIFT;
 	if (dn_slots == 0)
 		dn_slots = DNODE_MIN_SLOTS;
 
 	error = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
 	    FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
 	    keep_spill, tx);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Remove the spill block of an allocated object, if its on-disk dnode
  * has DNODE_FLAG_SPILL_BLKPTR set.  The removal is done under the dnode's
  * struct rwlock held as writer.
  *
  * Returns the error from dnode_hold_impl() if the hold fails, else 0.
  */
 int
 dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
 	    FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	if ((dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
 		dbuf_rm_spill(dn, tx);
 		dnode_rm_spill(dn, tx);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Free the allocated object 'object': free its entire range and then the
  * dnode itself.  Freeing the meta dnode object is only asserted to be
  * legal when the tx is private.
  *
  * Returns the error from dnode_hold_impl() if the hold fails, else 0.
  */
 int
 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 
 	error = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
 	    FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
 	/*
 	 * The free range has to be created first; without it the indirect
 	 * blocks would be leaked when the dnode is freed in syncing context.
 	 */
 	dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
 	dnode_free(dn, tx);
 
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * Find the next allocated object (or, when 'hole' is set, the next free
  * object) after *objectp and store its number back into *objectp.  Only
  * objects that may have been modified after 'txg' are considered by the
  * dnode_next_offset() search.
  *
  * Returns 0 on success, ESRCH if the object number wraps while scanning
  * a dnode block, or the error from dmu_object_info() or
  * dnode_next_offset().  Note that *objectp is also written when
  * dnode_next_offset() fails.
  */
 int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
 	struct dsl_dataset *ds = os->os_dsl_dataset;
 	uint64_t start_obj;
 	uint64_t offset;
 	int flags;
 	int error;
 
 	if (*objectp == 0) {
 		start_obj = 1;
 	} else if (ds == NULL || !dsl_dataset_feature_is_active(ds,
 	    SPA_FEATURE_LARGE_DNODE)) {
 		start_obj = *objectp + 1;
 	} else {
 		uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
 		dmu_object_info_t doi;
 		uint64_t i;
 
 		/*
 		 * Walk the rest of the current meta dnode block slot by
 		 * slot; each slot's state can be checked cheaply.  Once
 		 * the block is used up without a hit, fall through to
 		 * dnode_next_offset() for the remainder of the search.
 		 */
 		for (i = *objectp + 1; i <= last_obj; ) {
 			/* The object number wrapped around. */
 			if (i == 0)
 				return (SET_ERROR(ESRCH));
 
 			error = dmu_object_info(os, i, &doi);
 			switch (error) {
 			case 0:
 				/* Slot i starts an allocated dnode. */
 				if (!hole) {
 					*objectp = i;
 					return (0);
 				}
 				i += doi.doi_dnodesize >> DNODE_SHIFT;
 				break;
 			case ENOENT:
 				/* Slot i is free. */
 				if (hole) {
 					*objectp = i;
 					return (0);
 				}
 				i++;
 				break;
 			case EEXIST:
 				i++;
 				break;
 			default:
 				return (error);
 			}
 		}
 
 		start_obj = i;
 	}
 
 	offset = start_obj << DNODE_SHIFT;
 	flags = (hole ? DNODE_FIND_HOLE : 0);
 
 	error = dnode_next_offset(DMU_META_DNODE(os), flags, &offset, 0,
 	    DNODES_PER_BLOCK, txg);
 
 	*objectp = offset >> DNODE_SHIFT;
 
 	return (error);
 }
 
 /*
  * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
  * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
  *
  * Only for use from syncing context, on MOS objects.
  *
  * Calling this on an object that is already DMU_OTN_ZAP_METADATA is a
  * no-op: the function returns without touching the feature refcount.
  */
 void
 dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
 	/* Already zapified: nothing to do. */
 	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
 		dnode_rele(dn, FTAG);
 		return;
 	}
 	ASSERT3U(dn->dn_type, ==, old_type);
 	ASSERT0(dn->dn_maxblkid);
 
 	/*
 	 * We must initialize the ZAP data before changing the type,
 	 * so that concurrent calls to *_is_zapified() can determine if
 	 * the object has been completely zapified by checking the type.
 	 */
 	mzap_create_impl(dn, 0, 0, tx);
 
 	/* Update the in-core type and the type recorded for this txg. */
 	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
 	    DMU_OTN_ZAP_METADATA;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 
 	spa_feature_incr(dmu_objset_spa(mos),
 	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
 }
 
 /*
  * Free a MOS object from syncing context.  If the object's type is
  * DMU_OTN_ZAP_METADATA, the SPA_FEATURE_EXTENSIBLE_DATASET refcount is
  * decremented before the object is freed.  Failure to hold or free the
  * object trips a VERIFY.
  */
 void
 dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	boolean_t was_zapified;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/* Sample the type under a short-lived hold. */
 	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
 	was_zapified = (dn->dn_type == DMU_OTN_ZAP_METADATA) ?
 	    B_TRUE : B_FALSE;
 	dnode_rele(dn, FTAG);
 
 	if (was_zapified) {
 		spa_feature_decr(dmu_objset_spa(mos),
 		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
 	}
 
 	VERIFY0(dmu_object_free(mos, object, tx));
 }
 
 EXPORT_SYMBOL(dmu_object_alloc);
 EXPORT_SYMBOL(dmu_object_alloc_ibs);
 EXPORT_SYMBOL(dmu_object_alloc_dnsize);
 EXPORT_SYMBOL(dmu_object_alloc_hold);
 EXPORT_SYMBOL(dmu_object_claim);
 EXPORT_SYMBOL(dmu_object_claim_dnsize);
 EXPORT_SYMBOL(dmu_object_reclaim);
 EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
 EXPORT_SYMBOL(dmu_object_rm_spill);
 EXPORT_SYMBOL(dmu_object_free);
 EXPORT_SYMBOL(dmu_object_next);
 EXPORT_SYMBOL(dmu_object_zapify);
 EXPORT_SYMBOL(dmu_object_free_zapified);
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, UINT, ZMOD_RW,
 	"CPU-specific allocator grabs 2^N objects at once");
-/* END CSTYLED */
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c
index b1cd981cec1d..a33216be6ecf 100644
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1,3846 +1,3845 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2019, 2024, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2022 Axcient.
  */
 
 #include <sys/arc.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_recv.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zvol.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
 #include <sys/bqueue.h>
 #include <sys/objlist.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 #include <sys/zfs_file.h>
 
 static uint_t zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
 static uint_t zfs_recv_queue_ff = 20;
 static uint_t zfs_recv_write_batch_size = 1024 * 1024;
 /*
  * When zero, a healing receive is refused with ENOTSUP unless the
  * stream's toguid matches the guid of the snapshot being healed (see
  * recv_begin_check_existing_impl()).
  */
 static int zfs_recv_best_effort_corrective = 0;
 
 static const void *const dmu_recv_tag = "dmu_recv_tag";
 /*
  * Name of the temporary clone; recv_begin_check_existing_impl() fails
  * with EBUSY if a child with this name already exists.
  */
 const char *const recv_clone_name = "%recv";
 
 /*
  * Tri-state stored in receive_writer_arg.or_need_sync, which keeps track
  * of DRR_FREEOBJECTS records seen right after a DRR_OBJECT_RANGE.
  */
 typedef enum {
 	ORNS_NO,
 	ORNS_YES,
 	ORNS_MAYBE
 } or_need_sync_t;
 
 static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
     void *buf);
 
 /*
  * One replay record from the receive stream together with its payload.
  * The embedded bqueue_node_t presumably links the record into a bqueue
  * between the reader and the writer -- TODO confirm against the users.
  */
 struct receive_record_arg {
 	dmu_replay_record_t header;
 	void *payload; /* Pointer to a buffer containing the payload */
 	/*
 	 * If the record is a WRITE or SPILL, pointer to the abd containing the
 	 * payload.
 	 */
 	abd_t *abd;
 	int payload_size;
 	uint64_t bytes_read; /* bytes read from stream when record created */
 	boolean_t eos_marker; /* Marks the end of the stream */
 	bqueue_node_t node;
 };
 
 /*
  * State for the writer side of a receive: the target objset, the record
  * queue, completion signalling, stream properties and the parameters of
  * the most recent DRR_OBJECT_RANGE record.
  * NOTE(review): which thread owns each field is not visible here --
  * confirm against the functions that use this structure.
  */
 struct receive_writer_arg {
 	objset_t *os;
 	boolean_t byteswap;
 	bqueue_t q;
 
 	/*
 	 * These three members are used to signal to the main thread when
 	 * we're done.
 	 */
 	kmutex_t mutex;
 	kcondvar_t cv;
 	boolean_t done;
 
 	int err;
 	const char *tofs;
 	boolean_t heal;
 	boolean_t resumable;
 	boolean_t raw;   /* DMU_BACKUP_FEATURE_RAW set */
 	boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
 	boolean_t full;  /* this is a full send stream */
 	uint64_t last_object;
 	uint64_t last_offset;
 	uint64_t max_object; /* highest object ID referenced in stream */
 	uint64_t bytes_read; /* bytes read when current record created */
 
 	list_t write_batch;
 
 	/* Encryption parameters for the last received DRR_OBJECT_RANGE */
 	boolean_t or_crypt_params_present;
 	uint64_t or_firstobj;
 	uint64_t or_numslots;
 	uint8_t or_salt[ZIO_DATA_SALT_LEN];
 	uint8_t or_iv[ZIO_DATA_IV_LEN];
 	uint8_t or_mac[ZIO_DATA_MAC_LEN];
 	boolean_t or_byteorder;
 	zio_t *heal_pio;
 
 	/* Keep track of DRR_FREEOBJECTS right after DRR_OBJECT_RANGE */
 	or_need_sync_t or_need_sync;
 };
 
 /*
  * Arguments handed to the receive-begin checks in this file.
  */
 typedef struct dmu_recv_begin_arg {
 	const char *drba_origin;
 	/* Receive cookie: begin record, target snapshot name, heal/force */
 	dmu_recv_cookie_t *drba_cookie;
 	/* Credentials and process passed to dsl_fs_ss_limit_check() */
 	cred_t *drba_cred;
 	proc_t *drba_proc;
 	/* Crypto parameters passed to dmu_objset_create_crypt_check() */
 	dsl_crypto_params_t *drba_dcp;
 } dmu_recv_begin_arg_t;
 
 /*
  * Byte-swap a replay record in place.
  *
  * The common drr_type and drr_payloadlen fields are swapped first, so
  * the switch below dispatches on the already-swapped record type and
  * then swaps the type-specific fields.  Record types without a case
  * only get the common fields (and the checksum, see below) swapped.
  *
  * For every record type other than DRR_BEGIN the generic
  * drr_u.drr_checksum.drr_checksum is swapped as well.
  */
 static void
 byteswap_record(dmu_replay_record_t *drr)
 {
 /* Swap one 64-bit / 32-bit member of the drr_u union in place. */
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
 
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
 		DO64(drr_begin.drr_versioninfo);
 		DO64(drr_begin.drr_creation_time);
 		DO32(drr_begin.drr_type);
 		DO32(drr_begin.drr_flags);
 		DO64(drr_begin.drr_toguid);
 		DO64(drr_begin.drr_fromguid);
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
 		DO32(drr_object.drr_bonuslen);
 		DO32(drr_object.drr_raw_bonuslen);
 		DO64(drr_object.drr_toguid);
 		DO64(drr_object.drr_maxblkid);
 		break;
 	case DRR_FREEOBJECTS:
 		DO64(drr_freeobjects.drr_firstobj);
 		DO64(drr_freeobjects.drr_numobjs);
 		DO64(drr_freeobjects.drr_toguid);
 		break;
 	case DRR_WRITE:
 		DO64(drr_write.drr_object);
 		DO32(drr_write.drr_type);
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_logical_size);
 		DO64(drr_write.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
 		DO64(drr_write.drr_key.ddk_prop);
 		DO64(drr_write.drr_compressed_size);
 		break;
 	case DRR_WRITE_EMBEDDED:
 		DO64(drr_write_embedded.drr_object);
 		DO64(drr_write_embedded.drr_offset);
 		DO64(drr_write_embedded.drr_length);
 		DO64(drr_write_embedded.drr_toguid);
 		DO32(drr_write_embedded.drr_lsize);
 		DO32(drr_write_embedded.drr_psize);
 		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
 		DO64(drr_free.drr_length);
 		DO64(drr_free.drr_toguid);
 		break;
 	case DRR_SPILL:
 		DO64(drr_spill.drr_object);
 		DO64(drr_spill.drr_length);
 		DO64(drr_spill.drr_toguid);
 		DO64(drr_spill.drr_compressed_size);
 		DO32(drr_spill.drr_type);
 		break;
 	case DRR_OBJECT_RANGE:
 		DO64(drr_object_range.drr_firstobj);
 		DO64(drr_object_range.drr_numslots);
 		DO64(drr_object_range.drr_toguid);
 		break;
 	case DRR_REDACT:
 		DO64(drr_redact.drr_object);
 		DO64(drr_redact.drr_offset);
 		DO64(drr_redact.drr_length);
 		DO64(drr_redact.drr_toguid);
 		break;
 	case DRR_END:
 		DO64(drr_end.drr_toguid);
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
 		break;
 	default:
 		break;
 	}
 
 	if (drr->drr_type != DRR_BEGIN) {
 		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
 	}
 
 #undef DO64
 #undef DO32
 }
 
 /*
  * Return B_TRUE if 'guid' appears among the first num_snaps entries of
  * 'snaps', B_FALSE otherwise (including when num_snaps is 0).
  */
 static boolean_t
 redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
 {
 	/* Index with the same unsigned 64-bit type as the element count. */
 	for (uint64_t i = 0; i < num_snaps; i++) {
 		if (snaps[i] == guid)
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Check that the new stream we're trying to receive is redacted with respect to
  * a subset of the snapshots that the origin was redacted with respect to.  For
  * the reasons behind this, see the man page on redacted zfs sends and receives.
  *
  * Returns B_TRUE when every entry of redact_snaps is also present in
  * origin_snaps, B_FALSE otherwise.
  */
 static boolean_t
 compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps,
     uint64_t *redact_snaps, uint64_t num_redact_snaps)
 {
 	/*
 	 * Short circuit the comparison; if we are redacted with respect to
 	 * more snapshots than the origin, we can't be redacted with respect
 	 * to a subset.
 	 */
 	if (num_redact_snaps > origin_num_snaps) {
 		return (B_FALSE);
 	}
 
 	/* Index with the same unsigned 64-bit type as the element count. */
 	for (uint64_t i = 0; i < num_redact_snaps; i++) {
 		if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
 		    redact_snaps[i])) {
 			return (B_FALSE);
 		}
 	}
 	return (B_TRUE);
 }
 
 /*
  * Decide whether the stream described by drba may be received on top of
  * the redacted dataset 'origin'.
  *
  * Returns B_TRUE for a full stream (drr_fromguid == 0) and for an
  * incremental stream whose redaction lists are compatible with the
  * origin's; returns B_FALSE otherwise.  The origin is required to carry
  * the SPA_FEATURE_REDACTED_DATASETS snapshot array (VERIFY).
  */
 static boolean_t
 redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin)
 {
 	uint64_t *origin_snaps;
 	uint64_t origin_num_snaps;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	struct drr_begin *drrb = drc->drc_drrb;
 	/* Same width as the featureflags arguments used elsewhere here. */
 	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 	int err = 0;
 	boolean_t ret = B_TRUE;
 	uint64_t *redact_snaps;
 	uint_t numredactsnaps;
 
 	/*
 	 * If this is a full send stream, we're safe no matter what.
 	 */
 	if (drrb->drr_fromguid == 0)
 		return (ret);
 
 	VERIFY(dsl_dataset_get_uint64_array_feature(origin,
 	    SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps));
 
 	if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 	    BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) ==
 	    0) {
 		/*
 		 * If the send stream was sent from the redaction bookmark or
 		 * the redacted version of the dataset, then we're safe.  Verify
 		 * that this is from a compatible redaction bookmark or
 		 * redacted dataset.
 		 */
 		if (!compatible_redact_snaps(origin_snaps, origin_num_snaps,
 		    redact_snaps, numredactsnaps)) {
 			err = EINVAL;
 		}
 	} else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		/*
 		 * If the stream is redacted, it must be redacted with respect
 		 * to a subset of what the origin is redacted with respect to.
 		 * See case number 2 in the zfs man page section on redacted zfs
 		 * send.
 		 */
 		err = nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps);
 
 		/* A missing list is treated like an incompatible one. */
 		if (err != 0 || !compatible_redact_snaps(origin_snaps,
 		    origin_num_snaps, redact_snaps, numredactsnaps)) {
 			err = EINVAL;
 		}
 	} else if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
 	    drrb->drr_toguid)) {
 		/*
 		 * If the stream isn't redacted but the origin is, this must be
 		 * one of the snapshots the origin is redacted with respect to.
 		 * See case number 1 in the zfs man page section on redacted zfs
 		 * send.
 		 */
 		err = EINVAL;
 	}
 
 	if (err != 0)
 		ret = B_FALSE;
 	return (ret);
 }
 
 /*
  * A dataset that was previously received with --large-block cannot take
  * an incremental stream that lacks --large-block; allowing it would force
  * a read-modify-write or a re-aggregation of a run of WRITE records.
  *
  * Returns ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH when 'ds' has the large
  * blocks feature active but the stream's feature flags do not include
  * DMU_BACKUP_FEATURE_LARGE_BLOCKS, and 0 otherwise.
  */
 static int
 recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags)
 {
 	boolean_t stream_has_large_blocks =
 	    (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) != 0;
 
 	if (!dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS))
 		return (0);
 	if (stream_has_large_blocks)
 		return (0);
 
 	return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH));
 }
 
 /*
  * Validate that the stream described by drba can be received into the
  * existing dataset 'ds'.  Returns 0 if it can, otherwise an errno.
  *
  * Checks common to every case: the temporary clone name must not exist,
  * no resume state may be set, the target snapshot name must not exist
  * (or must exist when healing), a stream whose type is not DMU_OST_ZFS
  * cannot be received into a dataset with children, and the snapshot
  * limit must allow one more snapshot.
  *
  * After that one of three cases applies: a healing receive, an
  * incremental receive (fromguid != 0), or a full receive, which is only
  * allowed when forced.
  */
 static int
 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
     uint64_t fromguid, uint64_t featureflags)
 {
 	uint64_t obj;
 	uint64_t children;
 	int error;
 	dsl_dataset_t *snap;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
 	boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
 	boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
 
 	/* Temporary clone name must not exist. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
 	    8, 1, &obj);
 	if (error != ENOENT)
 		return (error == 0 ? SET_ERROR(EBUSY) : error);
 
 	/* Resume state must not be set. */
 	if (dsl_dataset_has_resume_receive_state(ds))
 		return (SET_ERROR(EBUSY));
 
 	/* New snapshot name must not exist if we're not healing it. */
 	error = zap_lookup(dp->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
 	    drba->drba_cookie->drc_tosnap, 8, 1, &obj);
 	if (drba->drba_cookie->drc_heal) {
 		/* When healing, 'obj' now names the snapshot to heal. */
 		if (error != 0)
 			return (error);
 	} else if (error != ENOENT) {
 		return (error == 0 ? SET_ERROR(EEXIST) : error);
 	}
 
 	/* Must not have children if receiving a ZVOL. */
 	error = zap_count(dp->dp_meta_objset,
 	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
 	if (error != 0)
 		return (error);
 	if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
 	    children > 0)
 		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 
 	/*
 	 * Check snapshot limit before receiving. We'll recheck again at the
 	 * end, but might as well abort before receiving if we're already over
 	 * the limit.
 	 *
 	 * Note that we do not check the file system limit with
 	 * dsl_dir_fscount_check because the temporary %clones don't count
 	 * against that limit.
 	 */
 	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
 	    NULL, drba->drba_cred, drba->drba_proc);
 	if (error != 0)
 		return (error);
 
 	if (drba->drba_cookie->drc_heal) {
 		/* Encryption is incompatible with embedded data. */
 		if (encrypted && embed)
 			return (SET_ERROR(EINVAL));
 
 		/* Healing is not supported when in 'force' mode. */
 		if (drba->drba_cookie->drc_force)
 			return (SET_ERROR(EINVAL));
 
 		/* Must have keys loaded if doing encrypted non-raw recv. */
 		if (encrypted && !raw) {
 			if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object,
 			    NULL, NULL) != 0)
 				return (SET_ERROR(EACCES));
 		}
 
 		/* Hold the snapshot found by the name lookup above. */
 		error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * When not doing best effort corrective recv healing can only
 		 * be done if the send stream is for the same snapshot as the
 		 * one we are trying to heal.
 		 */
 		if (zfs_recv_best_effort_corrective == 0 &&
 		    drba->drba_cookie->drc_drrb->drr_toguid !=
 		    dsl_dataset_phys(snap)->ds_guid) {
 			dsl_dataset_rele(snap, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 		dsl_dataset_rele(snap, FTAG);
 	} else if (fromguid != 0) {
 		/* Sanity check the incremental recv */
 		/*
 		 * NOTE(review): this declaration shadows the function-scope
 		 * 'obj' above; the outer value is not used past this point.
 		 */
 		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 		/* Can't perform a raw receive on top of a non-raw receive */
 		if (!encrypted && raw)
 			return (SET_ERROR(EINVAL));
 
 		/* Encryption is incompatible with embedded data */
 		if (encrypted && embed)
 			return (SET_ERROR(EINVAL));
 
 		/* Find snapshot in this dir that matches fromguid. */
 		while (obj != 0) {
 			error = dsl_dataset_hold_obj(dp, obj, FTAG,
 			    &snap);
 			if (error != 0)
 				return (SET_ERROR(ENODEV));
 			if (snap->ds_dir != ds->ds_dir) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 			/* Found it: leave the loop with 'snap' still held. */
 			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
 				break;
 			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 		}
 		if (obj == 0)
 			return (SET_ERROR(ENODEV));
 
 		/* From here on every exit path must release 'snap'. */
 		if (drba->drba_cookie->drc_force) {
 			drba->drba_cookie->drc_fromsnapobj = obj;
 		} else {
 			/*
 			 * If we are not forcing, there must be no
 			 * changes since fromsnap. Raw sends have an
 			 * additional constraint that requires that
 			 * no "noop" snapshots exist between fromsnap
 			 * and tosnap for the IVset checking code to
 			 * work properly.
 			 */
 			if (dsl_dataset_modified_since_snap(ds, snap) ||
 			    (raw &&
 			    dsl_dataset_phys(ds)->ds_prev_snap_obj !=
 			    snap->ds_object)) {
 				dsl_dataset_rele(snap, FTAG);
 				return (SET_ERROR(ETXTBSY));
 			}
 			drba->drba_cookie->drc_fromsnapobj =
 			    ds->ds_prev->ds_object;
 		}
 
 		if (dsl_dataset_feature_is_active(snap,
 		    SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba,
 		    snap)) {
 			dsl_dataset_rele(snap, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_check_large_blocks(snap, featureflags);
 		if (error != 0) {
 			dsl_dataset_rele(snap, FTAG);
 			return (error);
 		}
 
 		dsl_dataset_rele(snap, FTAG);
 	} else {
 		/* If full and not healing then must be forced. */
 		if (!drba->drba_cookie->drc_force)
 			return (SET_ERROR(EEXIST));
 
 		/*
 		 * We don't support using zfs recv -F to blow away
 		 * encrypted filesystems. This would require the
 		 * dsl dir to point to the old encryption key and
 		 * the new one at the same time during the receive.
 		 */
 		if ((!encrypted && raw) || encrypted)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Perform the same encryption checks we would if
 		 * we were creating a new dataset from scratch.
 		 */
 		if (!raw) {
 			boolean_t will_encrypt;
 
 			error = dmu_objset_create_crypt_check(
 			    ds->ds_dir->dd_parent, drba->drba_dcp,
 			    &will_encrypt);
 			if (error != 0)
 				return (error);
 
 			if (will_encrypt && embed)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Validate the feature flags carried by an incoming send stream against the
  * pool we are receiving into.
  *
  * Returns ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE if the stream uses a flag that
  * DMU_STREAM_SUPPORTED() rejects, ENOTSUP if a flag requires a pool version
  * or pool feature that is not available on `spa`, and 0 otherwise.
  *
  * Note that some of the features we explicitly check here have additional
  * (implicit) features they depend on, but those dependencies are enforced
  * through the zfeature_register() calls declaring the features that we
  * explicitly check.
  */
 static int
 recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
 {
 	/*
 	 * Stream features that can only be received when the matching pool
 	 * feature is enabled.
 	 *
 	 * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks,
 	 * and large_dnodes in the stream require the pool features because
 	 * we don't attempt to decompress / un-embed / un-mooch / split up
 	 * the blocks / dnodes during the receive process.  Redacted streams
 	 * require redacted datasets, and LONGNAME streams require the
 	 * longname feature on the target.
 	 */
 	static const struct {
 		uint64_t	stream_flag;
 		spa_feature_t	pool_feature;
 	} required[] = {
 		{ DMU_BACKUP_FEATURE_LZ4, SPA_FEATURE_LZ4_COMPRESS },
 		{ DMU_BACKUP_FEATURE_ZSTD, SPA_FEATURE_ZSTD_COMPRESS },
 		{ DMU_BACKUP_FEATURE_EMBED_DATA, SPA_FEATURE_EMBEDDED_DATA },
 		{ DMU_BACKUP_FEATURE_LARGE_BLOCKS, SPA_FEATURE_LARGE_BLOCKS },
 		{ DMU_BACKUP_FEATURE_LARGE_DNODE, SPA_FEATURE_LARGE_DNODE },
 		{ DMU_BACKUP_FEATURE_LARGE_MICROZAP,
 		    SPA_FEATURE_LARGE_MICROZAP },
 		{ DMU_BACKUP_FEATURE_REDACTED, SPA_FEATURE_REDACTED_DATASETS },
 		{ DMU_BACKUP_FEATURE_LONGNAME, SPA_FEATURE_LONGNAME },
 	};
 
 	/* Reject streams that use feature flags we do not know about. */
 	if (!DMU_STREAM_SUPPORTED(featureflags)) {
 		return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE));
 	}
 
 	/* The SA_SPILL stream feature needs a pool version that has SA. */
 	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) != 0) {
 		if (spa_version(spa) < SPA_VERSION_SA)
 			return (SET_ERROR(ENOTSUP));
 	}
 
 	/* Walk the table in order; the first unmet requirement fails. */
 	for (size_t idx = 0; idx < sizeof (required) / sizeof (required[0]);
 	    idx++) {
 		if ((featureflags & required[idx].stream_flag) == 0)
 			continue;
 		if (!spa_feature_is_enabled(spa, required[idx].pool_feature))
 			return (SET_ERROR(ENOTSUP));
 	}
 
 	return (0);
 }
 
 /*
  * Check callback of the sync task that dmu_recv_begin() issues for a
  * non-resuming receive.
  *
  * Validates the BEGIN record and its feature flags against the pool, then
  * looks up the target filesystem (drc_tofs):
  *
  *  - if it exists, the stream must not be a clone and the remaining checks
  *    are delegated to recv_begin_check_existing_impl();
  *  - if it does not exist (ENOENT), the stream must be a full send or a
  *    clone, and the parent of tofs is held to run the encryption, limit,
  *    objset-type and origin checks.
  *
  * Returns 0 if the receive may proceed, otherwise an errno / ZFS_ERR_* value.
  * Every dataset hold taken here is dropped before returning.
  */
 static int
 dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
 	uint64_t fromguid = drrb->drr_fromguid;
 	int flags = drrb->drr_flags;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	int error;
 	uint64_t featureflags = drba->drba_cookie->drc_featureflags;
 	dsl_dataset_t *ds;
 	const char *tofs = drba->drba_cookie->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES ||
 	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
 		return (SET_ERROR(EINVAL));
 
 	error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
 	if (error != 0)
 		return (error);
 
 	/* Resumable receives require extensible datasets */
 	if (drba->drba_cookie->drc_resumable &&
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
 		return (SET_ERROR(ENOTSUP));
 
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		/* raw receives require the encryption feature */
 		if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
 			return (SET_ERROR(ENOTSUP));
 
 		/* embedded data is incompatible with encryption and raw recv */
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
 			return (SET_ERROR(EINVAL));
 
 		/* raw receives require spill block allocation flag */
 		if (!(flags & DRR_FLAG_SPILL_BLOCK))
 			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
 	} else {
 		/*
 		 * We support unencrypted datasets below encrypted ones now,
 		 * so add the DS_HOLD_FLAG_DECRYPT flag only if we are dealing
 		 * with a dataset we may encrypt.
 		 */
 		if (drba->drba_dcp == NULL ||
 		    drba->drba_dcp->cp_crypt != ZIO_CRYPT_OFF) {
 			dsflags |= DS_HOLD_FLAG_DECRYPT;
 		}
 	}
 
 	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
 		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		error = recv_begin_check_existing_impl(drba, ds, fromguid,
 		    featureflags);
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
 		char buf[ZFS_MAX_DATASET_NAME_LEN];
 		objset_t *os;
 
 		/* healing recv must be done "into" an existing snapshot */
 		if (drba->drba_cookie->drc_heal == B_TRUE)
 			return (SET_ERROR(ENOTSUP));
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
 		if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
 		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * If we're receiving a full send as a clone, and it doesn't
 		 * contain all the necessary free records and freeobject
 		 * records, reject it.
 		 */
 		if (fromguid == 0 && drba->drba_origin != NULL &&
 		    !(flags & DRR_FLAG_FREERECORDS))
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Open the parent of tofs.  From here on `ds` refers to that
 		 * parent and is held WITHOUT hold flags, so every release of
 		 * it below must be a plain dsl_dataset_rele().
 		 */
 		ASSERT3U(strlen(tofs), <, sizeof (buf));
 		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
 		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
 		    drba->drba_origin == NULL) {
 			boolean_t will_encrypt;
 
 			/*
 			 * Check that we aren't breaking any encryption rules
 			 * and that we have all the parameters we need to
 			 * create an encrypted dataset if necessary. If we are
 			 * making an encrypted dataset the stream can't have
 			 * embedded data.
 			 */
 			error = dmu_objset_create_crypt_check(ds->ds_dir,
 			    drba->drba_dcp, &will_encrypt);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 
 			if (will_encrypt &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
 
 		/*
 		 * Check filesystem and snapshot limits before receiving. We'll
 		 * recheck snapshot limits again at the end (we create the
 		 * filesystems and increment those counts during begin_sync).
 		 */
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 		    drba->drba_cred, drba->drba_proc);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
 		    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
 		    drba->drba_cred, drba->drba_proc);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 
 		/* can't recv below anything but filesystems (eg. no ZVOLs) */
 		error = dmu_objset_from_ds(ds, &os);
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 		if (dmu_objset_type(os) != DMU_OST_ZFS) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
 		}
 
 		if (drba->drba_origin != NULL) {
 			dsl_dataset_t *origin;
 
 			/* The origin IS held with dsflags; release to match. */
 			error = dsl_dataset_hold_flags(dp, drba->drba_origin,
 			    dsflags, FTAG, &origin);
 			if (error != 0) {
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 			if (!origin->ds_is_snapshot) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
 			    fromguid != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(ENODEV));
 			}
 
 			if (origin->ds_dir->dd_crypto_obj != 0 &&
 			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 
 			/*
 			 * If the origin is redacted we need to verify that this
 			 * send stream can safely be received on top of the
 			 * origin.
 			 */
 			if (dsl_dataset_feature_is_active(origin,
 			    SPA_FEATURE_REDACTED_DATASETS)) {
 				if (!redact_check(drba, origin)) {
 					dsl_dataset_rele_flags(origin, dsflags,
 					    FTAG);
 					dsl_dataset_rele(ds, FTAG);
 					return (SET_ERROR(EINVAL));
 				}
 			}
 
 			error = recv_check_large_blocks(ds, featureflags);
 			if (error != 0) {
 				dsl_dataset_rele_flags(origin, dsflags, FTAG);
 				dsl_dataset_rele(ds, FTAG);
 				return (error);
 			}
 
 			dsl_dataset_rele_flags(origin, dsflags, FTAG);
 		}
 
 		dsl_dataset_rele(ds, FTAG);
 		error = 0;
 	}
 	return (error);
 }
 
 /*
  * Sync callback paired with dmu_recv_begin_check() in the sync task that
  * dmu_recv_begin() issues for a non-resuming receive.
  *
  * Chooses the dataset object to receive into:
  *  - tofs exists and we are healing: the snapshot named drc_tosnap;
  *  - tofs exists otherwise: a new dataset named recv_clone_name created in
  *    tofs's dsl_dir (on top of drc_fromsnapobj, if set);
  *  - tofs does not exist: a newly created dataset (optionally a clone of
  *    drba_origin), and drc_newfs is set.
  *
  * The chosen dataset is then owned with dmu_recv_tag, resume bookkeeping is
  * written to the MOS when the receive is resumable, the dataset is flagged
  * DS_FLAG_INCONSISTENT, and the owned dataset and its objset are stored in
  * the receive cookie (drc_ds / drc_os).
  */
 static void
 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	struct drr_begin *drrb = drc->drc_drrb;
 	const char *tofs = drc->drc_tofs;
 	uint64_t featureflags = drc->drc_featureflags;
 	dsl_dataset_t *ds, *newds;
 	objset_t *os;
 	uint64_t dsobj;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	int error;
 	uint64_t crflags = 0;
 	dsl_crypto_params_t dummy_dcp = { 0 };
 	dsl_crypto_params_t *dcp = drba->drba_dcp;
 
 	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
 		crflags |= DS_FLAG_CI_DATASET;
 
 	if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 
 	/*
 	 * Raw, non-incremental recvs always use a dummy dcp with
 	 * the raw cmd set. Raw incremental recvs do not use a dcp
 	 * since the encryption parameters are already set in stone.
 	 */
 	if (dcp == NULL && drrb->drr_fromguid == 0 &&
 	    drba->drba_origin == NULL) {
 		ASSERT3P(dcp, ==, NULL);
 		dcp = &dummy_dcp;
 
 		if (featureflags & DMU_BACKUP_FEATURE_RAW)
 			dcp->cp_cmd = DCP_CMD_RAW_RECV;
 	}
 
 	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 	if (error == 0) {
 		/* Create temporary clone unless we're doing corrective recv */
 		dsl_dataset_t *snap = NULL;
 
 		if (drba->drba_cookie->drc_fromsnapobj != 0) {
 			VERIFY0(dsl_dataset_hold_obj(dp,
 			    drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
 			ASSERT3P(dcp, ==, NULL);
 		}
 		if (drc->drc_heal) {
 			/* When healing we want to use the provided snapshot */
 			VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap,
 			    &dsobj));
 		} else {
 			dsobj = dsl_dataset_create_sync(ds->ds_dir,
 			    recv_clone_name, snap, crflags, drba->drba_cred,
 			    dcp, tx);
 		}
 		if (drba->drba_cookie->drc_fromsnapobj != 0)
 			dsl_dataset_rele(snap, FTAG);
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	} else {
 		/* Any hold failure on tofs is treated as "create a new fs". */
 		dsl_dir_t *dd;
 		const char *tail;
 		dsl_dataset_t *origin = NULL;
 
 		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
 
 		if (drba->drba_origin != NULL) {
 			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
 			    FTAG, &origin));
 			ASSERT3P(dcp, ==, NULL);
 		}
 
 		/* Create new dataset. */
 		dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
 		    origin, crflags, drba->drba_cred, dcp, tx);
 		if (origin != NULL)
 			dsl_dataset_rele(origin, FTAG);
 		dsl_dir_rele(dd, FTAG);
 		drc->drc_newfs = B_TRUE;
 	}
 	/* Take ownership of the chosen dataset; it is stored in drc_ds below. */
 	VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag,
 	    &newds));
 	if (dsl_dataset_feature_is_active(newds,
 	    SPA_FEATURE_REDACTED_DATASETS)) {
 		/*
 		 * If the origin dataset is redacted, the child will be redacted
 		 * when we create it.  We clear the new dataset's
 		 * redaction info; if it should be redacted, we'll fill
 		 * in its information later.
 		 */
 		dsl_dataset_deactivate_feature(newds,
 		    SPA_FEATURE_REDACTED_DATASETS, tx);
 	}
 	VERIFY0(dmu_objset_from_ds(newds, &os));
 
 	/*
 	 * For resumable receives, record in the dataset's ZAP the values a
 	 * later resume is checked against (see the DS_FIELD_RESUME_* lookups
 	 * in dmu_recv_resume_begin_check()).
 	 */
 	if (drc->drc_resumable) {
 		dsl_dataset_zapify(newds, tx);
 		if (drrb->drr_fromguid != 0) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
 			    8, 1, &drrb->drr_fromguid, tx));
 		}
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
 		    8, 1, &drrb->drr_toguid, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
 		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
 		uint64_t one = 1;
 		uint64_t zero = 0;
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
 		    8, 1, &one, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
 		    8, 1, &zero, tx));
 		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
 		    8, 1, &zero, tx));
 		if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
 			    8, 1, &one, tx));
 		}
 		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
 			    8, 1, &one, tx));
 		}
 
 		uint64_t *redact_snaps;
 		uint_t numredactsnaps;
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_FROM_SNAPS, &redact_snaps,
 		    &numredactsnaps) == 0) {
 			VERIFY0(zap_add(mos, dsobj,
 			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS,
 			    sizeof (*redact_snaps), numredactsnaps,
 			    redact_snaps, tx));
 		}
 	}
 
 	/*
 	 * Usually the os->os_encrypted value is tied to the presence of a
 	 * DSL Crypto Key object in the dd. However, that will not be received
 	 * until dmu_recv_stream(), so we set the value manually for now.
 	 */
 	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
 		os->os_encrypted = B_TRUE;
 		drba->drba_cookie->drc_raw = B_TRUE;
 	}
 
 	if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		uint64_t *redact_snaps;
 		uint_t numredactsnaps;
 		VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
 		dsl_dataset_activate_redaction(newds, redact_snaps,
 		    numredactsnaps, tx);
 	}
 
 	if (featureflags & DMU_BACKUP_FEATURE_LARGE_MICROZAP) {
 		/*
 		 * The source has seen a large microzap at least once in its
 		 * life, so we activate the feature here to match. It's not
 		 * strictly necessary since a large microzap is usable without
 		 * the feature active, but if that object is sent on from here,
 		 * we need this info to know to add the stream feature.
 		 *
 		 * There may be no large microzap in the incoming stream, or
 		 * ever again, but this is a very niche feature and it's very
 		 * difficult to spot a large microzap in the stream, so it's
 		 * not worth the effort of trying harder to activate the
 		 * feature at first use.
 		 */
 		dsl_dataset_activate_feature(dsobj, SPA_FEATURE_LARGE_MICROZAP,
 		    (void *)B_TRUE, tx);
 	}
 
 	dmu_buf_will_dirty(newds->ds_dbuf, tx);
 	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
 
 	/*
 	 * Activate longname feature if received
 	 */
 	if (featureflags & DMU_BACKUP_FEATURE_LONGNAME &&
 	    !dsl_dataset_feature_is_active(newds, SPA_FEATURE_LONGNAME)) {
 		dsl_dataset_activate_feature(newds->ds_object,
 		    SPA_FEATURE_LONGNAME, (void *)B_TRUE, tx);
 		newds->ds_feature[SPA_FEATURE_LONGNAME] = (void *)B_TRUE;
 	}
 
 	/*
 	 * If we actually created a non-clone, we need to create the objset
 	 * in our new dataset. If this is a raw send we postpone this until
 	 * dmu_recv_stream() so that we can allocate the metadnode with the
 	 * properties from the DRR_BEGIN payload.
 	 */
 	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
 	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
 	    (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
 	    !drc->drc_heal) {
 		(void) dmu_objset_create_impl(dp->dp_spa,
 		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
 	}
 	rrw_exit(&newds->ds_bp_rwlock, FTAG);
 
 	/* Hand the owned dataset and its objset to the receive cookie. */
 	drba->drba_cookie->drc_ds = newds;
 	drba->drba_cookie->drc_os = os;
 
 	spa_history_log_internal_ds(newds, "receive", tx, " ");
 }
 
 /*
  * Check callback of the sync task that dmu_recv_begin() issues when the
  * stream carries DMU_BACKUP_FEATURE_RESUMING.
  *
  * Locates the partially received dataset ("<tofs>/<recv_clone_name>", or
  * tofs itself if that does not exist) and verifies that it is a valid resume
  * target: it is marked inconsistent, carries resume state whose toguid and
  * fromguid match the stream, is not currently owned, has no snapshots of
  * its own yet, and (for redacted streams) was redacted with respect to the
  * snapshots named in the stream.
  *
  * Returns 0 if the resume may proceed, otherwise an errno / ZFS_ERR_* value.
  * The dataset hold taken here is dropped before returning.  On success
  * drc_fromsnapobj may have been set from the dataset's previous snapshot.
  */
 static int
 dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	struct drr_begin *drrb = drc->drc_drrb;
 	int error;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	dsl_dataset_t *ds;
 	const char *tofs = drc->drc_tofs;
 
 	/* already checked */
 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
 	ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 	    DMU_COMPOUNDSTREAM ||
 	    drrb->drr_type >= DMU_OST_NUMTYPES)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * This is mostly a sanity check since we should have already done these
 	 * checks during a previous attempt to receive the data.
 	 */
 	error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
 	    dp->dp_spa);
 	if (error != 0)
 		return (error);
 
 	/* 6 extra bytes for /%recv */
 	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
 	    tofs, recv_clone_name);
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		/* raw receives require spill block allocation flag */
 		if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
 			return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
 	} else {
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 	}
 
 	boolean_t recvexist = B_TRUE;
 	if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
 		/* %recv does not exist; continue in tofs */
 		recvexist = B_FALSE;
 		error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Resume of full/newfs recv on existing dataset should be done with
 	 * force flag
 	 */
 	if (recvexist && drrb->drr_fromguid == 0 && !drc->drc_force) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(ZFS_ERR_RESUME_EXISTS));
 	}
 
 	/* check that ds is marked inconsistent */
 	if (!DS_IS_INCONSISTENT(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* check that there is resuming data, and that the toguid matches */
 	if (!dsl_dataset_is_zapified(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 	uint64_t val;
 	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
 	if (error != 0 || drrb->drr_toguid != val) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check if the receive is still running.  If so, it will be owned.
 	 * Note that nothing else can own the dataset (e.g. after the receive
 	 * fails) because it will be marked inconsistent.
 	 */
 	if (dsl_dataset_has_owner(ds)) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/* There should not be any snapshots of this fs yet. */
 	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Note: resume point will be checked when we process the first WRITE
 	 * record.
 	 */
 
 	/*
 	 * Check that the origin matches.  A failed lookup leaves val at 0,
 	 * which is then compared against the stream's fromguid.
 	 */
 	val = 0;
 	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
 	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
 	if (drrb->drr_fromguid != val) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (ds->ds_prev != NULL && drrb->drr_fromguid != 0)
 		drc->drc_fromsnapobj = ds->ds_prev->ds_object;
 
 	/*
 	 * If we're resuming, and the send is redacted, then the original send
 	 * must have been redacted, and must have been redacted with respect to
 	 * the same snapshots.
 	 */
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
 		uint64_t num_ds_redact_snaps;
 		uint64_t *ds_redact_snaps;
 
 		uint_t num_stream_redact_snaps;
 		uint64_t *stream_redact_snaps;
 
 		if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
 		    BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
 		    &num_stream_redact_snaps) != 0) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (!dsl_dataset_get_uint64_array_feature(ds,
 		    SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
 		    &ds_redact_snaps)) {
 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		/*
 		 * Every snapshot named by the stream must be present in the
 		 * dataset's redaction list.  The loop is bounded by the
 		 * length of the stream's own array, so that we never index
 		 * stream_redact_snaps[] past its end when the stream names
 		 * fewer snapshots than the dataset does.
 		 */
 		for (uint_t i = 0; i < num_stream_redact_snaps; i++) {
 			if (!redact_snaps_contains(ds_redact_snaps,
 			    num_ds_redact_snaps, stream_redact_snaps[i])) {
 				dsl_dataset_rele_flags(ds, dsflags, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 		}
 	}
 
 	error = recv_check_large_blocks(ds, drc->drc_featureflags);
 	if (error != 0) {
 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
 		return (error);
 	}
 
 	dsl_dataset_rele_flags(ds, dsflags, FTAG);
 	return (0);
 }
 
 /*
  * Sync callback paired with dmu_recv_resume_begin_check(): take ownership
  * (with dmu_recv_tag) of the dataset a resumed receive continues into and
  * record it, together with its objset, in the receive cookie.
  *
  * "<tofs>/<recv_clone_name>" is tried first; if it cannot be owned, tofs
  * itself is owned instead and drc_newfs is set.
  */
 static void
 dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_begin_arg_t *drba = arg;
 	dmu_recv_cookie_t *drc = drba->drba_cookie;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
 	dsl_dataset_t *ds;
 	int error;
 	/* 6 extra bytes for /%recv */
 	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
 
 	(void) snprintf(recvname, sizeof (recvname), "%s/%s", drc->drc_tofs,
 	    recv_clone_name);
 
 	/* Raw streams are flagged on the cookie; others need decryption. */
 	if ((drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0)
 		drc->drc_raw = B_TRUE;
 	else
 		dsflags |= DS_HOLD_FLAG_DECRYPT;
 
 	error = dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag,
 	    &ds);
 	if (error != 0) {
 		/* %recv does not exist; continue in tofs */
 		VERIFY0(dsl_dataset_own_force(dp, drc->drc_tofs, dsflags,
 		    dmu_recv_tag, &ds));
 		drc->drc_newfs = B_TRUE;
 	}
 
 	ASSERT(DS_IS_INCONSISTENT(ds));
 
 	/* Only a raw receive may still have a hole for the root blkptr. */
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
 	    drc->drc_raw);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	drc->drc_ds = ds;
 	VERIFY0(dmu_objset_from_ds(ds, &drc->drc_os));
 	drc->drc_should_save = B_TRUE;
 
 	spa_history_log_internal_ds(ds, "resume receive", tx, " ");
 }
 
 /*
  * Begin receiving a send stream into `tofs`.
  *
  * Initializes the caller-supplied cookie `drc` from the arguments, validates
  * the BEGIN record's magic (byteswapping the record when the magic is
  * byteswapped) while seeding the running fletcher-4 checksum, reads and
  * unpacks the BEGIN payload nvlist if there is one, and then runs the
  * begin sync task: the resume variant when the stream has
  * DMU_BACKUP_FEATURE_RESUMING set, the regular variant otherwise.
  *
  * Returns 0 on success.  Returns EINVAL when the magic is not recognized,
  * E2BIG when the payload length exceeds the limit computed below, or the
  * error from reading/unpacking the payload, creating the crypto params, or
  * the sync task.  When failing after the payload stage, drc_next_rrd and
  * drc_begin_nvl are freed here.
  *
  * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
  * succeeds; otherwise we will leak the holds on the datasets.
  */
 int
 dmu_recv_begin(const char *tofs, const char *tosnap,
     dmu_replay_record_t *drr_begin, boolean_t force, boolean_t heal,
     boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args,
     const char *origin, dmu_recv_cookie_t *drc, zfs_file_t *fp,
     offset_t *voffp)
 {
 	dmu_recv_begin_arg_t drba = { 0 };
 	int err = 0;
 
 	memset(drc, 0, sizeof (dmu_recv_cookie_t));
 	drc->drc_drr_begin = drr_begin;
 	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
 	drc->drc_tosnap = tosnap;
 	drc->drc_tofs = tofs;
 	drc->drc_force = force;
 	drc->drc_heal = heal;
 	drc->drc_resumable = resumable;
 	drc->drc_cred = CRED();
 	drc->drc_proc = curproc;
 	drc->drc_clone = (origin != NULL);
 
 	/*
 	 * The checksum is accumulated over the record as it arrived, so in
 	 * the byteswapped case it is computed before byteswap_record()
 	 * rewrites the record in place.
 	 */
 	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		drc->drc_byteswap = B_TRUE;
 		(void) fletcher_4_incremental_byteswap(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 		byteswap_record(drr_begin);
 	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
 		(void) fletcher_4_incremental_native(drr_begin,
 		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 
 	drc->drc_fp = fp;
 	drc->drc_voff = *voffp;
 	drc->drc_featureflags =
 	    DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
 	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
 
 	/*
 	 * Since OpenZFS 2.0.0, we have enforced a 64MB limit in userspace
 	 * configurable via ZFS_SENDRECV_MAX_NVLIST. We enforce 256MB as a hard
 	 * upper limit. Systems with less than 1GB of RAM will see a lower
 	 * limit from `arc_all_memory() / 4`.
 	 */
 	if (payloadlen > (MIN((1U << 28), arc_all_memory() / 4)))
 		return (E2BIG);
 
 
 	if (payloadlen != 0) {
 		void *payload = vmem_alloc(payloadlen, KM_SLEEP);
 		/*
 		 * For compatibility with recursive send streams, we don't do
 		 * this here if the stream could be part of a package. Instead,
 		 * we'll do it in dmu_recv_stream. If we pull the next header
 		 * too early, and it's the END record, we break the `recv_skip`
 		 * logic.
 		 */
 
 		err = receive_read_payload_and_next_header(drc, payloadlen,
 		    payload);
 		if (err != 0) {
 			vmem_free(payload, payloadlen);
 			return (err);
 		}
 		err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
 		    KM_SLEEP);
 		/* The packed buffer is not needed once unpacking was tried. */
 		vmem_free(payload, payloadlen);
 		if (err != 0) {
 			kmem_free(drc->drc_next_rrd,
 			    sizeof (*drc->drc_next_rrd));
 			return (err);
 		}
 	}
 
 	if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
 		drc->drc_spill = B_TRUE;
 
 	drba.drba_origin = origin;
 	drba.drba_cookie = drc;
 	drba.drba_cred = CRED();
 	drba.drba_proc = curproc;
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = dsl_sync_task(tofs,
 		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
 		    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
 	} else {
 		/*
 		 * For non-raw, non-incremental, non-resuming receives the
 		 * user can specify encryption parameters on the command line
 		 * with "zfs recv -o". For these receives we create a dcp and
 		 * pass it to the sync task. Creating the dcp will implicitly
 		 * remove the encryption params from the localprops nvlist,
 		 * which avoids errors when trying to set these normally
 		 * read-only properties. Any other kind of receive that
 		 * attempts to set these properties will fail as a result.
 		 */
 		if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
 		    DMU_BACKUP_FEATURE_RAW) == 0 &&
 		    origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
 			err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
 			    localprops, hidden_args, &drba.drba_dcp);
 		}
 
 		if (err == 0) {
 			err = dsl_sync_task(tofs,
 			    dmu_recv_begin_check, dmu_recv_begin_sync,
 			    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
 			dsl_crypto_params_free(drba.drba_dcp, !!err);
 		}
 	}
 
 	/* On failure, release what the payload stage allocated. */
 	if (err != 0) {
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 		nvlist_free(drc->drc_begin_nvl);
 	}
 	return (err);
 }
 
 /*
  * Holds the data needed by the corrective recv read callback,
  * corrective_read_done().  Allocated in do_corrective_recv() and freed by
  * the callback.
  */
 typedef struct cr_cb_data {
 	/* set from the write record's drr_logical_size */
 	uint64_t size;
 	/* bookmark of the healed block, passed to spa_remove_error() */
 	zbookmark_phys_t zb;
 	/* pool whose error log the callback updates */
 	spa_t *spa;
 } cr_cb_data_t;
 
 /*
  * Done callback for the verification read issued by do_corrective_recv().
  * If the read succeeded, the block's entry is removed from the pool's error
  * log.  The callback data and the read buffer are freed in every case.
  */
 static void
 corrective_read_done(zio_t *zio)
 {
 	cr_cb_data_t *cb = zio->io_private;
 
 	/* Corruption corrected; update error log if needed */
 	if (zio->io_error == 0) {
 		uint64_t birth = BP_GET_LOGICAL_BIRTH(zio->io_bp);
 
 		spa_remove_error(cb->spa, &cb->zb, birth);
 	}
 
 	kmem_free(cb, sizeof (cr_cb_data_t));
 	abd_free(zio->io_abd);
 }
 
 /*
  * zio_rewrite the data pointed to by bp with the data from the rrd's abd.
  *
  * For non-raw receives the stream data is first transformed to match the
  * on-disk form of `bp`: decompressed if the record is compressed,
  * recompressed with the bp's compression, and re-encrypted with the bp's
  * salt/iv/mac if the bp uses encryption.  The rewrite is only issued when
  * the checksum computed over the resulting buffer equals the checksum
  * already stored in `bp`; afterwards the block is read back to verify the
  * repair.
  *
  * rrd->abd always refers to the current buffer: whenever a transformation
  * replaces (and frees) the previous buffer, rrd->abd is updated right away
  * so that an early error return never leaves it pointing at freed memory.
  *
  * Returns 0 on success, and also whenever zfs_recv_best_effort_corrective is
  * non-zero and the failure was a checksum mismatch or an I/O error.
  * Otherwise returns ECKSUM on checksum mismatch, EACCES when the dataset or
  * its key cannot be obtained, or the error from the failing step.
  */
 static int
 do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
     struct receive_record_arg *rrd, blkptr_t *bp)
 {
 	int err;
 	zio_t *io;
 	zbookmark_phys_t zb;
 	dnode_t *dn;
 	abd_t *abd = rrd->abd;
 	zio_cksum_t bp_cksum = bp->blk_cksum;
 	zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_CANFAIL;
 
 	if (rwa->raw)
 		flags |= ZIO_FLAG_RAW;
 
 	/* The dnode is only needed to translate the offset to a block id. */
 	err = dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), drrw->drr_object, 0,
 	    dbuf_whichblock(dn, 0, drrw->drr_offset));
 	dnode_rele(dn, FTAG);
 
 	if (!rwa->raw && DRR_WRITE_COMPRESSED(drrw)) {
 		/* Decompress the stream data */
 		abd_t *dabd = abd_alloc_linear(
 		    drrw->drr_logical_size, B_FALSE);
 		err = zio_decompress_data(drrw->drr_compressiontype,
 		    abd, dabd, abd_get_size(abd),
 		    abd_get_size(dabd), NULL);
 
 		if (err != 0) {
 			abd_free(dabd);
 			return (err);
 		}
 		/* Swap in the newly decompressed data into the abd */
 		abd_free(abd);
 		abd = dabd;
 		rrd->abd = abd;
 	}
 
 	if (!rwa->raw && BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		/* Recompress the data */
 		abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp),
 		    B_FALSE);
 		uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp),
 		    abd, &cabd, abd_get_size(abd), BP_GET_PSIZE(bp),
 		    rwa->os->os_complevel);
 		/* Zero the tail so the buffer is exactly PSIZE bytes. */
 		abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize);
 		/* Swap in newly compressed data into the abd */
 		abd_free(abd);
 		abd = cabd;
 		rrd->abd = abd;
 		flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 
 	/*
 	 * The stream is not encrypted but the data on-disk is.
 	 * We need to re-encrypt the buf using the same
 	 * encryption type, salt, iv, and mac that was used to encrypt
 	 * the block previously.
 	 */
 	if (!rwa->raw && BP_USES_CRYPT(bp)) {
 		dsl_dataset_t *ds;
 		dsl_crypto_key_t *dck = NULL;
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 		boolean_t no_crypt = B_FALSE;
 		dsl_pool_t *dp = dmu_objset_pool(rwa->os);
 		abd_t *eabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		dsl_pool_config_enter(dp, FTAG);
 		err = dsl_dataset_hold_flags(dp, rwa->tofs,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (err != 0) {
 			dsl_pool_config_exit(dp, FTAG);
 			abd_free(eabd);
 			return (SET_ERROR(EACCES));
 		}
 
 		/* Look up the key from the spa's keystore */
 		err = spa_keystore_lookup_key(rwa->os->os_spa,
 		    zb.zb_objset, FTAG, &dck);
 		if (err != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
 			    FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			abd_free(eabd);
 			return (SET_ERROR(EACCES));
 		}
 
 		err = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv,
 		    mac, abd_get_size(abd), abd, eabd, &no_crypt);
 
 		spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG);
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 
 		ASSERT0(no_crypt);
 		if (err != 0) {
 			abd_free(eabd);
 			return (err);
 		}
 		/* Swap in the newly encrypted data into the abd */
 		abd_free(abd);
 		abd = eabd;
 		rrd->abd = abd;
 
 		/*
 		 * We want to prevent zio_rewrite() from trying to
 		 * encrypt the data again
 		 */
 		flags |= ZIO_FLAG_RAW_ENCRYPT;
 	}
 	rrd->abd = abd;
 
 	io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
 	    abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
 	    &zb);
 
 	ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
 	    abd_get_size(abd) == BP_GET_PSIZE(bp));
 
 	/* compute new bp checksum value and make sure it matches the old one */
 	zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, abd_get_size(abd));
 	if (!ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) {
 		/* The zio was never issued, so it is destroyed, not waited. */
 		zio_destroy(io);
 		if (zfs_recv_best_effort_corrective != 0)
 			return (0);
 		return (SET_ERROR(ECKSUM));
 	}
 
 	/* Correct the corruption in place */
 	err = zio_wait(io);
 	if (err == 0) {
 		/* Freed by corrective_read_done() when the read completes. */
 		cr_cb_data_t *cb_data =
 		    kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP);
 		cb_data->spa = rwa->os->os_spa;
 		cb_data->size = drrw->drr_logical_size;
 		cb_data->zb = zb;
 		/* Test if healing worked by re-reading the bp */
 		err = zio_wait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp,
 		    abd_alloc_for_io(drrw->drr_logical_size, B_FALSE),
 		    drrw->drr_logical_size, corrective_read_done,
 		    cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL));
 	}
 	if (err != 0 && zfs_recv_best_effort_corrective != 0)
 		err = 0;
 
 	return (err);
 }
 
 /*
  * Pull exactly len bytes of the stream from drc->drc_fp into buf, calling
  * zfs_file_read() as many times as it takes.  drc_voff advances by whatever
  * each call delivered; drc_bytes_read is bumped only once the whole request
  * has been satisfied.
  *
  * Returns 0 on success, the error from zfs_file_read(), or
  * ZFS_ERR_STREAM_TRUNCATED when a read succeeds without delivering any data.
  */
 static int
 receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	int nread = 0;
 
 	/*
 	 * Lengths are expected to be multiples of 8 (see the comment in
 	 * dump_bytes), although nothing below actually depends on it.
 	 */
 	ASSERT(len % 8 == 0 ||
 	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
 
 	while (nread < len) {
 		int want = len - nread;
 		ssize_t resid = want;
 		zfs_file_t *fp = drc->drc_fp;
 		int err;
 
 		err = zfs_file_read(fp, (char *)buf + nread, want, &resid);
 		if (err == 0 && resid == want) {
 			/*
 			 * Nothing was read.  Note: ECKSUM or
 			 * ZFS_ERR_STREAM_TRUNCATED indicates that the
 			 * receive was interrupted and can potentially be
 			 * resumed.
 			 */
 			err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
 		}
 
 		/* Account for the bytes that did arrive, even on error. */
 		drc->drc_voff += want - resid;
 		nread = len - resid;
 		if (err != 0)
 			return (err);
 	}
 
 	drc->drc_bytes_read += len;
 
 	ASSERT3U(nread, ==, len);
 	return (0);
 }
 
 /*
  * Work out how many block pointers a dnode with the given bonus type and
  * bonus size is expected to have: one for DMU_OT_SA bonus buffers, otherwise
  * one plus the number of SPA_BLKPTRSHIFT-sized units left unused out of
  * DN_OLD_MAX_BONUSLEN.
  */
 static inline uint8_t
 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 {
 	if (bonus_type == DMU_OT_SA)
 		return (1);
 
 	/* Bonus sizes beyond DN_OLD_MAX_BONUSLEN leave no spare room. */
 	uint64_t used = MIN(DN_OLD_MAX_BONUSLEN, bonus_size);
 
 	return (1 + ((DN_OLD_MAX_BONUSLEN - used) >> SPA_BLKPTRSHIFT));
 }
 
 /*
  * Record (object, offset) and rwa->bytes_read as the resume point in the
  * per-txg slot selected by tx.  Does nothing unless the receive is
  * resumable.
  */
 static void
 save_resume_state(struct receive_writer_arg *rwa,
     uint64_t object, uint64_t offset, dmu_tx_t *tx)
 {
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	if (!rwa->resumable)
 		return;
 
 	uint64_t *res_obj = &rwa->os->os_dsl_dataset->ds_resume_object[slot];
 	uint64_t *res_off = &rwa->os->os_dsl_dataset->ds_resume_offset[slot];
 	uint64_t *res_bytes = &rwa->os->os_dsl_dataset->ds_resume_bytes[slot];
 
 	/*
 	 * A non-zero ds_resume_bytes[] entry is what signals that the
 	 * on-disk state needs updating, so zero must never be stored.
 	 */
 	ASSERT(rwa->bytes_read != 0);
 
 	/*
 	 * Resume points only come from write records, and those always
 	 * carry a valid (non-meta-dnode) object number.
 	 */
 	ASSERT(object != 0);
 
 	/*
 	 * Resuming relies on records arriving sorted by (object, offset).
 	 * The callers check this already; assert it again here for good
 	 * measure.
 	 */
 	ASSERT3U(object, >=, *res_obj);
 	ASSERT(object != *res_obj || offset >= *res_off);
 	ASSERT3U(rwa->bytes_read, >=, *res_bytes);
 
 	*res_obj = object;
 	*res_off = offset;
 	*res_bytes = rwa->bytes_read;
 }
 
 /*
  * Compare the generation reported by dmu_get_file_info() for the bonus
  * buffer currently held by `object` with the one reported for the incoming
  * bonus buffer `new_bonus`.
  *
  * On success returns 0 and stores the comparison result in *samegenp.  Any
  * error from dmu_bonus_hold() or dmu_get_file_info() is returned as-is, in
  * which case *samegenp is not written.
  */
 static int
 receive_object_is_same_generation(objset_t *os, uint64_t object,
     dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type,
     const void *new_bonus, boolean_t *samegenp)
 {
 	zfs_file_info_t info;
 	dmu_buf_t *cur_bonus;
 	uint64_t gen_existing, gen_incoming;
 	int error;
 
 	/* Generation of the object as it exists right now. */
 	error = dmu_bonus_hold(os, object, FTAG, &cur_bonus);
 	if (error != 0)
 		return (error);
 	error = dmu_get_file_info(os, old_bonus_type, cur_bonus->db_data,
 	    &info);
 	/* The hold is dropped whether or not the lookup worked. */
 	dmu_buf_rele(cur_bonus, FTAG);
 	if (error != 0)
 		return (error);
 	gen_existing = info.zfi_generation;
 
 	/* Generation carried by the bonus buffer from the stream. */
 	error = dmu_get_file_info(os, new_bonus_type, new_bonus, &info);
 	if (error != 0)
 		return (error);
 	gen_incoming = info.zfi_generation;
 
 	*samegenp = (gen_existing == gen_incoming);
 	return (0);
 }
 
 /*
  * Prepare an object that already exists in rwa->os to take on the shape
  * described by the object record drro.
  *
  * doi is the dmu_object_info_t of the existing object.  bonus_data is the
  * incoming bonus payload; it is only consulted to compare generations via
  * receive_object_is_same_generation().
  *
  * Depending on how the record differs from doi, the existing contents are
  * freed with dmu_free_long_range(), or the whole object is freed with
  * dmu_free_long_object() followed by txg_wait_synced().
  *
  * On success returns 0 with *object_to_hold set to drro->drr_object, or to
  * DMU_NEW_OBJECT when the object was freed and has to be claimed anew.
  * *new_blksz is overwritten with the existing block size only when that
  * size must be kept; otherwise it is left untouched.  Every failure is
  * reported as EINVAL.
  */
 static int
 receive_handle_existing_object(const struct receive_writer_arg *rwa,
     const struct drr_object *drro, const dmu_object_info_t *doi,
     const void *bonus_data,
     uint64_t *object_to_hold, uint32_t *new_blksz)
 {
 	uint32_t indblksz = drro->drr_indblkshift ?
 	    1ULL << drro->drr_indblkshift : 0;
 	int nblkptr = deduce_nblkptr(drro->drr_bonustype,
 	    drro->drr_bonuslen);
 	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
 	    drro->drr_dn_slots : DNODE_MIN_SLOTS;
 	boolean_t do_free_range = B_FALSE;
 	int err;
 
 	/* Default: keep working on the existing object. */
 	*object_to_hold = drro->drr_object;
 
 	/* nblkptr should be bounded by the bonus size and type */
 	if (rwa->raw && nblkptr != drro->drr_nblkptr)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * After the previous send stream, the sending system may
 	 * have freed this object, and then happened to re-allocate
 	 * this object number in a later txg. In this case, we are
 	 * receiving a different logical file, and the block size may
 	 * appear to be different.  i.e. we may have a different
 	 * block size for this object than what the send stream says.
 	 * In this case we need to remove the object's contents,
 	 * so that its structure can be changed and then its contents
 	 * entirely replaced by subsequent WRITE records.
 	 *
 	 * If this is a -L (--large-block) incremental stream, and
 	 * the previous stream was not -L, the block size may appear
 	 * to increase.  i.e. we may have a smaller block size for
 	 * this object than what the send stream says.  In this case
 	 * we need to keep the object's contents and block size
 	 * intact, so that we don't lose parts of the object's
 	 * contents that are not changed by this incremental send
 	 * stream.
 	 *
 	 * We can distinguish between the two above cases by using
 	 * the ZPL's generation number (see
 	 * receive_object_is_same_generation()).  However, we only
 	 * want to rely on the generation number when absolutely
 	 * necessary, because with raw receives, the generation is
 	 * encrypted.  We also want to minimize dependence on the
 	 * ZPL, so that other types of datasets can also be received
 	 * (e.g. ZVOLs, although note that ZVOLS currently do not
 	 * reallocate their objects or change their structure).
 	 * Therefore, we check a number of different cases where we
 	 * know it is safe to discard the object's contents, before
 	 * using the ZPL's generation number to make the above
 	 * distinction.
 	 */
 	if (drro->drr_blksz != doi->doi_data_block_size) {
 		if (rwa->raw) {
 			/*
 			 * RAW streams always have large blocks, so
 			 * we are sure that the data is not needed
 			 * due to changing --large-block to be on.
 			 * Which is fortunate since the bonus buffer
 			 * (which contains the ZPL generation) is
 			 * encrypted, and the key might not be
 			 * loaded.
 			 */
 			do_free_range = B_TRUE;
 		} else if (rwa->full) {
 			/*
 			 * This is a full send stream, so it always
 			 * replaces what we have.  Even if the
 			 * generation numbers happen to match, this
 			 * can not actually be the same logical file.
 			 * This is relevant when receiving a full
 			 * send as a clone.
 			 */
 			do_free_range = B_TRUE;
 		} else if (drro->drr_type !=
 		    DMU_OT_PLAIN_FILE_CONTENTS ||
 		    doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) {
 			/*
 			 * PLAIN_FILE_CONTENTS are the only type of
 			 * objects that have ever been stored with
 			 * large blocks, so we don't need the special
 			 * logic below.  ZAP blocks can shrink (when
 			 * there's only one block), so we don't want
 			 * to hit the error below about block size
 			 * only increasing.
 			 */
 			do_free_range = B_TRUE;
 		} else if (doi->doi_max_offset <=
 		    doi->doi_data_block_size) {
 			/*
 			 * There is only one block.  We can free it,
 			 * because its contents will be replaced by a
 			 * WRITE record.  This can not be the no-L ->
 			 * -L case, because the no-L case would have
 			 * resulted in multiple blocks.  If we
 			 * supported -L -> no-L, it would not be safe
 			 * to free the file's contents.  Fortunately,
 			 * that is not allowed (see
 			 * recv_check_large_blocks()).
 			 */
 			do_free_range = B_TRUE;
 		} else {
 			boolean_t is_same_gen;
 			err = receive_object_is_same_generation(rwa->os,
 			    drro->drr_object, doi->doi_bonus_type,
 			    drro->drr_bonustype, bonus_data, &is_same_gen);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 
 			if (is_same_gen) {
 				/*
 				 * This is the same logical file, and
 				 * the block size must be increasing.
 				 * It could only decrease if
 				 * --large-block was changed to be
 				 * off, which is checked in
 				 * recv_check_large_blocks().
 				 */
 				if (drro->drr_blksz <=
 				    doi->doi_data_block_size)
 					return (SET_ERROR(EINVAL));
 				/*
 				 * We keep the existing blocksize and
 				 * contents.
 				 */
 				*new_blksz =
 				    doi->doi_data_block_size;
 			} else {
 				do_free_range = B_TRUE;
 			}
 		}
 	}
 
 	/* nblkptr can only decrease if the object was reallocated */
 	if (nblkptr < doi->doi_nblkptr)
 		do_free_range = B_TRUE;
 
 	/* number of slots can only change on reallocation */
 	if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT)
 		do_free_range = B_TRUE;
 
 	/*
 	 * For raw sends we also check a few other fields to
 	 * ensure we are preserving the objset structure exactly
 	 * as it was on the receive side:
 	 *     - A changed indirect block size
 	 *     - A smaller nlevels
 	 */
 	if (rwa->raw) {
 		if (indblksz != doi->doi_metadata_block_size)
 			do_free_range = B_TRUE;
 		if (drro->drr_nlevels < doi->doi_indirection)
 			do_free_range = B_TRUE;
 	}
 
 	/* Discard the object's entire contents if any check above asked to. */
 	if (do_free_range) {
 		err = dmu_free_long_range(rwa->os, drro->drr_object,
 		    0, DMU_OBJECT_END);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The dmu does not currently support decreasing nlevels or changing
 	 * indirect block size if there is already one, same as changing the
 	 * number of dnode slots on an object.  For non-raw sends this
 	 * does not matter and the new object can just use the previous one's
 	 * parameters.  For raw sends, however, the structure of the received
 	 * dnode (including indirects and dnode slots) must match that of the
 	 * send side.  Therefore, instead of using dmu_object_reclaim(), we
 	 * must free the object completely and call dmu_object_claim_dnsize()
 	 * instead.
 	 */
 	if ((rwa->raw && ((doi->doi_indirection > 1 &&
 	    indblksz != doi->doi_metadata_block_size) ||
 	    drro->drr_nlevels < doi->doi_indirection)) ||
 	    dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
 		err = dmu_free_long_object(rwa->os, drro->drr_object);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 
 		/* Wait for the free to sync, then tell the caller to claim. */
 		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 		*object_to_hold = DMU_NEW_OBJECT;
 	}
 
 	/*
 	 * For raw receives, free everything beyond the new incoming
 	 * maxblkid. Normally this would be done with a DRR_FREE
 	 * record that would come after this DRR_OBJECT record is
 	 * processed. However, for raw receives we manually set the
 	 * maxblkid from the drr_maxblkid and so we must first free
 	 * everything above that blkid to ensure the DMU is always
 	 * consistent with itself. We will never free the first block
 	 * of the object here because a maxblkid of 0 could indicate
 	 * an object with a single block or one with no blocks. This
 	 * free may be skipped when dmu_free_long_range() was called
 	 * above since it covers the entire object's contents.
 	 */
 	if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) {
 		err = dmu_free_long_range(rwa->os, drro->drr_object,
 		    (drro->drr_maxblkid + 1) * doi->doi_data_block_size,
 		    DMU_OBJECT_END);
 		if (err != 0)
 			return (SET_ERROR(EINVAL));
 	}
 	return (0);
 }
 
 /*
  * Apply one object record (drro) from the stream to rwa->os.
  *
  * The record's fields are validated first.  The object is then claimed
  * (when it is currently free or had to be freed), reclaimed with new
  * properties, or has a no-longer-referenced spill block removed.  After
  * that the checksum and compression settings are applied, raw receives
  * additionally set block size, nlevels and maxblkid, and when data is
  * non-NULL it is copied into the object's bonus buffer (byteswapped for
  * non-raw byteswapped streams).  Finally the resume state is recorded for
  * this object at offset 0.
  *
  * Returns 0 on success.  Validation failures and most internal failures are
  * reported as EINVAL; errors from the multi-slot cleanup loop and from
  * dmu_tx_assign() are returned unchanged.
  */
 noinline static int
 receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
     void *data)
 {
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
 	int err;
 	uint32_t new_blksz = drro->drr_blksz;
 	/* A slot count of 0 in the record means the minimum slot count. */
 	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
 	    drro->drr_dn_slots : DNODE_MIN_SLOTS;
 
 	/* Sanity-check the fields common to raw and non-raw records. */
 	if (drro->drr_type == DMU_OT_NONE ||
 	    !DMU_OT_IS_VALID(drro->drr_type) ||
 	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
 	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen >
 	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
 	    dn_slots >
 	    (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (rwa->raw) {
 		/*
 		 * We should have received a DRR_OBJECT_RANGE record
 		 * containing this block and stored it in rwa.
 		 */
 		if (drro->drr_object < rwa->or_firstobj ||
 		    drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
 		    drro->drr_raw_bonuslen < drro->drr_bonuslen ||
 		    drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
 		    drro->drr_nlevels > DN_MAX_LEVELS ||
 		    drro->drr_nblkptr > DN_MAX_NBLKPTR ||
 		    DN_SLOTS_TO_BONUSLEN(dn_slots) <
 		    drro->drr_raw_bonuslen)
 			return (SET_ERROR(EINVAL));
 	} else {
 		/*
 		 * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
 		 * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
 		 */
 		if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
 		    (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
 			return (SET_ERROR(EINVAL));
 		}
 
 		/* The raw-only fields must all be zero in a non-raw record. */
 		if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
 		    drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
 			return (SET_ERROR(EINVAL));
 		}
 	}
 
 	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	/* ENOENT and EEXIST are handled below; anything else is fatal. */
 	if (err != 0 && err != ENOENT && err != EEXIST)
 		return (SET_ERROR(EINVAL));
 
 	if (drro->drr_object > rwa->max_object)
 		rwa->max_object = drro->drr_object;
 
 	/*
 	 * If we are losing blkptrs or changing the block size this must
 	 * be a new file instance.  We must clear out the previous file
 	 * contents before we can change this type of metadata in the dnode.
 	 * Raw receives will also check that the indirect structure of the
 	 * dnode hasn't changed.
 	 */
 	uint64_t object_to_hold;
 	if (err == 0) {
 		err = receive_handle_existing_object(rwa, drro, &doi, data,
 		    &object_to_hold, &new_blksz);
 		if (err != 0)
 			return (err);
 	} else if (err == EEXIST) {
 		/*
 		 * The object requested is currently an interior slot of a
 		 * multi-slot dnode. This will be resolved when the next txg
 		 * is synced out, since the send stream will have told us
 		 * to free this slot when we freed the associated dnode
 		 * earlier in the stream.
 		 */
 		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 
 		if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT)
 			return (SET_ERROR(EINVAL));
 
 		/* object was freed and we are about to allocate a new one */
 		object_to_hold = DMU_NEW_OBJECT;
 	} else {
 		/*
 		 * If the only record in this range so far was DRR_FREEOBJECTS
 		 * with at least one actually freed object, it's possible that
 		 * the block will now be converted to a hole. We need to wait
 		 * for the txg to sync to prevent races.
 		 */
 		if (rwa->or_need_sync == ORNS_YES)
 			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 
 		/* object is free and we are about to allocate a new one */
 		object_to_hold = DMU_NEW_OBJECT;
 	}
 
 	/* Only relevant for the first object in the range */
 	rwa->or_need_sync = ORNS_NO;
 
 	/*
 	 * If this is a multi-slot dnode there is a chance that this
 	 * object will expand into a slot that is already used by
 	 * another object from the previous snapshot. We must free
 	 * these objects before we attempt to allocate the new dnode.
 	 */
 	if (dn_slots > 1) {
 		boolean_t need_sync = B_FALSE;
 
 		for (uint64_t slot = drro->drr_object + 1;
 		    slot < drro->drr_object + dn_slots;
 		    slot++) {
 			dmu_object_info_t slot_doi;
 
 			err = dmu_object_info(rwa->os, slot, &slot_doi);
 			if (err == ENOENT || err == EEXIST)
 				continue;
 			else if (err != 0)
 				return (err);
 
 			err = dmu_free_long_object(rwa->os, slot);
 			if (err != 0)
 				return (err);
 
 			need_sync = B_TRUE;
 		}
 
 		/* Only wait for a sync if something was actually freed. */
 		if (need_sync)
 			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
 	}
 
 	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_bonus(tx, object_to_hold);
 	dmu_tx_hold_write(tx, object_to_hold, 0, 0);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/*
 	 * err is 0 here; it stays 0 when the object already exists with
 	 * matching properties and none of the branches below applies.
 	 */
 	if (object_to_hold == DMU_NEW_OBJECT) {
 		/* Currently free, wants to be allocated */
 		err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
 		    drro->drr_type, new_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen,
 		    dn_slots << DNODE_SHIFT, tx);
 	} else if (drro->drr_type != doi.doi_type ||
 	    new_blksz != doi.doi_data_block_size ||
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* Currently allocated, but with different properties */
 		err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
 		    drro->drr_type, new_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen,
 		    dn_slots << DNODE_SHIFT, rwa->spill ?
 		    DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
 	} else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
 		/*
 		 * Currently allocated, the existing version of this object
 		 * may reference a spill block that is no longer allocated
 		 * at the source and needs to be freed.
 		 */
 		err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
 	}
 
 	/* Past this point the tx is committed, not aborted, on failure. */
 	if (err != 0) {
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (rwa->or_crypt_params_present) {
 		/*
 		 * Set the crypt params for the buffer associated with this
 		 * range of dnodes.  This causes the blkptr_t to have the
 		 * same crypt params (byteorder, salt, iv, mac) as on the
 		 * sending side.
 		 *
 		 * Since we are committing this tx now, it is possible for
 		 * the dnode block to end up on-disk with the incorrect MAC,
 		 * if subsequent objects in this block are received in a
 		 * different txg.  However, since the dataset is marked as
 		 * inconsistent, no code paths will do a non-raw read (or
 		 * decrypt the block / verify the MAC). The receive code and
 		 * scrub code can safely do raw reads and verify the
 		 * checksum.  They don't need to verify the MAC.
 		 */
 		dmu_buf_t *db = NULL;
 		uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;
 
 		err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
 		    offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
 		if (err != 0) {
 			dmu_tx_commit(tx);
 			return (SET_ERROR(EINVAL));
 		}
 
 		dmu_buf_set_crypt_params(db, rwa->or_byteorder,
 		    rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);
 
 		dmu_buf_rele(db, FTAG);
 
 		/* The params are applied once; later objects skip this. */
 		rwa->or_crypt_params_present = B_FALSE;
 	}
 
 	dmu_object_set_checksum(rwa->os, drro->drr_object,
 	    drro->drr_checksumtype, tx);
 	dmu_object_set_compress(rwa->os, drro->drr_object,
 	    drro->drr_compress, tx);
 
 	/* handle more restrictive dnode structuring for raw recvs */
 	if (rwa->raw) {
 		/*
 		 * Set the indirect block size, block shift, nlevels.
 		 * This will not fail because we ensured all of the
 		 * blocks were freed earlier if this is a new object.
 		 * For non-new objects block size and indirect block
 		 * shift cannot change and nlevels can only increase.
 		 */
 		ASSERT3U(new_blksz, ==, drro->drr_blksz);
 		VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
 		    drro->drr_blksz, drro->drr_indblkshift, tx));
 		VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
 		    drro->drr_nlevels, tx));
 
 		/*
 		 * Set the maxblkid. This will always succeed because
 		 * we freed all blocks beyond the new maxblkid above.
 		 */
 		VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
 		    drro->drr_maxblkid, tx));
 	}
 
 	/* Install the bonus payload, if the record carried one. */
 	if (data != NULL) {
 		dmu_buf_t *db;
 		dnode_t *dn;
 		uint32_t flags = DMU_READ_NO_PREFETCH;
 
 		if (rwa->raw)
 			flags |= DMU_READ_NO_DECRYPT;
 
 		VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
 		VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
 
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		memcpy(db->db_data, data, DRR_OBJECT_PAYLOAD_SIZE(drro));
 
 		/*
 		 * Raw bonus buffers have their byteorder determined by the
 		 * DRR_OBJECT_RANGE record.
 		 */
 		if (rwa->byteswap && !rwa->raw) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    DRR_OBJECT_PAYLOAD_SIZE(drro));
 		}
 		dmu_buf_rele(db, FTAG);
 		dnode_rele(dn, FTAG);
 	}
 
 	/*
 	 * If the receive fails, we want the resume stream to start with the
 	 * same record that we last successfully received. There is no way to
 	 * request resume from the object record, but we can benefit from the
 	 * fact that sender always sends object record before anything else,
 	 * after which it will "resend" data at offset 0 and resume normally.
 	 */
 	save_resume_state(rwa, drro->drr_object, 0, tx);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Handle a DRR_FREEOBJECTS record: free the objects found in
  * [drr_firstobj, drr_firstobj + drr_numobjs), starting no lower than
  * object 1 and stopping at DN_MAX_OBJECT.
  *
  * Objects for which dmu_object_info() reports ENOENT are skipped.  A
  * successful free upgrades rwa->or_need_sync from ORNS_MAYBE to ORNS_YES.
  * ESRCH from dmu_object_next() is treated as the normal end of the walk.
  *
  * Returns 0 on success, EINVAL if the range wraps around, or the first
  * error hit while looking up, freeing or advancing.
  */
 noinline static int
 receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
 	int next_err = 0;
 
 	/* Reject ranges whose end wraps past the top of the object space. */
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
 		return (SET_ERROR(EINVAL));
 
 	obj = drrfo->drr_firstobj;
 	if (obj == 0)
 		obj = 1;
 
 	while (obj < drrfo->drr_firstobj + drrfo->drr_numobjs &&
 	    obj < DN_MAX_OBJECT && next_err == 0) {
 		dmu_object_info_t doi;
 		int err = dmu_object_info(rwa->os, obj, &doi);
 
 		if (err == 0) {
 			err = dmu_free_long_object(rwa->os, obj);
 			if (err != 0)
 				return (err);
 
 			if (rwa->or_need_sync == ORNS_MAYBE)
 				rwa->or_need_sync = ORNS_YES;
 		} else if (err != ENOENT) {
 			return (err);
 		}
 
 		/* Step to the next candidate; this also runs after ENOENT. */
 		next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
 	}
 
 	return (next_err == ESRCH ? 0 : next_err);
 }
 
 /*
  * Write every record queued on rwa->write_batch to object rwa->last_object
  * within a single transaction.
  *
  * Records whose logical size equals the dnode's data block size are written
  * with dmu_lightweight_write_by_dnode(); any other record is written with
  * dmu_write_by_dnode(), after decompressing it when it is compressed.  Each
  * record that is written successfully is removed from the list and freed.
  *
  * Returns 0 on success, EINVAL if the dnode cannot be held, or the error
  * from dmu_tx_assign(), decompression, or the lightweight write.
  *
  * Note: if this fails, the caller will clean up any records left on the
  * rwa->write_batch list.
  */
 static int
 flush_write_batch_impl(struct receive_writer_arg *rwa)
 {
 	dnode_t *dn;
 	int err;
 
 	if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* The caller only gets here with a non-empty batch. */
 	struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch);
 	struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write;
 
 	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
 	struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
 
 	ASSERT3U(rwa->last_object, ==, last_drrw->drr_object);
 	ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset);
 
 	/*
 	 * One write hold covering everything from the first record's offset
 	 * through the end of the last record.
 	 */
 	dmu_tx_t *tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset,
 	    last_drrw->drr_offset - first_drrw->drr_offset +
 	    last_drrw->drr_logical_size);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		dnode_rele(dn, FTAG);
 		return (err);
 	}
 
 	/* err is 0 at the top of every iteration; a failure breaks out. */
 	struct receive_record_arg *rrd;
 	while ((rrd = list_head(&rwa->write_batch)) != NULL) {
 		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 		abd_t *abd = rrd->abd;
 
 		ASSERT3U(drrw->drr_object, ==, rwa->last_object);
 
 		if (drrw->drr_logical_size != dn->dn_datablksz) {
 			/*
 			 * The WRITE record is larger than the object's block
 			 * size.  We must be receiving an incremental
 			 * large-block stream into a dataset that previously did
 			 * a non-large-block receive.  Lightweight writes must
 			 * be exactly one block, so we need to decompress the
 			 * data (if compressed) and do a normal dmu_write().
 			 */
 			ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
 			if (DRR_WRITE_COMPRESSED(drrw)) {
 				abd_t *decomp_abd =
 				    abd_alloc_linear(drrw->drr_logical_size,
 				    B_FALSE);
 
 				err = zio_decompress_data(
 				    drrw->drr_compressiontype,
 				    abd, decomp_abd,
 				    abd_get_size(abd),
 				    abd_get_size(decomp_abd), NULL);
 
 				if (err == 0) {
 					dmu_write_by_dnode(dn,
 					    drrw->drr_offset,
 					    drrw->drr_logical_size,
 					    abd_to_buf(decomp_abd), tx);
 				}
 				abd_free(decomp_abd);
 			} else {
 				dmu_write_by_dnode(dn,
 				    drrw->drr_offset,
 				    drrw->drr_logical_size,
 				    abd_to_buf(abd), tx);
 			}
 			/*
 			 * On failure the abd stays attached to the rrd so
 			 * that the caller can free both.
 			 */
 			if (err == 0)
 				abd_free(abd);
 		} else {
 			zio_prop_t zp = {0};
 			dmu_write_policy(rwa->os, dn, 0, 0, &zp);
 
 			zio_flag_t zio_flags = 0;
 
 			if (rwa->raw) {
 				/* Carry over the record's crypt parameters. */
 				zp.zp_encrypt = B_TRUE;
 				zp.zp_compress = drrw->drr_compressiontype;
 				zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
 				    !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
 				    rwa->byteswap;
 				memcpy(zp.zp_salt, drrw->drr_salt,
 				    ZIO_DATA_SALT_LEN);
 				memcpy(zp.zp_iv, drrw->drr_iv,
 				    ZIO_DATA_IV_LEN);
 				memcpy(zp.zp_mac, drrw->drr_mac,
 				    ZIO_DATA_MAC_LEN);
 				if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
 					zp.zp_nopwrite = B_FALSE;
 					zp.zp_copies = MIN(zp.zp_copies,
 					    SPA_DVAS_PER_BP - 1);
 				}
 				zio_flags |= ZIO_FLAG_RAW;
 			} else if (DRR_WRITE_COMPRESSED(drrw)) {
 				ASSERT3U(drrw->drr_compressed_size, >, 0);
 				ASSERT3U(drrw->drr_logical_size, >=,
 				    drrw->drr_compressed_size);
 				zp.zp_compress = drrw->drr_compressiontype;
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			} else if (rwa->byteswap) {
 				/*
 				 * Note: compressed blocks never need to be
 				 * byteswapped, because WRITE records for
 				 * metadata blocks are never compressed. The
 				 * exception is raw streams, which are written
 				 * in the original byteorder, and the byteorder
 				 * bit is preserved in the BP by setting
 				 * zp_byteorder above.
 				 */
 				dmu_object_byteswap_t byteswap =
 				    DMU_OT_BYTESWAP(drrw->drr_type);
 				dmu_ot_byteswap[byteswap].ob_func(
 				    abd_to_buf(abd),
 				    DRR_WRITE_PAYLOAD_SIZE(drrw));
 			}
 
 			/*
 			 * Since this data can't be read until the receive
 			 * completes, we can do a "lightweight" write for
 			 * improved performance.
 			 *
 			 * NOTE(review): the abd is not freed here on
 			 * success; presumably the lightweight write takes
 			 * ownership of it -- confirm.
 			 */
 			err = dmu_lightweight_write_by_dnode(dn,
 			    drrw->drr_offset, abd, &zp, zio_flags, tx);
 		}
 
 		if (err != 0) {
 			/*
 			 * This rrd is left on the list, so the caller will
 			 * free it (and the abd).
 			 */
 			break;
 		}
 
 		/*
 		 * Note: If the receive fails, we want the resume stream to
 		 * start with the same record that we last successfully
 		 * received (as opposed to the next record), so that we can
 		 * verify that we are resuming from the correct location.
 		 */
 		save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
 
 		list_remove(&rwa->write_batch, rrd);
 		kmem_free(rrd, sizeof (*rrd));
 	}
 
 	/* The tx is committed even when the loop stopped on an error. */
 	dmu_tx_commit(tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Flush the pending write batch, if there is one.
  *
  * When rwa->err is already set the batch is not written; that error is
  * returned instead.  On any error every record still on the list is freed
  * together with its abd, so the list is always empty on return.
  */
 noinline static int
 flush_write_batch(struct receive_writer_arg *rwa)
 {
 	struct receive_record_arg *rrd;
 	int err;
 
 	if (list_is_empty(&rwa->write_batch))
 		return (0);
 
 	err = rwa->err;
 	if (err == 0)
 		err = flush_write_batch_impl(rwa);
 
 	if (err != 0) {
 		/* Drop whatever was left unwritten. */
 		for (rrd = list_remove_head(&rwa->write_batch); rrd != NULL;
 		    rrd = list_remove_head(&rwa->write_batch)) {
 			abd_free(rrd->abd);
 			kmem_free(rrd, sizeof (*rrd));
 		}
 	}
 
 	ASSERT(list_is_empty(&rwa->write_batch));
 	return (err);
 }
 
 /*
  * Handle one DRR_WRITE record.
  *
  * Healing receives (rwa->heal): read the target block and, only when that
  * read fails with ECKSUM, try to repair it with do_corrective_recv().  The
  * outcome is returned directly and the record is never batched.
  *
  * Normal receives: records must arrive in increasing (object, offset)
  * order.  The record is appended to rwa->write_batch, after first flushing
  * the batch if the record targets a different object than the batch or lies
  * at least batch_size bytes past the first batched record.
  *
  * Returns EAGAIN once the rrd has been queued, meaning the caller must not
  * free it; EINVAL for malformed or out-of-order records; otherwise the
  * result of the healing path or of flush_write_batch().
  */
 noinline static int
 receive_process_write_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err = 0;
 
 	ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE);
 	struct drr_write *drrw = &rrd->header.drr_u.drr_write;
 
 	/* Reject ranges that wrap around and unknown object types. */
 	if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
 	if (rwa->heal) {
 		blkptr_t *bp;
 		dmu_buf_t *dbp;
 		int flags = DB_RF_CANFAIL;
 
 		if (rwa->raw)
 			flags |= DB_RF_NO_DECRYPT;
 
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drrw->drr_type);
 			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(rrd->abd),
 			    DRR_WRITE_PAYLOAD_SIZE(drrw));
 		}
 
 		err = dmu_buf_hold_noread(rwa->os, drrw->drr_object,
 		    drrw->drr_offset, FTAG, &dbp);
 		if (err != 0)
 			return (err);
 
 		/* Try to read the object to see if it needs healing */
 		err = dbuf_read((dmu_buf_impl_t *)dbp, NULL, flags);
 		/*
 		 * We only try to heal when dbuf_read() returns ECKSUM.
 		 * Other errors (even EIO) get returned to caller.
 		 * EIO indicates that the device is not present/accessible,
 		 * so writing to it will likely fail.
 		 * If the block is healthy, we don't want to overwrite it
 		 * unnecessarily.
 		 */
 		if (err != ECKSUM) {
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Make sure the on-disk block and recv record sizes match */
 		if (drrw->drr_logical_size != dbp->db_size) {
 			err = ENOTSUP;
 			dmu_buf_rele(dbp, FTAG);
 			return (err);
 		}
 		/* Get the block pointer for the corrupted block */
 		bp = dmu_buf_get_blkptr(dbp);
 		err = do_corrective_recv(rwa, drrw, rrd, bp);
 		dmu_buf_rele(dbp, FTAG);
 		return (err);
 	}
 
 	/*
 	 * For resuming to work, records must be in increasing order
 	 * by (object, offset).
 	 */
 	if (drrw->drr_object < rwa->last_object ||
 	    (drrw->drr_object == rwa->last_object &&
 	    drrw->drr_offset < rwa->last_offset)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
 	if (first_rrd != NULL) {
 		/*
 		 * Only form a pointer into the first batched record once we
 		 * know the batch is non-empty; taking a member address
 		 * through a NULL pointer is undefined behavior.
 		 */
 		struct drr_write *first_drrw =
 		    &first_rrd->header.drr_u.drr_write;
 		uint64_t batch_size =
 		    MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2);
 
 		/*
 		 * A batch only holds writes to a single object, within
 		 * batch_size bytes of its first record.  Flush it if this
 		 * record does not fit.
 		 */
 		if (drrw->drr_object != first_drrw->drr_object ||
 		    drrw->drr_offset >= first_drrw->drr_offset + batch_size) {
 			err = flush_write_batch(rwa);
 			if (err != 0)
 				return (err);
 		}
 	}
 
 	rwa->last_object = drrw->drr_object;
 	rwa->last_offset = drrw->drr_offset;
 
 	if (rwa->last_object > rwa->max_object)
 		rwa->max_object = rwa->last_object;
 
 	list_insert_tail(&rwa->write_batch, rrd);
 	/*
 	 * Return EAGAIN to indicate that we will use this rrd again,
 	 * so the caller should not free it
 	 */
 	return (EAGAIN);
 }
 
 /*
  * Apply a write-embedded record: hand the embedded payload `data` to
  * dmu_write_embedded() for the range [drr_offset, drr_offset + drr_length)
  * of drr_object, inside its own transaction.
  *
  * Returns 0 on success, EINVAL for a malformed record or a raw receive, or
  * the error from dmu_tx_assign().
  */
 static int
 receive_write_embedded(struct receive_writer_arg *rwa,
     struct drr_write_embedded *drrwe, void *data)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	/*
 	 * Reject ranges that wrap around, oversized payloads, unknown
 	 * embedded types or compression functions, and any embedded write
 	 * arriving in a raw stream.
 	 */
 	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset ||
 	    drrwe->drr_psize > BPE_PAYLOAD_SIZE ||
 	    drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES ||
 	    drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS ||
 	    rwa->raw)
 		return (SET_ERROR(EINVAL));
 
 	if (drrwe->drr_object > rwa->max_object)
 		rwa->max_object = drrwe->drr_object;
 
 	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_write(tx, drrwe->drr_object,
 	    drrwe->drr_offset, drrwe->drr_length);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err == 0) {
 		dmu_write_embedded(rwa->os, drrwe->drr_object,
 		    drrwe->drr_offset, data, drrwe->drr_etype,
 		    drrwe->drr_compression, drrwe->drr_lsize,
 		    drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
 
 		/*
 		 * Record this record (not the next one) as the resume
 		 * point, so that a resumed stream starts with the record we
 		 * last received successfully.
 		 */
 		save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset,
 		    tx);
 		dmu_tx_commit(tx);
 	} else {
 		dmu_tx_abort(tx);
 	}
 
 	return (err);
 }
 
 /*
  * Apply a spill record: replace the spill block of drrs->drr_object with the
  * payload held in abd.
  *
  * The abd is freed here whenever 0 is returned (including the case where
  * an "unmodified" spill record is ignored).  On the error returns it is
  * left untouched.
  * NOTE(review): presumably the caller frees the abd on error -- confirm.
  *
  * Returns 0 on success, EINVAL for a malformed record or a missing object,
  * or the error from dmu_spill_hold_by_bonus() / dmu_tx_assign().
  */
 static int
 receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
     abd_t *abd)
 {
 	dmu_buf_t *bonus, *spill;
 	arc_buf_t *abuf;
 	dmu_tx_t *tx;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
 	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Unmodified spill blocks were added to the stream to resolve an
 	 * issue with incorrectly removing spill blocks.  Versions of the
 	 * code that support the DRR_FLAG_SPILL_BLOCK flag, like this one,
 	 * should simply ignore them.
 	 */
 	if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
 		abd_free(abd);
 		return (0);
 	}
 
 	/* Extra field validation that only applies to raw streams. */
 	if (rwa->raw &&
 	    (!DMU_OT_IS_VALID(drrs->drr_type) ||
 	    drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
 	    drrs->drr_compressed_size == 0))
 		return (SET_ERROR(EINVAL));
 
 	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (drrs->drr_object > rwa->max_object)
 		rwa->max_object = drrs->drr_object;
 
 	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &bonus));
 	err = dmu_spill_hold_by_bonus(bonus, DMU_READ_NO_DECRYPT, FTAG,
 	    &spill);
 	if (err != 0) {
 		dmu_buf_rele(bonus, FTAG);
 		return (err);
 	}
 
 	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_spill(tx, bonus->db_object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_buf_rele(bonus, FTAG);
 		dmu_buf_rele(spill, FTAG);
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/*
 	 * A spill block can grow as well as shrink.  If the size changed,
 	 * bring the existing dbuf in line with the logical size of the
 	 * arc_buf_t that is about to be assigned to it.
 	 */
 	if (spill->db_size != drrs->drr_length) {
 		dmu_buf_will_fill(spill, tx, B_FALSE);
 		VERIFY0(dbuf_spill_set_blksz(spill, drrs->drr_length, tx));
 	}
 
 	if (!rwa->raw) {
 		abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
 		    DMU_OT_IS_METADATA(drrs->drr_type),
 		    drrs->drr_length);
 		/* Byteswap the payload in place before it is copied out. */
 		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drrs->drr_type);
 			dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
 			    DRR_SPILL_PAYLOAD_SIZE(drrs));
 		}
 	} else {
 		boolean_t byteorder = ZFS_HOST_BYTEORDER ^
 		    !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
 		    rwa->byteswap;
 
 		abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
 		    drrs->drr_object, byteorder, drrs->drr_salt,
 		    drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
 		    drrs->drr_compressed_size, drrs->drr_length,
 		    drrs->drr_compressiontype, 0);
 	}
 
 	/* Move the payload into the loaned buffer and hand it to the dbuf. */
 	memcpy(abuf->b_data, abd_to_buf(abd), DRR_SPILL_PAYLOAD_SIZE(drrs));
 	abd_free(abd);
 	dbuf_assign_arcbuf((dmu_buf_impl_t *)spill, abuf, tx);
 
 	dmu_buf_rele(bonus, FTAG);
 	dmu_buf_rele(spill, FTAG);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Apply a DRR_FREE record by freeing the described range of the object.
  * A drr_length of -1ULL bypasses the wrap-around check.
  * Returns 0 on success or an errno value.
  */
 noinline static int
 receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
 {
 	/* Reject a range whose end wraps around the 64-bit offset space. */
 	if (drrf->drr_length != -1ULL) {
 		if (drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/* dmu_object_info() must succeed for the target object. */
 	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* Track the highest object number referenced by the stream. */
 	if (rwa->max_object < drrf->drr_object)
 		rwa->max_object = drrf->drr_object;
 
 	return (dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length));
 }
 
 /*
  * Handle a DRR_OBJECT_RANGE record.  Nothing is written here; the record's
  * encryption parameters are only stashed in rwa for later use.
  * Returns 0 on success or EINVAL for a malformed or unexpected record.
  */
 static int
 receive_object_range(struct receive_writer_arg *rwa,
     struct drr_object_range *drror)
 {
 	/*
 	 * Dnode block sizes are constant, so for now there is no need to
 	 * check that they agree between the sending and receiving sides.
 	 * Non-raw sends never emit DRR_OBJECT_RANGE at all.  Raw sends need
 	 * it because the encryption parameters protect a whole block of
 	 * bonus buffers.  Should dnode block sizes ever become variable,
 	 * code must be added to make sure both sides match.
 	 */
 	if (!rwa->raw ||
 	    drror->drr_numslots != DNODES_PER_BLOCK ||
 	    P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* Track the highest object number referenced by the stream. */
 	if (rwa->max_object < drror->drr_firstobj)
 		rwa->max_object = drror->drr_firstobj;
 
 	/*
 	 * The actual handling is deferred to receive_object() so that an
 	 * empty block of dnodes is not written out and then converted to a
 	 * HOLE BP.
 	 */
 	rwa->or_crypt_params_present = B_TRUE;
 	rwa->or_firstobj = drror->drr_firstobj;
 	rwa->or_numslots = drror->drr_numslots;
 	memcpy(rwa->or_salt, drror->drr_salt, ZIO_DATA_SALT_LEN);
 	memcpy(rwa->or_iv, drror->drr_iv, ZIO_DATA_IV_LEN);
 	memcpy(rwa->or_mac, drror->drr_mac, ZIO_DATA_MAC_LEN);
 
 	/*
 	 * Start from the native format (ZFS_HOST_BYTEORDER), flip if the
 	 * whole stream is byteswapped (rwa->byteswap), and flip once more
 	 * if this block was stored in non-native format on the send side.
 	 */
 	rwa->or_byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
 	    !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
 
 	rwa->or_need_sync = ORNS_MAYBE;
 
 	return (0);
 }
 
 /*
  * Until we have the ability to redact large ranges of data efficiently, we
  * process these records as frees.
  */
 noinline static int
 receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
 {
 	struct drr_free drrf = {0};
 	drrf.drr_length = drrr->drr_length;
 	drrf.drr_object = drrr->drr_object;
 	drrf.drr_offset = drrr->drr_offset;
 	drrf.drr_toguid = drrr->drr_toguid;
 	return (receive_free(rwa, &drrf));
 }
 
 /*
  * Error-path cleanup for drc->drc_ds: disown the dataset and, unless its
  * resume state is to be kept (or this is a healing receive), destroy it.
  */
 static void
 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 {
 	dsl_dataset_t *ds = drc->drc_ds;
 	ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int keep_state;
 
 	if (drc->drc_raw)
 		dsflags = DS_HOLD_FLAG_NONE;
 
 	/*
 	 * Let the txg sync before tearing the receive down.  For a resumable
 	 * receive this guarantees the resume state has reached disk; for a
 	 * raw receive it guarantees the user accounting code will not try to
 	 * do anything after we stopped receiving the dataset.
 	 */
 	txg_wait_synced(ds->ds_dir->dd_pool, 0);
 	ds->ds_objset->os_raw_receive = B_FALSE;
 
 	/* The dataset's block pointer is inspected under ds_bp_rwlock. */
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	keep_state = (drc->drc_resumable && drc->drc_should_save &&
 	    !BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	if (keep_state) {
 		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
 		return;
 	}
 
 	/* Capture the name before disowning; it is needed for the destroy. */
 	dsl_dataset_name(ds, name);
 	dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
 	if (!drc->drc_heal)
 		(void) dsl_destroy_head(name);
 }
 
 /*
  * Fold len bytes at buf into the running stream checksum drc->drc_cksum,
  * using the byteswapping fletcher-4 variant when drc->drc_byteswap is set.
  */
 static void
 receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	if (!drc->drc_byteswap) {
 		(void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
 		return;
 	}
 
 	(void) fletcher_4_incremental_byteswap(buf, len, &drc->drc_cksum);
 }
 
 /*
  * Read the current record's payload (len bytes into buf, with buf == NULL
  * when len is 0) and record it in drc->drc_rrd if there is one.
  * Then allocate drc->drc_next_rrd, read the following record's header
  * into drc->drc_next_rrd->header and verify the stream checksum over the
  * payload and that header.
  *
  * Returns 0 on success.  On failure after the allocation, drc->drc_next_rrd
  * is freed and reset to NULL.
  */
 static int
 receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
 {
 	zio_cksum_t cksum_orig;
 	zio_cksum_t *cksump;
 	int err;
 
 	if (len == 0) {
 		ASSERT3P(buf, ==, NULL);
 	} else {
 		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
 		err = receive_read(drc, len, buf);
 		if (err != 0)
 			return (err);
 		receive_cksum(drc, len, buf);
 
 		/* drc_rrd is NULL while the begin record's payload is read */
 		if (drc->drc_rrd != NULL) {
 			drc->drc_rrd->payload = buf;
 			drc->drc_rrd->payload_size = len;
 			drc->drc_rrd->bytes_read = drc->drc_bytes_read;
 		}
 	}
 
 	/* Remember the checksum as it stood before the next header. */
 	drc->drc_prev_cksum = drc->drc_cksum;
 
 	drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
 	err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
 	    &drc->drc_next_rrd->header);
 	drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;
 	if (err != 0)
 		goto discard;
 
 	/* A second BEGIN record is rejected here. */
 	if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
 		err = SET_ERROR(EINVAL);
 		goto discard;
 	}
 
 	/*
 	 * The checksum covers everything up to, but not including, the
 	 * checksum field itself.
 	 */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
 	receive_cksum(drc,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    &drc->drc_next_rrd->header);
 
 	/* Keep the on-stream copy; the header may be byteswapped below. */
 	cksum_orig = drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
 	cksump = &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
 
 	if (drc->drc_byteswap)
 		byteswap_record(&drc->drc_next_rrd->header);
 
 	/* An all-zero checksum in the header is not compared. */
 	if (!ZIO_CHECKSUM_IS_ZERO(cksump) &&
 	    !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
 		err = SET_ERROR(ECKSUM);
 		goto discard;
 	}
 
 	/* Finally fold the checksum field itself into the running sum. */
 	receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);
 
 	return (0);
 
 discard:
 	kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 	drc->drc_next_rrd = NULL;
 	return (err);
 }
 
 /*
  * Issue the prefetch reads for any necessary indirect blocks.
  *
  * Whether a prefetch is issued for an object is decided by the object
  * ignore list.  This serves correctness (the object's blocksize may have
  * changed) as well as performance (no point in prefetching for an object
  * that does not exist).  The list is trimmed while the stream is processed
  * so that it cannot grow without bound.
  *
  * Object numbers in the list are sorted, and so are the write records we
  * see, but the two are not sorted relative to each other: several object
  * records may arrive before each object's write records.  Consequently,
  * once a given object number has been reached, every lower object number
  * can safely be dropped from the ignore list.  In practice up to 32 object
  * records precede the write records, so the list holds up to 32 nodes.
  */
 static void
 receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
     uint64_t length)
 {
 	/* Objects on the ignore list get no prefetch. */
 	if (objlist_exists(drc->drc_ignore_objlist, object))
 		return;
 
 	dmu_prefetch(drc->drc_os, object, 1, offset, length,
 	    ZIO_PRIORITY_SYNC_READ);
 }
 
 /*
  * Read the payload of the current record (drc->drc_rrd) together with the
  * header of the following record off the stream, issuing any necessary
  * prefetches.
  *
  * Returns 0 on success or an errno value.  On failure no payload buffer
  * allocated here is left attached to drc->drc_rrd: the caller releases only
  * the record structure itself, so every error path below frees its buffer.
  * Record types without a case below are rejected with EINVAL.
  */
 static int
 receive_read_record(dmu_recv_cookie_t *drc)
 {
 	int err;
 
 	switch (drc->drc_rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *drro =
 		    &drc->drc_rrd->header.drr_u.drr_object;
 		uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
 		void *buf = NULL;
 		dmu_object_info_t doi;
 
 		if (size != 0)
 			buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 		err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
 		/*
 		 * See receive_read_prefetch for an explanation why we're
 		 * storing this object in the ignore_obj_list.
 		 */
 		if (err == ENOENT || err == EEXIST ||
 		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
 			objlist_insert(drc->drc_ignore_objlist,
 			    drro->drr_object);
 			err = 0;
 		}
 		/*
 		 * Any other dmu_object_info() failure aborts the receive.
 		 * The payload is already attached to drc_rrd, but on error
 		 * the caller frees only the record itself, so the buffer
 		 * must be released here or it would be leaked.
 		 */
 		if (err != 0 && buf != NULL) {
 			drc->drc_rrd->payload = NULL;
 			kmem_free(buf, size);
 		}
 		return (err);
 	}
 	case DRR_FREEOBJECTS:
 	{
 		/* No payload; just pull in the next header. */
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
 		int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
 		if (err != 0) {
 			abd_free(abd);
 			return (err);
 		}
 		/* The record now owns the abd holding the payload. */
 		drc->drc_rrd->abd = abd;
 		receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
 		    drrw->drr_logical_size);
 		return (err);
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
 		    &drc->drc_rrd->header.drr_u.drr_write_embedded;
 		/* The payload is read in a size rounded up to 8 bytes. */
 		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
 		void *buf = kmem_zalloc(size, KM_SLEEP);
 
 		err = receive_read_payload_and_next_header(drc, size, buf);
 		if (err != 0) {
 			kmem_free(buf, size);
 			return (err);
 		}
 
 		receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
 		    drrwe->drr_length);
 		return (err);
 	}
 	case DRR_FREE:
 	case DRR_REDACT:
 	{
 		/*
 		 * It might be beneficial to prefetch indirect blocks here, but
 		 * we don't really have the data to decide for sure.
 		 */
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 	}
 	case DRR_END:
 	{
 		/*
 		 * The END record carries the checksum of everything before
 		 * it; no further header is read.
 		 */
 		struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
 		if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
 		    drre->drr_checksum))
 			return (SET_ERROR(ECKSUM));
 		return (0);
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
 		int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
 		abd_t *abd = abd_alloc_linear(size, B_FALSE);
 		err = receive_read_payload_and_next_header(drc, size,
 		    abd_to_buf(abd));
 		if (err != 0)
 			abd_free(abd);
 		else
 			drc->drc_rrd->abd = abd;
 		return (err);
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		return (err);
 
 	}
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 }
 
 
 
 /*
  * Debug helper: print the fields of the record in rrd together with err.
  * Compiles to an empty function unless ZFS_DEBUG is defined; record types
  * without a case below print nothing.
  */
 static void
 dprintf_drr(struct receive_record_arg *rrd, int err)
 {
 #ifdef ZFS_DEBUG
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
 		struct drr_object *obj = &rrd->header.drr_u.drr_object;
 
 		dprintf("drr_type = OBJECT obj = %llu type = %u "
 		    "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
 		    "compress = %u dn_slots = %u err = %d\n",
 		    (u_longlong_t)obj->drr_object, obj->drr_type,
 		    obj->drr_bonustype, obj->drr_blksz, obj->drr_bonuslen,
 		    obj->drr_checksumtype, obj->drr_compress,
 		    obj->drr_dn_slots, err);
 		break;
 	}
 	case DRR_FREEOBJECTS:
 	{
 		struct drr_freeobjects *fo =
 		    &rrd->header.drr_u.drr_freeobjects;
 
 		dprintf("drr_type = FREEOBJECTS firstobj = %llu "
 		    "numobjs = %llu err = %d\n",
 		    (u_longlong_t)fo->drr_firstobj,
 		    (u_longlong_t)fo->drr_numobjs, err);
 		break;
 	}
 	case DRR_WRITE:
 	{
 		struct drr_write *wr = &rrd->header.drr_u.drr_write;
 
 		dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
 		    "lsize = %llu cksumtype = %u flags = %u "
 		    "compress = %u psize = %llu err = %d\n",
 		    (u_longlong_t)wr->drr_object, wr->drr_type,
 		    (u_longlong_t)wr->drr_offset,
 		    (u_longlong_t)wr->drr_logical_size,
 		    wr->drr_checksumtype, wr->drr_flags,
 		    wr->drr_compressiontype,
 		    (u_longlong_t)wr->drr_compressed_size, err);
 		break;
 	}
 	case DRR_WRITE_BYREF:
 	{
 		struct drr_write_byref *byref =
 		    &rrd->header.drr_u.drr_write_byref;
 
 		dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
 		    "length = %llu toguid = %llx refguid = %llx "
 		    "refobject = %llu refoffset = %llu cksumtype = %u "
 		    "flags = %u err = %d\n",
 		    (u_longlong_t)byref->drr_object,
 		    (u_longlong_t)byref->drr_offset,
 		    (u_longlong_t)byref->drr_length,
 		    (u_longlong_t)byref->drr_toguid,
 		    (u_longlong_t)byref->drr_refguid,
 		    (u_longlong_t)byref->drr_refobject,
 		    (u_longlong_t)byref->drr_refoffset,
 		    byref->drr_checksumtype, byref->drr_flags, err);
 		break;
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *emb =
 		    &rrd->header.drr_u.drr_write_embedded;
 
 		dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
 		    "length = %llu compress = %u etype = %u lsize = %u "
 		    "psize = %u err = %d\n",
 		    (u_longlong_t)emb->drr_object,
 		    (u_longlong_t)emb->drr_offset,
 		    (u_longlong_t)emb->drr_length,
 		    emb->drr_compression, emb->drr_etype,
 		    emb->drr_lsize, emb->drr_psize, err);
 		break;
 	}
 	case DRR_FREE:
 	{
 		struct drr_free *fr = &rrd->header.drr_u.drr_free;
 
 		/* The length is printed signed (%lld). */
 		dprintf("drr_type = FREE obj = %llu offset = %llu "
 		    "length = %lld err = %d\n",
 		    (u_longlong_t)fr->drr_object,
 		    (u_longlong_t)fr->drr_offset,
 		    (longlong_t)fr->drr_length,
 		    err);
 		break;
 	}
 	case DRR_SPILL:
 	{
 		struct drr_spill *sp = &rrd->header.drr_u.drr_spill;
 
 		dprintf("drr_type = SPILL obj = %llu length = %llu "
 		    "err = %d\n", (u_longlong_t)sp->drr_object,
 		    (u_longlong_t)sp->drr_length, err);
 		break;
 	}
 	case DRR_OBJECT_RANGE:
 	{
 		struct drr_object_range *range =
 		    &rrd->header.drr_u.drr_object_range;
 
 		dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
 		    "numslots = %llu flags = %u err = %d\n",
 		    (u_longlong_t)range->drr_firstobj,
 		    (u_longlong_t)range->drr_numslots,
 		    range->drr_flags, err);
 		break;
 	}
 	default:
 		/* Nothing is printed for any other record type. */
 		break;
 	}
 #endif
 }
 
 /*
  * Commit one record to the pool.
  *
  * Returns 0 on success or an errno value.  For a non-healing receive, a
  * DRR_WRITE record that was accepted yields EAGAIN, meaning the record and
  * its buffer have been kept and must not be freed by the caller.
  */
 static int
 receive_process_record(struct receive_writer_arg *rwa,
     struct receive_record_arg *rrd)
 {
 	int err;
 
 	/* Records arrive in order, so bytes_read never goes backwards. */
 	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
 	rwa->bytes_read = rrd->bytes_read;
 
 	if (rrd->header.drr_type != DRR_WRITE) {
 		if (rwa->heal) {
 			/*
 			 * Only write records can be healed; every other
 			 * record is dropped after releasing its data.
 			 */
 			if (rrd->abd != NULL) {
 				abd_free(rrd->abd);
 				rrd->abd = NULL;
 			} else if (rrd->payload != NULL) {
 				kmem_free(rrd->payload, rrd->payload_size);
 				rrd->payload = NULL;
 			}
 			return (0);
 		}
 
 		/* Flush the pending write batch before any other record. */
 		err = flush_write_batch(rwa);
 		if (err != 0) {
 			if (rrd->abd != NULL) {
 				abd_free(rrd->abd);
 				rrd->abd = NULL;
 				rrd->payload = NULL;
 			} else if (rrd->payload != NULL) {
 				kmem_free(rrd->payload, rrd->payload_size);
 				rrd->payload = NULL;
 			}
 
 			return (err);
 		}
 	}
 
 	switch (rrd->header.drr_type) {
 	case DRR_OBJECT:
 		err = receive_object(rwa, &rrd->header.drr_u.drr_object,
 		    rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	case DRR_FREEOBJECTS:
 		err = receive_freeobjects(rwa,
 		    &rrd->header.drr_u.drr_freeobjects);
 		break;
 	case DRR_WRITE:
 		err = receive_process_write_record(rwa, rrd);
 		if (rwa->heal) {
 			/* A healing receive always frees the abd afterwards. */
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		} else if (err != EAGAIN) {
 			/*
 			 * A successful non-healing
 			 * receive_process_write_record() reports EAGAIN to
 			 * say that neither the rrd nor its buffer may be
 			 * freed, so anything else here is a failure.
 			 */
 			ASSERT(err != 0);
 			abd_free(rrd->abd);
 			rrd->abd = NULL;
 		}
 		break;
 	case DRR_WRITE_EMBEDDED:
 		err = receive_write_embedded(rwa,
 		    &rrd->header.drr_u.drr_write_embedded, rrd->payload);
 		kmem_free(rrd->payload, rrd->payload_size);
 		rrd->payload = NULL;
 		break;
 	case DRR_FREE:
 		err = receive_free(rwa, &rrd->header.drr_u.drr_free);
 		break;
 	case DRR_SPILL:
 		/* receive_spill() frees the abd itself when it returns 0. */
 		err = receive_spill(rwa, &rrd->header.drr_u.drr_spill,
 		    rrd->abd);
 		if (err != 0)
 			abd_free(rrd->abd);
 		rrd->abd = NULL;
 		rrd->payload = NULL;
 		break;
 	case DRR_OBJECT_RANGE:
 		err = receive_object_range(rwa,
 		    &rrd->header.drr_u.drr_object_range);
 		break;
 	case DRR_REDACT:
 		err = receive_redact(rwa, &rrd->header.drr_u.drr_redact);
 		break;
 	default:
 		err = (SET_ERROR(EINVAL));
 		break;
 	}
 
 	if (err != 0)
 		dprintf_drr(rrd, err);
 
 	return (err);
 }
 
 /*
  * Worker thread of dmu_recv_stream: take records off the queue and feed
  * them to receive_process_record() until the end-of-stream marker shows up,
  * then signal the main thread and exit.
  */
 static __attribute__((noreturn)) void
 receive_writer_thread(void *arg)
 {
 	struct receive_writer_arg *rwa = arg;
 	struct receive_record_arg *rrd;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	for (;;) {
 		int err = 0;
 
 		rrd = bqueue_dequeue(&rwa->q);
 		if (rrd->eos_marker)
 			break;
 
 		/*
 		 * After an error the main thread stops queueing records,
 		 * but whatever is already queued still has to be drained
 		 * and released before this thread may exit.
 		 */
 		if (rwa->err != 0) {
 			if (rrd->abd != NULL) {
 				abd_free(rrd->abd);
 				rrd->abd = NULL;
 				rrd->payload = NULL;
 			} else if (rrd->payload != NULL) {
 				kmem_free(rrd->payload, rrd->payload_size);
 				rrd->payload = NULL;
 			}
 		} else {
 			err = receive_process_record(rwa, rrd);
 		}
 
 		/*
 		 * EAGAIN means the record was saved (on rwa->write_batch)
 		 * and will be used again, so it must stay allocated.  When
 		 * healing, the record is always freed.
 		 */
 		if (err == EAGAIN && !rwa->heal)
 			continue;
 
 		/* Only the first error is recorded. */
 		if (rwa->err == 0)
 			rwa->err = err;
 		kmem_free(rrd, sizeof (*rrd));
 	}
 	/* Release the end-of-stream marker itself. */
 	kmem_free(rrd, sizeof (*rrd));
 
 	if (!rwa->heal) {
 		int err = flush_write_batch(rwa);
 		if (rwa->err == 0)
 			rwa->err = err;
 	} else {
 		zio_wait(rwa->heal_pio);
 	}
 
 	/* Let the main thread know that all work is finished. */
 	mutex_enter(&rwa->mutex);
 	rwa->done = B_TRUE;
 	cv_signal(&rwa->cv);
 	mutex_exit(&rwa->mutex);
 	spl_fstrans_unmark(cookie);
 	thread_exit();
 }
 
 /*
  * Check that the "resume_object" and "resume_offset" values in begin_nvl
  * agree with the values stored for the dataset in the MOS.
  * Returns 0 when both match, EINVAL when either one is missing or differs.
  */
 static int
 resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
 {
 	objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
 	uint64_t dsobj = dmu_objset_id(drc->drc_os);
 	uint64_t resume_obj, resume_off;
 	uint64_t val;
 
 	/* Both values have to be present in the begin nvlist. */
 	if (nvlist_lookup_uint64(begin_nvl, "resume_object",
 	    &resume_obj) != 0)
 		return (SET_ERROR(EINVAL));
 	if (nvlist_lookup_uint64(begin_nvl, "resume_offset",
 	    &resume_off) != 0)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
 	if (val != resume_obj)
 		return (SET_ERROR(EINVAL));
 
 	VERIFY0(zap_lookup(mos, dsobj,
 	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
 	if (val != resume_off)
 		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
  * Read in the stream's records, one by one, and apply them to the pool.  There
  * are two threads involved; the thread that calls this function will spin up a
  * worker thread, read the records off the stream one by one, and issue
  * prefetches for any necessary indirect blocks.  It will then push the records
  * onto an internal blocking queue.  The worker thread will pull the records off
  * the queue, and actually write the data into the DMU.  This way, the worker
  * thread doesn't have to wait for reads to complete, since everything it needs
  * (the indirect blocks) will be prefetched.
  *
  * drc is the receive cookie describing the receive in progress; *voffp is
  * set to drc->drc_voff before returning, on success and on failure alike.
  *
  * Returns 0 on success or an errno value.  On failure the dataset is cleaned
  * up through dmu_recv_cleanup_ds() and drc->drc_keynvl is freed.  In both
  * cases drc->drc_os is reset to NULL and drc->drc_begin_nvl is freed.
  *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
 dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
 {
 	int err = 0;
 	/* Zero-filled, so rwa->err, rwa->done and rwa->max_object start at 0. */
 	struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
 
 	if (dsl_dataset_has_resume_receive_state(drc->drc_ds)) {
 		uint64_t bytes = 0;
 		/*
 		 * The lookup result is deliberately ignored; if it fails,
 		 * bytes stays 0 and the counter is left unchanged.
 		 */
 		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
 		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
 		    sizeof (bytes), 1, &bytes);
 		drc->drc_bytes_read += bytes;
 	}
 
 	drc->drc_ignore_objlist = objlist_create();
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
 	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 	ASSERT0(drc->drc_os->os_encrypted &&
 	    (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
 
 	/* handle DSL encryption key payload */
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
 		nvlist_t *keynvl = NULL;
 
 		ASSERT(drc->drc_os->os_encrypted);
 		ASSERT(drc->drc_raw);
 
 		err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
 		    &keynvl);
 		if (err != 0)
 			goto out;
 
 		if (!drc->drc_heal) {
 			/*
 			 * If this is a new dataset we set the key immediately.
 			 * Otherwise we don't want to change the key until we
 			 * are sure the rest of the receive succeeded so we
 			 * stash the keynvl away until then.
 			 */
 			err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
 			    drc->drc_ds->ds_object, drc->drc_fromsnapobj,
 			    drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
 			if (err != 0)
 				goto out;
 		}
 
 		/* see comment in dmu_recv_end_sync() */
 		drc->drc_ivset_guid = 0;
 		(void) nvlist_lookup_uint64(keynvl, "to_ivset_guid",
 		    &drc->drc_ivset_guid);
 
 		/* Keep a private copy of the key data for existing datasets. */
 		if (!drc->drc_newfs)
 			drc->drc_keynvl = fnvlist_dup(keynvl);
 	}
 
 	if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
 		err = resume_check(drc, drc->drc_begin_nvl);
 		if (err != 0)
 			goto out;
 	}
 
 	/*
 	 * For compatibility with recursive send streams, we do this here,
 	 * rather than in dmu_recv_begin. If we pull the next header too
 	 * early, and it's the END record, we break the `recv_skip` logic.
 	 */
 	if (drc->drc_drr_begin->drr_payloadlen == 0) {
 		err = receive_read_payload_and_next_header(drc, 0, NULL);
 		if (err != 0)
 			goto out;
 	}
 
 	/*
 	 * If we failed before this point we will clean up any new resume
 	 * state that was created. Now that we've gotten past the initial
 	 * checks we are ok to retain that resume state.
 	 */
 	drc->drc_should_save = B_TRUE;
 
 	/*
 	 * Set up the queue and the writer thread's argument block.  Every
 	 * "goto out" above happens before this point, so the teardown of
 	 * these objects further down is never reached without this setup.
 	 */
 	(void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
 	    MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
 	    offsetof(struct receive_record_arg, node));
 	cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
 	rwa->os = drc->drc_os;
 	rwa->byteswap = drc->drc_byteswap;
 	rwa->heal = drc->drc_heal;
 	rwa->tofs = drc->drc_tofs;
 	rwa->resumable = drc->drc_resumable;
 	rwa->raw = drc->drc_raw;
 	rwa->spill = drc->drc_spill;
 	/* A stream with a zero fromguid is a full (non-incremental) one. */
 	rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
 	rwa->os->os_raw_receive = drc->drc_raw;
 	if (drc->drc_heal) {
 		rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL,
 		    ZIO_FLAG_GODFATHER);
 	}
 	list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
 	    offsetof(struct receive_record_arg, node.bqn_node));
 
 	(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
 	    TS_RUN, minclsyspri);
 	/*
 	 * We're reading rwa->err without locks, which is safe since we are the
 	 * only reader, and the worker thread is the only writer.  It's ok if we
 	 * miss a write for an iteration or two of the loop, since the writer
 	 * thread will keep freeing records we send it until we send it an eos
 	 * marker.
 	 *
 	 * We can leave this loop in 3 ways:  First, if rwa->err is
 	 * non-zero.  In that case, the writer thread will free the rrd we just
 	 * pushed.  Second, if we're interrupted; in that case, either it's the
 	 * first loop and drc->drc_rrd was never allocated, or it's later, and
 	 * drc->drc_rrd has been handed off to the writer thread who will free
 	 * it.  Finally, if receive_read_record fails or we're at the end of the
 	 * stream, then we free drc->drc_rrd and exit.
 	 */
 	while (rwa->err == 0) {
 		if (issig()) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
 		/* The header read ahead last time becomes the current record. */
 		ASSERT3P(drc->drc_rrd, ==, NULL);
 		drc->drc_rrd = drc->drc_next_rrd;
 		drc->drc_next_rrd = NULL;
 		/* Allocates and loads header into drc->drc_next_rrd */
 		err = receive_read_record(drc);
 
 		if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
 			kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
 			drc->drc_rrd = NULL;
 			break;
 		}
 
 		/* Hand the record to the writer; it is no longer ours. */
 		bqueue_enqueue(&rwa->q, drc->drc_rrd,
 		    sizeof (struct receive_record_arg) +
 		    drc->drc_rrd->payload_size);
 		drc->drc_rrd = NULL;
 	}
 
 	/*
 	 * Queue an end-of-stream marker; the writer thread leaves its loop
 	 * when it dequeues a record with eos_marker set, and frees it.
 	 */
 	ASSERT3P(drc->drc_rrd, ==, NULL);
 	drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP);
 	drc->drc_rrd->eos_marker = B_TRUE;
 	bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1);
 
 	/* Wait until the writer thread reports that it has finished. */
 	mutex_enter(&rwa->mutex);
 	while (!rwa->done) {
 		/*
 		 * We need to use cv_wait_sig() so that any process that may
 		 * be sleeping here can still fork.
 		 */
 		(void) cv_wait_sig(&rwa->cv, &rwa->mutex);
 	}
 	mutex_exit(&rwa->mutex);
 
 	/*
 	 * If we are receiving a full stream as a clone, all object IDs which
 	 * are greater than the maximum ID referenced in the stream are
 	 * by definition unused and must be freed.
 	 */
 	if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
 		uint64_t obj = rwa->max_object + 1;
 		int free_err = 0;
 		int next_err = 0;
 
 		/* ENOENT from the free is tolerated; anything else stops. */
 		while (next_err == 0) {
 			free_err = dmu_free_long_object(rwa->os, obj);
 			if (free_err != 0 && free_err != ENOENT)
 				break;
 
 			next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
 		}
 
 		/*
 		 * An earlier error is not overwritten.  ESRCH from
 		 * dmu_object_next() is not reported as an error.
 		 */
 		if (err == 0) {
 			if (free_err != 0 && free_err != ENOENT)
 				err = free_err;
 			else if (next_err != ESRCH)
 				err = next_err;
 		}
 	}
 
 	cv_destroy(&rwa->cv);
 	mutex_destroy(&rwa->mutex);
 	bqueue_destroy(&rwa->q);
 	list_destroy(&rwa->write_batch);
 	/* An error seen by this thread takes precedence over the writer's. */
 	if (err == 0)
 		err = rwa->err;
 
 out:
 	/*
 	 * If we hit an error before we started the receive_writer_thread
 	 * we need to clean up the next_rrd we create by processing the
 	 * DRR_BEGIN record.
 	 */
 	if (drc->drc_next_rrd != NULL)
 		kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
 
 	/*
 	 * The objset will be invalidated by dmu_recv_end() when we do
 	 * dsl_dataset_clone_swap_sync_impl().
 	 */
 	drc->drc_os = NULL;
 
 	kmem_free(rwa, sizeof (*rwa));
 	nvlist_free(drc->drc_begin_nvl);
 
 	if (err != 0) {
 		/*
 		 * Clean up references. If receive is not resumable,
 		 * destroy what we created, so we don't leave it in
 		 * the inconsistent state.
 		 */
 		dmu_recv_cleanup_ds(drc);
 		nvlist_free(drc->drc_keynvl);
 	}
 
 	objlist_destroy(drc->drc_ignore_objlist);
 	drc->drc_ignore_objlist = NULL;
 	/* Report the stream offset back to the caller in every case. */
 	*voffp = drc->drc_voff;
 	return (err);
 }
 
 /*
  * Check whether the receive described by arg (a dmu_recv_cookie_t) can be
  * finished in tx.  A healing receive needs no checks.  A receive into a new
  * filesystem only needs the snapshot check on drc_ds.  Otherwise the target
  * drc_tofs is held and the forced snapshot destroys, the raw key, the clone
  * swap, the new snapshot and the destroy of drc_ds are checked in turn.
  * Returns 0 when everything passes, or the first errno value encountered.
  */
 static int
 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *origin_head;
 	int error;
 
 	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
 
 	if (drc->drc_heal)
 		return (0);
 
 	if (drc->drc_newfs) {
 		return (dsl_dataset_snapshot_check_impl(drc->drc_ds,
 		    drc->drc_tosnap, tx, B_TRUE, 1,
 		    drc->drc_cred, drc->drc_proc));
 	}
 
 	error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
 	if (error != 0)
 		return (error);
 
 	if (drc->drc_force) {
 		/*
 		 * Snapshots in tofs (i.e. before origin_head) that come
 		 * after the origin are going to be destroyed.  The origin
 		 * is the snap before drc_ds, because drc_ds can not have
 		 * any snaps of its own.
 		 */
 		uint64_t obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 
 		while (obj !=
 		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 			dsl_dataset_t *snap;
 
 			error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap);
 			if (error != 0)
 				break;
 			/* Every such snapshot must live in the target's dir. */
 			if (snap->ds_dir != origin_head->ds_dir)
 				error = SET_ERROR(EINVAL);
 			if (error == 0) {
 				error = dsl_destroy_snapshot_check_impl(
 				    snap, B_FALSE);
 			}
 			/* Step to the previous snapshot before releasing. */
 			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 			dsl_dataset_rele(snap, FTAG);
 			if (error != 0)
 				break;
 		}
 		if (error != 0)
 			goto rele;
 	}
 
 	if (drc->drc_keynvl != NULL) {
 		error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
 		    drc->drc_keynvl, tx);
 		if (error != 0)
 			goto rele;
 	}
 
 	error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
 	    origin_head, drc->drc_force, drc->drc_owner, tx);
 	if (error != 0)
 		goto rele;
 
 	error = dsl_dataset_snapshot_check_impl(origin_head,
 	    drc->drc_tosnap, tx, B_TRUE, 1,
 	    drc->drc_cred, drc->drc_proc);
 
 rele:
 	/* The hold on the target is dropped on success and failure alike. */
 	dsl_dataset_rele(origin_head, FTAG);
 	if (error != 0)
 		return (error);
 
 	return (dsl_destroy_head_check_impl(drc->drc_ds, 1));
 }
 
 /*
  * Sync-context half of dmu_recv_end(): commit a finished receive.
  *
  * arg is the dmu_recv_cookie_t describing the receive and tx is the
  * transaction the changes are made in.  Depending on the cookie:
  *  - drc_heal: only the received key nvlist (if any) is freed.
  *  - !drc_newfs: drc_ds is clone-swapped into the existing dataset
  *    drc_tofs, the drc_tosnap snapshot is taken on drc_tofs, and drc_ds is
  *    destroyed.  With drc_force, snapshots of drc_tofs newer than the
  *    snapshot preceding drc_ds are destroyed first.
  *  - drc_newfs: the drc_tosnap snapshot is taken on drc_ds itself and any
  *    resume-receive state stored on it is removed.
  * In every case drc_ds is disowned and drc->drc_ds is cleared before
  * returning.
  */
 static void
 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_recv_cookie_t *drc = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	/* Sampled up front: drc_ds may be destroyed further down. */
 	boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
 	uint64_t newsnapobj = 0;
 
 	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
 	    tx, "snap=%s", drc->drc_tosnap);
 	drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
 
 	if (drc->drc_heal) {
 		if (drc->drc_keynvl != NULL) {
 			nvlist_free(drc->drc_keynvl);
 			drc->drc_keynvl = NULL;
 		}
 	} else if (!drc->drc_newfs) {
 		dsl_dataset_t *origin_head;
 
 		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
 		    &origin_head));
 
 		if (drc->drc_force) {
 			/*
 			 * Destroy any snapshots of drc_tofs (origin_head)
 			 * after the origin (the snap before drc_ds).
 			 */
 			uint64_t obj;
 
 			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 			while (obj !=
 			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
 				dsl_dataset_t *snap;
 				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
 				    &snap));
 				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
 				/* Fetch the next object before destroying. */
 				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
 				dsl_destroy_snapshot_sync_impl(snap,
 				    B_FALSE, tx);
 				dsl_dataset_rele(snap, FTAG);
 			}
 		}
 		if (drc->drc_keynvl != NULL) {
 			dsl_crypto_recv_raw_key_sync(drc->drc_ds,
 			    drc->drc_keynvl, tx);
 			nvlist_free(drc->drc_keynvl);
 			drc->drc_keynvl = NULL;
 		}
 
 		VERIFY3P(drc->drc_ds->ds_prev, ==,
 		    origin_head->ds_prev);
 
 		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
 		    origin_head, tx);
 		/*
 		 * The objset was evicted by dsl_dataset_clone_swap_sync_impl,
 		 * so drc_os is no longer valid.
 		 */
 		drc->drc_os = NULL;
 
 		dsl_dataset_snapshot_sync_impl(origin_head,
 		    drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 		dsl_dataset_phys(origin_head)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		newsnapobj =
 		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
 
 		/*
 		 * Verify the owner while our hold on origin_head is still in
 		 * place; it must not be dereferenced once the hold has been
 		 * released below.
 		 */
 		if (drc->drc_owner != NULL)
 			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
 
 		dsl_dataset_rele(origin_head, FTAG);
 		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
 	} else {
 		dsl_dataset_t *ds = drc->drc_ds;
 
 		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
 
 		/* set snapshot's creation time and guid */
 		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
 		    drc->drc_drrb->drr_creation_time;
 		dsl_dataset_phys(ds->ds_prev)->ds_guid =
 		    drc->drc_drrb->drr_toguid;
 		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
 		    ~DS_FLAG_INCONSISTENT;
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
 		if (dsl_dataset_has_resume_receive_state(ds)) {
 			/* Best effort: zap_remove() results are ignored. */
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_FROMGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OBJECT, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_OFFSET, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_BYTES, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TOGUID, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_TONAME, tx);
 			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
 			    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
 		}
 		newsnapobj =
 		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
 	}
 
 	/*
 	 * If this is a raw receive, the crypt_keydata nvlist will include
 	 * a to_ivset_guid for us to set on the new snapshot. This value
 	 * will override the value generated by the snapshot code. However,
 	 * this value may not be present, because older implementations of
 	 * the raw send code did not include this value, and we are still
 	 * allowed to receive them if the zfs_disable_ivset_guid_check
 	 * tunable is set, in which case we will leave the newly-generated
 	 * value.
 	 */
 	if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) {
 		dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
 		    DMU_OT_DSL_DATASET, tx);
 		VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
 		    DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
 		    &drc->drc_ivset_guid, tx));
 	}
 
 	/*
 	 * Release the hold from dmu_recv_begin.  This must be done before
 	 * we return to open context, so that when we free the dataset's dnode
 	 * we can evict its bonus buffer. Since the dataset may be destroyed
 	 * at this point (and therefore won't have a valid pointer to the spa)
 	 * we release the key mapping manually here while we do have a valid
 	 * pointer, if it exists.
 	 */
 	if (!drc->drc_raw && encrypted) {
 		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
 		    drc->drc_ds->ds_object, drc->drc_ds);
 	}
 	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
 	drc->drc_ds = NULL;
 }
 
 /*
  * Passed as the fifth argument of dsl_sync_task() by both
  * dmu_recv_existing_end() and dmu_recv_new_end().
  */
 static int dmu_recv_end_modified_blocks = 3;
 
 /*
  * Finish a receive into an existing dataset by running the
  * dmu_recv_end_check/dmu_recv_end_sync sync task on drc_tofs.
  * Returns the result of dsl_sync_task().
  */
 static int
 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
 	int error;
 
 #ifdef _KERNEL
 	{
 		/*
 		 * We will be destroying the ds; make sure its origin is
 		 * unmounted if necessary.
 		 */
 		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 		dsl_dataset_name(drc->drc_ds, dsname);
 		zfs_destroy_unmount_origin(dsname);
 	}
 #endif
 
 	error = dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 	return (error);
 }
 
 /*
  * Finish a receive that created a new dataset by running the
  * dmu_recv_end_check/dmu_recv_end_sync sync task on drc_tofs.
  * Returns the result of dsl_sync_task().
  */
 static int
 dmu_recv_new_end(dmu_recv_cookie_t *drc)
 {
 	int error;
 
 	error = dsl_sync_task(drc->drc_tofs,
 	    dmu_recv_end_check, dmu_recv_end_sync, drc,
 	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
 	return (error);
 }
 
 /*
  * Complete a receive.
  *
  * Records 'owner' in the cookie and runs the end-of-receive sync task via
  * dmu_recv_new_end() or dmu_recv_existing_end(), depending on drc_newfs.
  *
  * On failure the receive dataset is cleaned up and the key nvlist is freed.
  * On success, unless this was a healing receive (drc_heal),
  * zvol_create_minor() is called for the new filesystem (drc_newfs only)
  * and for the new snapshot "<drc_tofs>@<drc_tosnap>".
  *
  * Returns 0 on success or the error from the sync task.
  */
 int
 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
 	int error;
 
 	drc->drc_owner = owner;
 
 	if (drc->drc_newfs)
 		error = dmu_recv_new_end(drc);
 	else
 		error = dmu_recv_existing_end(drc);
 
 	if (error != 0) {
 		dmu_recv_cleanup_ds(drc);
 		nvlist_free(drc->drc_keynvl);
 		/*
 		 * Do not leave a dangling pointer in the cookie; this
 		 * mirrors what dmu_recv_end_sync() does after freeing it.
 		 */
 		drc->drc_keynvl = NULL;
 	} else if (!drc->drc_heal) {
 		if (drc->drc_newfs) {
 			zvol_create_minor(drc->drc_tofs);
 		}
 		char *snapname = kmem_asprintf("%s@%s",
 		    drc->drc_tofs, drc->drc_tosnap);
 		zvol_create_minor(snapname);
 		kmem_strfree(snapname);
 	}
 	return (error);
 }
 
 /*
  * Report whether a receive is in progress on this objset, i.e. whether it
  * has a dataset and that dataset's owner is dmu_recv_tag.
  */
 boolean_t
 dmu_objset_is_receiving(objset_t *os)
 {
 	const boolean_t has_ds = (os->os_dsl_dataset != NULL);
 
 	return (has_ds && os->os_dsl_dataset->ds_owner == dmu_recv_tag);
 }
 
 /*
  * Module parameters for the receive code.  Each entry names a zfs_recv_*
  * variable, its type and a one-line description; all are declared ZMOD_RW.
  */
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, UINT, ZMOD_RW,
 	"Maximum receive queue length");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, UINT, ZMOD_RW,
 	"Receive queue fill fraction");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, UINT, ZMOD_RW,
 	"Maximum amount of writes to batch into one transaction");
 
 ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW,
 	"Ignore errors during corrective receive");
-/* END CSTYLED */
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 15cc2885e805..aa0434f3c722 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -1,823 +1,822 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/callb.h>
 #include <sys/zfeature.h>
 
 /*
  * Prefetch throttle: traverse_prefetcher() waits while pd_bytes_fetched is
  * at or above this many bytes.
  */
 static int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
 /*
  * When non-zero, traverse_visitbp() never skips a block solely because its
  * logical birth txg is 0.
  */
 static int32_t send_holes_without_birth_time = 1;
 /*
  * Upper bound on the number of indirect-block prefetches issued per batch
  * in traverse_visitbp(); 0 disables that prefetching.
  */
 static uint_t zfs_traverse_indirect_prefetch_limit = 32;
 
 /*
  * State shared between a traversal and its data-prefetch thread.
  */
 typedef struct prefetch_data {
 	kmutex_t pd_mtx;	/* held around pd_cv waits and broadcasts */
 	kcondvar_t pd_cv;	/* broadcast on fetch/consume/cancel/exit */
 	/* += LSIZE in traverse_prefetcher(), -= LSIZE in traverse_visitbp() */
 	int32_t pd_bytes_fetched;
 	int pd_flags;		/* copy of the traversal's TRAVERSE_* flags */
 	boolean_t pd_cancel;	/* set by traverse_impl() to stop prefetching */
 	boolean_t pd_exited;	/* prefetch thread finished or never started */
 	zbookmark_phys_t pd_resume;	/* prefetch thread's own td_resume */
 } prefetch_data_t;
 
 /*
  * Per-traversal state threaded through the traverse_*() helpers.
  */
 typedef struct traverse_data {
 	spa_t *td_spa;
 	uint64_t td_objset;	/* objset id stored in generated bookmarks */
 	blkptr_t *td_rootbp;	/* block pointer the traversal starts from */
 	uint64_t td_min_txg;	/* blocks born at or before this are skipped */
 	zbookmark_phys_t *td_resume;	/* optional resume bookmark (in/out) */
 	int td_flags;		/* TRAVERSE_* flags */
 	prefetch_data_t *td_pfd;	/* NULL in the prefetch thread's copy */
 	boolean_t td_paused;	/* td_resume has already been recorded */
 	/* txg hole_birth was enabled in; UINT64_MAX if it is not active */
 	uint64_t td_hole_birth_enabled_txg;
 	blkptr_cb_t *td_func;	/* callback invoked for visited blocks */
 	void *td_arg;		/* opaque argument passed to td_func */
 	/* B_FALSE once object-number reuse has been ruled out */
 	boolean_t td_realloc_possible;
 } traverse_data_t;
 
 static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
     const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
     uint64_t objset, uint64_t object);
 
 /*
  * zil_parse() block callback: hand a ZIL block to the traversal callback.
  *
  * Holes are ignored.  Returns -1 for a block of an unclaimed log
  * (claim_txg == 0) whose logical birth is at or after spa_min_claim_txg();
  * otherwise the block is reported to td_func and 0 is returned.  The
  * callback's own return value is discarded.
  */
 static int
 traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
 	zbookmark_phys_t zil_zb;
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	if (claim_txg == 0) {
 		if (BP_GET_LOGICAL_BIRTH(bp) >=
 		    spa_min_claim_txg(td->td_spa))
 			return (-1);
 	}
 
 	SET_BOOKMARK(&zil_zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 	(void) td->td_func(td->td_spa, zilog, bp, &zil_zb, NULL, td->td_arg);
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback: report the block pointer embedded in a
  * TX_WRITE record to the traversal callback.
  *
  * Other record types, holes, unclaimed logs (claim_txg == 0) and blocks
  * born before claim_txg are ignored.  Always returns 0; the callback's
  * return value is discarded.
  */
 static int
 traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
     uint64_t claim_txg)
 {
 	traverse_data_t *td = arg;
 	lr_write_t *lrw;
 	blkptr_t *wbp;
 	zbookmark_phys_t wzb;
 
 	/* Only TX_WRITE records are examined here. */
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	lrw = (lr_write_t *)lrc;
 	wbp = &lrw->lr_blkptr;
 
 	if (BP_IS_HOLE(wbp))
 		return (0);
 	if (claim_txg == 0)
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(wbp) < claim_txg)
 		return (0);
 
 	/* The block id below divides by the logical size. */
 	ASSERT3U(BP_GET_LSIZE(wbp), !=, 0);
 	SET_BOOKMARK(&wzb, td->td_objset, lrw->lr_foid,
 	    ZB_ZIL_LEVEL, lrw->lr_offset / BP_GET_LSIZE(wbp));
 
 	(void) td->td_func(td->td_spa, zilog, wbp, &wzb, NULL, td->td_arg);
 	return (0);
 }
 
 /*
  * Walk the intent log described by zh, feeding its blocks and write
  * records to traverse_zil_block() and traverse_zil_record().
  */
 static void
 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
 	uint64_t claim_txg = zh->zh_claim_txg;
 	zilog_t *zl;
 
 	/*
 	 * We only want to visit blocks that have been claimed but not yet
 	 * replayed; plus blocks that are already stable in read-only mode.
 	 */
 	if (claim_txg == 0) {
 		if (spa_writeable(td->td_spa))
 			return;
 	}
 
 	zl = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 	(void) zil_parse(zl, traverse_zil_block, traverse_zil_record, td,
 	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
 	zil_free(zl);
 }
 
 /*
  * Result of resume_skip_check(): how much of a block's subtree a resumed
  * traversal may skip.
  */
 typedef enum resume_skip {
 	RESUME_SKIP_ALL,	/* skip the block and everything below it */
 	RESUME_SKIP_NONE,	/* visit the block and its children */
 	RESUME_SKIP_CHILDREN	/* visit the block but not its children */
 } resume_skip_t;
 
 /*
  * Decide how a resumed traversal should treat the block identified by zb.
  *
  * RESUME_SKIP_ALL: the block and its whole subtree were already visited
  * and need not be looked at again.
  * RESUME_SKIP_CHILDREN: this is a post-order traversal and zb is exactly
  * the resume point, so the block itself is visited but its children (which
  * must have been visited by a previous traversal) are not.
  * RESUME_SKIP_NONE: anything else, including traversals with no resume
  * bookmark.
  */
 static resume_skip_t
 resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *resume = td->td_resume;
 
 	if (resume == NULL)
 		return (RESUME_SKIP_NONE);
 
 	/* Already visited this bp and everything below it. */
 	if (zbookmark_subtree_completed(dnp, zb, resume))
 		return (RESUME_SKIP_ALL);
 
 	if ((td->td_flags & TRAVERSE_POST) &&
 	    memcmp(zb, resume, sizeof (*zb)) == 0)
 		return (RESUME_SKIP_CHILDREN);
 
 	return (RESUME_SKIP_NONE);
 }
 
 /*
  * Issue a speculative, no-wait ARC read for the metadata block bp.
  * Returns B_TRUE if a read was issued and B_FALSE if the block was not
  * eligible for prefetch.
  */
 static boolean_t
 traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	arc_flags_t aflags;
 	int zflags;
 
 	if ((td->td_flags & TRAVERSE_PREFETCH_METADATA) == 0)
 		return (B_FALSE);
 
 	/*
 	 * If this bp is before the resume point, it may have already been
 	 * freed.
 	 */
 	if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
 		return (B_FALSE);
 
 	if (BP_IS_HOLE(bp))
 		return (B_FALSE);
 	if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
 		return (B_FALSE);
 
 	/* Of the level-0 blocks, only dnode blocks are prefetched. */
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return (B_FALSE);
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	zflags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zflags |= ZIO_FLAG_RAW;
 
 	aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_PRESCIENT_PREFETCH;
 	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, zflags, &aflags, zb);
 
 	return (B_TRUE);
 }
 
 /*
  * Return B_TRUE if bp is a block the data prefetcher should read: not a
  * hole, not embedded, not an intent-log block and not redacted.
  */
 static boolean_t
 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 {
 	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 
 	if (BP_IS_HOLE(bp))
 		return (B_FALSE);
 	if (BP_IS_EMBEDDED(bp))
 		return (B_FALSE);
 	if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 		return (B_FALSE);
 	if (BP_IS_REDACTED(bp))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Visit the block pointed to by bp (identified by bookmark zb) and,
  * recursively, everything reachable below it.
  *
  * td_func is invoked before the children when TRAVERSE_PRE is set and
  * after them when TRAVERSE_POST is set; holes and redacted blocks are
  * reported once regardless of those flags.  dnp is the dnode the block
  * belongs to and may be NULL (the root block is visited with dnp == NULL).
  *
  * Returns 0 or the first error from td_func/arc_read.  With TRAVERSE_HARD,
  * EIO and ECKSUM are turned into 0 so the traversal continues.  The first
  * time a non-zero error is about to be returned and td_resume is non-NULL,
  * td_resume is filled in with the level-0 bookmark to resume from.
  */
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	int err = 0;
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 
 	switch (resume_skip_check(td, dnp, zb)) {
 	case RESUME_SKIP_ALL:
 		return (0);
 	case RESUME_SKIP_CHILDREN:
 		goto post;
 	case RESUME_SKIP_NONE:
 		break;
 	default:
 		ASSERT(0);
 	}
 
 	if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
 		/*
 		 * Since this block has a birth time of 0 it must be one of
 		 * two things: a hole created before the
 		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
 		 * which has always been a hole in an object.
 		 *
 		 * If a file is written sparsely, then the unwritten parts of
 		 * the file were "always holes" -- that is, they have been
 		 * holes since this object was allocated.  However, we (and
 		 * our callers) can not necessarily tell when an object was
 		 * allocated.  Therefore, if it's possible that this object
 		 * was freed and then its object number reused, we need to
 		 * visit all the holes with birth==0.
 		 *
 		 * If it isn't possible that the object number was reused,
 		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
 		 * all the blocks we will visit as part of this traversal,
 		 * then this hole must have always existed, so we can skip
 		 * it.  We visit blocks born after (exclusive) td_min_txg.
 		 *
 		 * Note that the meta-dnode cannot be reallocated.
 		 */
 		if (!send_holes_without_birth_time &&
 		    (!td->td_realloc_possible ||
 		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
 		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
 			return (0);
 	} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
 		return (0);
 	}
 
 	/*
 	 * Pace ourselves against the prefetch thread: wait until it has
 	 * fetched at least this block's logical size (or has exited), then
 	 * consume that much of the prefetched-bytes budget.
 	 */
 	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
 		uint64_t size = BP_GET_LSIZE(bp);
 		mutex_enter(&pd->pd_mtx);
 		ASSERT(pd->pd_bytes_fetched >= 0);
 		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
 			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
 		pd->pd_bytes_fetched -= size;
 		cv_broadcast(&pd->pd_cv);
 		mutex_exit(&pd->pd_mtx);
 	}
 
 	/*
 	 * Holes and redacted blocks are not read or descended into: report
 	 * them to the callback once and stop here.
 	 */
 	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 		if (err != 0)
 			goto post;
 		return (0);
 	}
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
 		    td->td_arg);
 		/* The callback asked us not to descend; not an error. */
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* Indirect block: read it and visit each child pointer. */
 		uint32_t flags = ARC_FLAG_WAIT;
 		int32_t i, ptidx, pidx;
 		uint32_t prefetchlimit;
 		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		zbookmark_phys_t *czb;
 
 		ASSERT(!BP_IS_PROTECTED(bp));
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 		/*
 		 * When performing a traversal it is beneficial to
 		 * asynchronously read-ahead the upcoming indirect
 		 * blocks since they will be needed shortly. However,
 		 * since a 128k indirect (non-L0) block may contain up
 		 * to 1024 128-byte block pointers, it's preferable to not
 		 * prefetch them all at once. Issuing a large number of
 		 * async reads may affect performance, and the earlier
 		 * the indirect blocks are prefetched the less likely
 		 * they are to still be resident in the ARC when needed.
 		 * Therefore, prefetching indirect blocks is limited to
 		 * zfs_traverse_indirect_prefetch_limit=32 blocks by
 		 * default.
 		 *
 		 * pidx: Index for which next prefetch to be issued.
 		 * ptidx: Index at which next prefetch to be triggered.
 		 */
 		ptidx = 0;
 		pidx = 1;
 		prefetchlimit = zfs_traverse_indirect_prefetch_limit;
 		for (i = 0; i < epb; i++) {
 			if (prefetchlimit && i == ptidx) {
 				ASSERT3S(ptidx, <=, pidx);
 				for (uint32_t  prefetched = 0; pidx < epb &&
 				    prefetched < prefetchlimit; pidx++) {
 					SET_BOOKMARK(czb, zb->zb_objset,
 					    zb->zb_object, zb->zb_level - 1,
 					    zb->zb_blkid * epb + pidx);
 					if (traverse_prefetch_metadata(td, dnp,
 					    &((blkptr_t *)buf->b_data)[pidx],
 					    czb) == B_TRUE) {
 						prefetched++;
 						/*
 						 * Trigger the next batch once
 						 * we reach the middle of this
 						 * one.
 						 */
 						if (prefetched ==
 						    MAX(prefetchlimit / 2, 1))
 							ptidx = pidx;
 					}
 				}
 			}
 
 			/* recursively visitbp() blocks below this */
 			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp,
 			    &((blkptr_t *)buf->b_data)[i], czb);
 			if (err != 0)
 				break;
 		}
 
 		kmem_free(czb, sizeof (zbookmark_phys_t));
 
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* Level-0 dnode block: visit every dnode stored in it. */
 		uint32_t flags = ARC_FLAG_WAIT;
 		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
 		int32_t i;
 		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		dnode_phys_t *child_dnp;
 
 		/*
 		 * dnode blocks might have their bonus buffers encrypted, so
 		 * we must be careful to honor TRAVERSE_NO_DECRYPT
 		 */
 		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 			zio_flags |= ZIO_FLAG_RAW;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		child_dnp = buf->b_data;
 
 		/* A dnode spans dn_extra_slots + 1 slots; step over them. */
 		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
 			prefetch_dnode_metadata(td, &child_dnp[i],
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
 			err = traverse_dnode(td, bp, &child_dnp[i],
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 			if (err != 0)
 				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* Objset block: visit the meta-dnode and accounting dnodes. */
 		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 
 		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 			zio_flags |= ZIO_FLAG_RAW;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		osp = buf->b_data;
 		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		/*
 		 * See the block comment above for the goal of this variable.
 		 * If the maxblkid of the meta-dnode is 0, then we know that
 		 * we've never had more than DNODES_PER_BLOCK objects in the
 		 * dataset, which means we can't have reused any object ids.
 		 */
 		if (osp->os_meta_dnode.dn_maxblkid == 0)
 			td->td_realloc_possible = B_FALSE;
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				prefetch_dnode_metadata(td,
 				    &osp->os_projectused_dnode,
 				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
 			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
 			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 
 		err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				err = traverse_dnode(td, bp,
 				    &osp->os_projectused_dnode, zb->zb_objset,
 				    DMU_PROJECTUSED_OBJECT);
 			if (err == 0)
 				err = traverse_dnode(td, bp,
 				    &osp->os_groupused_dnode, zb->zb_objset,
 				    DMU_GROUPUSED_OBJECT);
 			if (err == 0)
 				err = traverse_dnode(td, bp,
 				    &osp->os_userused_dnode, zb->zb_objset,
 				    DMU_USERUSED_OBJECT);
 		}
 	}
 
 	/* buf is still NULL if none of the branches above read the block. */
 	if (buf)
 		arc_buf_destroy(buf, &buf);
 
 post:
 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 
 	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
 		/*
 		 * Ignore this disk error as requested by the HARD flag,
 		 * and continue traversal.
 		 */
 		err = 0;
 	}
 
 	/*
 	 * If we are stopping here, set td_resume.
 	 */
 	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
 		td->td_resume->zb_objset = zb->zb_objset;
 		td->td_resume->zb_object = zb->zb_object;
 		td->td_resume->zb_level = 0;
 		/*
 		 * If we have stopped on an indirect block (e.g. due to
 		 * i/o error), we have not visited anything below it.
 		 * Set the bookmark to the first level-0 block that we need
 		 * to visit.  This way, the resuming code does not need to
 		 * deal with resuming from indirect blocks.
 		 *
 		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
 		 * to dereference it.
 		 */
 		td->td_resume->zb_blkid = zb->zb_blkid;
 		if (zb->zb_level > 0) {
 			td->td_resume->zb_blkid <<= zb->zb_level *
 			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
 		}
 		/* Only the first (deepest) failure records the bookmark. */
 		td->td_paused = B_TRUE;
 	}
 
 	return (err);
 }
 
 /*
  * Start metadata prefetches for every top-level block pointer of dnp and,
  * if the dnode has one, for its spill block.
  */
 static void
 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	zbookmark_phys_t pzb;
 	int idx = 0;
 
 	while (idx < dnp->dn_nblkptr) {
 		SET_BOOKMARK(&pzb, objset, object, dnp->dn_nlevels - 1, idx);
 		traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[idx],
 		    &pzb);
 		idx++;
 	}
 
 	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
 		return;
 
 	SET_BOOKMARK(&pzb, objset, object, 0, DMU_SPILL_BLKID);
 	traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &pzb);
 }
 
 /*
  * Visit one dnode: optionally report the dnode itself to td_func (before
  * its blocks for TRAVERSE_PRE, after them for TRAVERSE_POST), and visit
  * each of its block pointers followed by its spill block, if any.
  *
  * Objects below the resume bookmark's object number are skipped, except
  * for the meta-dnode.  A TRAVERSE_VISIT_NO_CHILDREN result from td_func
  * is reported as 0.  Returns 0 or the first error encountered.
  */
 static int
 traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	zbookmark_phys_t dzb;
 	int rc = 0;
 
 	if (td->td_resume != NULL && object != DMU_META_DNODE_OBJECT &&
 	    object < td->td_resume->zb_object)
 		return (0);
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		SET_BOOKMARK(&dzb, objset, object, ZB_DNODE_LEVEL,
 		    ZB_DNODE_BLKID);
 		rc = td->td_func(td->td_spa, NULL, bp, &dzb, dnp, td->td_arg);
 		if (rc == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (rc != 0)
 			return (rc);
 	}
 
 	/* Stop at the first block pointer whose subtree fails. */
 	for (int i = 0; rc == 0 && i < dnp->dn_nblkptr; i++) {
 		SET_BOOKMARK(&dzb, objset, object, dnp->dn_nlevels - 1, i);
 		rc = traverse_visitbp(td, dnp, &dnp->dn_blkptr[i], &dzb);
 	}
 	if (rc != 0)
 		return (rc);
 
 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
 		SET_BOOKMARK(&dzb, objset, object, 0, DMU_SPILL_BLKID);
 		rc = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &dzb);
 		if (rc != 0)
 			return (rc);
 	}
 
 	if (td->td_flags & TRAVERSE_POST) {
 		SET_BOOKMARK(&dzb, objset, object, ZB_DNODE_LEVEL,
 		    ZB_DNODE_BLKID);
 		rc = td->td_func(td->td_spa, NULL, bp, &dzb, dnp, td->td_arg);
 		if (rc == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 	}
 
 	return (rc);
 }
 
 /*
  * Traversal callback used by the prefetch thread.  For each block worth
  * prefetching it waits until the outstanding prefetched byte count drops
  * below zfs_pd_bytes_max (or the prefetch is cancelled), accounts for the
  * block's logical size, and issues a speculative no-wait ARC read.
  *
  * Returns EINTR once pd_cancel is set, 0 otherwise.
  */
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) zilog, (void) dnp;
 	prefetch_data_t *pd = arg;
 	arc_flags_t arc_fl = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_PRESCIENT_PREFETCH;
 	int zio_fl = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
 
 	ASSERT(pd->pd_bytes_fetched >= 0);
 
 	/* Dnode-level visits carry no data block to prefetch. */
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
 	if (pd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
 	if (!prefetch_needed(pd, bp))
 		return (0);
 
 	mutex_enter(&pd->pd_mtx);
 	while (!pd->pd_cancel && pd->pd_bytes_fetched >= zfs_pd_bytes_max)
 		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
 	pd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 	cv_broadcast(&pd->pd_cv);
 	mutex_exit(&pd->pd_mtx);
 
 	if ((pd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zio_fl |= ZIO_FLAG_RAW;
 
 	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 	    zio_fl, &arc_fl, zb);
 
 	return (0);
 }
 
 static void
 traverse_prefetch_thread(void *arg)
 {
 	traverse_data_t *td_main = arg;
 	traverse_data_t td = *td_main;
 	zbookmark_phys_t czb;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	td.td_func = traverse_prefetcher;
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
 	td.td_resume = &td_main->td_pfd->pd_resume;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 
 	mutex_enter(&td_main->td_pfd->pd_mtx);
 	td_main->td_pfd->pd_exited = B_TRUE;
 	cv_broadcast(&td_main->td_pfd->pd_cv);
 	mutex_exit(&td_main->td_pfd->pd_mtx);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Common implementation of the traverse_*() entry points.
  *
  * Walks the block tree rooted at rootbp, calling func(…, arg) for visited
  * blocks born after txg_start.  ds may be NULL; when it is a non-snapshot
  * dataset with a non-hole root block, its ZIL is traversed first.  resume,
  * if non-NULL, is both the bookmark to resume from and where the stopping
  * point is recorded.  With TRAVERSE_PREFETCH_DATA a prefetch thread is
  * dispatched and is always waited for before returning.
  *
  * Returns 0 or the error that stopped the traversal.
  *
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 static int
 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	traverse_data_t *td;
 	prefetch_data_t *pd;
 	zbookmark_phys_t *czb;
 	int err;
 
 	ASSERT(ds == NULL || objset == ds->ds_object);
 	/* Pre- and post-order traversal are mutually exclusive. */
 	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 
 	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
 	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
 	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 	td->td_spa = spa;
 	td->td_objset = objset;
 	td->td_rootbp = rootbp;
 	td->td_min_txg = txg_start;
 	td->td_resume = resume;
 	td->td_func = func;
 	td->td_arg = arg;
 	td->td_pfd = pd;
 	td->td_flags = flags;
 	td->td_paused = B_FALSE;
 	td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 		VERIFY(spa_feature_enabled_txg(spa,
 		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
 	} else {
 		td->td_hole_birth_enabled_txg = UINT64_MAX;
 	}
 
 	pd->pd_flags = flags;
 	if (resume != NULL)
 		pd->pd_resume = *resume;
 	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
 
 	SET_BOOKMARK(czb, td->td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
 	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
 		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 		/*
 		 * Named aflags so that it does not shadow the 'flags'
 		 * parameter, which is used again below.
 		 */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 		ASSERT(!BP_IS_REDACTED(rootbp));
 
 		if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
 		    BP_IS_PROTECTED(rootbp))
 			zio_flags |= ZIO_FLAG_RAW;
 
 		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
 		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &aflags, czb);
 		if (err != 0) {
 			/*
 			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
 			 * continue to visitbp so that td_func can be called
 			 * in pre stage, and err will reset to zero.
 			 */
 			if (!(td->td_flags & TRAVERSE_HARD) ||
 			    !(td->td_flags & TRAVERSE_PRE))
 				goto out;
 		} else {
 			osp = buf->b_data;
 			traverse_zil(td, &osp->os_zil_header);
 			arc_buf_destroy(buf, &buf);
 		}
 	}
 
 	/*
 	 * If no prefetch thread is wanted, or it could not be dispatched,
 	 * mark it as already exited so that nobody waits for it.
 	 */
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 	    taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
 	    td, TQ_NOQUEUE) == TASKQID_INVALID)
 		pd->pd_exited = B_TRUE;
 
 	err = traverse_visitbp(td, NULL, rootbp, czb);
 
 	/* Tell the prefetch thread to stop and wait until it has. */
 	mutex_enter(&pd->pd_mtx);
 	pd->pd_cancel = B_TRUE;
 	cv_broadcast(&pd->pd_cv);
 	while (!pd->pd_exited)
 		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
 	mutex_exit(&pd->pd_mtx);
 out:
 	mutex_destroy(&pd->pd_mtx);
 	cv_destroy(&pd->pd_cv);
 
 	kmem_free(czb, sizeof (zbookmark_phys_t));
 	kmem_free(pd, sizeof (struct prefetch_data));
 	kmem_free(td, sizeof (struct traverse_data));
 
 	return (err);
 }
 
 /*
  * Traverse the blocks of dataset ds born after txg_start, optionally
  * resuming from (and recording the stopping point in) *resume.
  *
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 int
 traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
     zbookmark_phys_t *resume,
     int flags, blkptr_cb_t func, void *arg)
 {
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	blkptr_t *rootbp = &dsl_dataset_phys(ds)->ds_bp;
 
 	return (traverse_impl(spa, ds, ds->ds_object, rootbp,
 	    txg_start, resume, flags, func, arg));
 }
 
 /*
  * Traverse dataset ds from the beginning, i.e. traverse_dataset_resume()
  * without a resume bookmark.
  */
 int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
     int flags, blkptr_cb_t func, void *arg)
 {
 	int err;
 
 	err = traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg);
 	return (err);
 }
 
 /*
  * Traverse the block tree rooted at blkptr with no dataset attached,
  * using ZB_DESTROYED_OBJSET as the objset id in bookmarks.
  */
 int
 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	int err;
 
 	err = traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
 	    blkptr, txg_start, resume, flags, func, arg);
 	return (err);
 }
 
 /*
  * Traverse the whole pool: first the MOS, then every object in the MOS
  * whose bonus type is DMU_OT_DSL_DATASET.  With TRAVERSE_HARD, objects
  * whose info or dataset hold cannot be obtained are skipped instead of
  * ending the traversal.  Returns 0 or the error that stopped the walk.
  *
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
 	dsl_pool_t *pool = spa_get_dsl(spa);
 	objset_t *meta_os = pool->dp_meta_objset;
 	boolean_t hard = (flags & TRAVERSE_HARD);
 	int rc;
 
 	/* visit the MOS */
 	rc = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 	    txg_start, NULL, flags, func, arg);
 	if (rc != 0)
 		return (rc);
 
 	/* visit each dataset */
 	for (uint64_t obj = 1; rc == 0;
 	    rc = dmu_object_next(meta_os, &obj, B_FALSE, txg_start)) {
 		dmu_object_info_t doi;
 		dsl_dataset_t *ds;
 		uint64_t min_txg;
 
 		rc = dmu_object_info(meta_os, obj, &doi);
 		if (rc != 0 && hard)
 			continue;
 		if (rc != 0)
 			break;
 
 		if (doi.doi_bonus_type != DMU_OT_DSL_DATASET)
 			continue;
 
 		dsl_pool_config_enter(pool, FTAG);
 		rc = dsl_dataset_hold_obj(pool, obj, FTAG, &ds);
 		dsl_pool_config_exit(pool, FTAG);
 		if (rc != 0 && hard)
 			continue;
 		if (rc != 0)
 			break;
 
 		/* Start no earlier than the dataset's previous snapshot. */
 		min_txg = txg_start;
 		if (dsl_dataset_phys(ds)->ds_prev_snap_txg > min_txg)
 			min_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 		rc = traverse_dataset(ds, min_txg, flags, func, arg);
 		dsl_dataset_rele(ds, FTAG);
 		if (rc != 0)
 			break;
 	}
 
 	/* ESRCH here is treated as the normal end of the object walk. */
 	if (rc == ESRCH)
 		rc = 0;
 	return (rc);
 }
 
 /* Export the dataset and pool traversal entry points. */
 EXPORT_SYMBOL(traverse_dataset);
 EXPORT_SYMBOL(traverse_pool);
 
 /* Module parameters backed by the zfs_* tunables defined in this file. */
 ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
 	"Max number of bytes to prefetch");
 
 ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, UINT, ZMOD_RW,
 	"Traverse prefetch number of blocks pointed by indirect block");
 
 /*
  * In kernel builds, also expose send_holes_without_birth_time under the
  * name ignore_hole_birth.
  */
 #if defined(_KERNEL)
 module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
 MODULE_PARM_DESC(ignore_hole_birth,
 	"Alias for send_holes_without_birth_time");
 #endif
 
-/* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
 	"Ignore hole_birth txg for zfs send");
diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c
index 8788ba11aea9..71f151b14d9b 100644
--- a/module/zfs/dsl_dir.c
+++ b/module/zfs/dsl_dir.c
@@ -1,2499 +1,2498 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_impl.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/sunddi.h>
 #include <sys/zfeature.h>
 #include <sys/policy.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
 #include <sys/zvol.h>
 #include <sys/zthr.h>
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 
 /*
  * This controls if we verify the ZVOL quota or not.
  * Currently, quotas are not implemented for ZVOLs.
  * The quota size is the size of the ZVOL.
  * The size of the volume already implies the ZVOL size quota.
  * The quota mechanism can introduce a significant performance drop.
  */
 static int zvol_enforce_quotas = B_TRUE;
 
 /*
  * Filesystem and Snapshot Limits
  * ------------------------------
  *
  * These limits are used to restrict the number of filesystems and/or snapshots
  * that can be created at a given level in the tree or below. A typical
  * use-case is with a delegated dataset where the administrator wants to ensure
  * that a user within the zone is not creating too many additional filesystems
  * or snapshots, even though they're not exceeding their space quota.
  *
  * The filesystem and snapshot counts are stored as extensible properties. This
  * capability is controlled by a feature flag and must be enabled to be used.
  * Once enabled, the feature is not active until the first limit is set. At
  * that point, future operations to create/destroy filesystems or snapshots
  * will validate and update the counts.
  *
  * Because the count properties will not exist before the feature is active,
  * the counts are updated when a limit is first set on an uninitialized
  * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
  * all of the nested filesystems/snapshots. Thus, a new leaf node has a
  * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
  * snapshot count properties on a node indicate uninitialized counts on that
  * node.) When first setting a limit on an uninitialized node, the code starts
  * at the filesystem with the new limit and descends into all sub-filesystems
  * to add the count properties.
  *
  * In practice this is lightweight since a limit is typically set when the
  * filesystem is created and thus has no children. Once valid, changing the
  * limit value won't require a re-traversal since the counts are already valid.
  * When recursively fixing the counts, if a node with a limit is encountered
  * during the descent, the counts are known to be valid and there is no need to
  * descend into that filesystem's children. The counts on filesystems above the
  * one with the new limit will still be uninitialized, unless a limit is
  * eventually set on one of those filesystems. The counts are always recursively
  * updated when a limit is set on a dataset, unless there is already a limit.
  * When a new limit value is set on a filesystem with an existing limit, it is
  * possible for the new limit to be less than the current count at that level
  * since a user who can change the limit is also allowed to exceed the limit.
  *
  * Once the feature is active, then whenever a filesystem or snapshot is
  * created, the code recurses up the tree, validating the new count against the
  * limit at each initialized level. In practice, most levels will not have a
  * limit set. If there is a limit at any initialized level up the tree, the
  * check must pass or the creation will fail. Likewise, when a filesystem or
  * snapshot is destroyed, the counts are recursively adjusted all the way up
  * the initialized nodes in the tree. Renaming a filesystem to a different point
  * in the tree will first validate, then update the counts on each branch up to
  * the common ancestor. A receive will also validate the counts and then update
  * them.
  *
  * An exception to the above behavior is that the limit is not enforced if the
  * user has permission to modify the limit. This is primarily so that
  * recursive snapshots in the global zone always work. We want to prevent a
  * denial-of-service in which a lower level delegated dataset could max out its
  * limit and thus block recursive snapshots from being taken in the global zone.
  * Because of this, it is possible for the snapshot count to be over the limit
  * and snapshots taken in the global zone could cause a lower level dataset to
  * hit or exceed its limit. The administrator taking the global zone recursive
  * snapshot should be aware of this side-effect and behave accordingly.
  * For consistency, the filesystem limit is also not enforced if the user can
  * modify the limit.
  *
  * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
  * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
  * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
  * dsl_dir_init_fs_ss_count().
  */
 
 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 
 /*
  * Argument bundle pairing a dsl_dir_t with a txg number.
  * NOTE(review): the consumer of this struct is not visible in this part of
  * the file; presumably it is handed to a callback through a void * -- confirm
  * at the use site.
  * NOTE(review): the two members use different prefixes ("ddulrta_" vs.
  * "ddlrta_"); this looks like a typo, but renaming would touch every user.
  */
 typedef struct ddulrt_arg {
 	dsl_dir_t	*ddulrta_dd;	/* directory this argument refers to */
 	uint64_t	ddlrta_txg;	/* transaction group number */
 } ddulrt_arg_t;
 
 /*
  * Eviction callback installed on a dsl_dir_t's dbuf user by
  * dsl_dir_hold_obj().  Drops the references the in-core directory owns
  * (parent dir, spa), tears down its locks and auxiliary state, and frees it.
  */
 static void
 dsl_dir_evict_async(void *dbu)
 {
 	dsl_dir_t *dd = dbu;
 	dsl_pool_t *dp __maybe_unused = dd->dd_pool;
 
 	dd->dd_dbuf = NULL;
 
 	/* Nothing may still be pending against this directory in any txg. */
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, i));
 		ASSERT(dd->dd_tempreserved[i] == 0);
 		ASSERT(dd->dd_space_towrite[i] == 0);
 	}
 
 	/* The child held its parent using itself as the tag. */
 	if (dd->dd_parent != NULL)
 		dsl_dir_async_rele(dd->dd_parent, dd);
 
 	/* Drop the spa reference that was tagged with dd. */
 	spa_async_close(dd->dd_pool->dp_spa, dd);
 
 	if (dsl_deadlist_is_open(&dd->dd_livelist))
 		dsl_dir_livelist_close(dd);
 
 	dsl_prop_fini(dd);
 	cv_destroy(&dd->dd_activity_cv);
 	mutex_destroy(&dd->dd_activity_lock);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 }
 
 /*
  * Free a dsl_dir_t that dsl_dir_hold_obj() constructed but did not (or could
  * not) attach to its bonus dbuf: drop the hold on the parent directory if one
  * was taken, close the livelist if it was opened, and destroy the property
  * state, condition variable and locks set up during construction.  The bonus
  * dbuf hold is not touched; the caller decides what to do with it.
  */
 static void
 dsl_dir_hold_obj_discard(dsl_dir_t *dd)
 {
 	if (dd->dd_parent)
 		dsl_dir_rele(dd->dd_parent, dd);
 	if (dsl_deadlist_is_open(&dd->dd_livelist))
 		dsl_dir_livelist_close(dd);
 	dsl_prop_fini(dd);
 	cv_destroy(&dd->dd_activity_cv);
 	mutex_destroy(&dd->dd_activity_lock);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 }
 
 /*
  * Take a hold, identified by tag, on the dsl_dir_t backed by MOS object
  * ddobj.  If the bonus dbuf has no dsl_dir_t attached yet, one is built and
  * attached; if another thread attaches its own copy first, ours is discarded
  * and theirs is used.
  *
  * For a directory with a parent, tail (when non-NULL) is copied into
  * dd_myname; otherwise the name is found by searching the parent's
  * child-directory ZAP for ddobj.  A directory without a parent is named after
  * the pool.
  *
  * Returns 0 and sets *ddp on success.  On failure returns the error from the
  * failing hold/lookup, releases the bonus dbuf hold, and leaves *ddp
  * untouched.  The caller must hold the pool config lock (asserted).
  */
 int
 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, const void *tag, dsl_dir_t **ddp)
 {
 	dmu_buf_t *dbuf;
 	dsl_dir_t *dd;
 	dmu_object_info_t doi;
 	int err;
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
 	if (err != 0)
 		return (err);
 	dd = dmu_buf_get_user(dbuf);
 
 	dmu_object_info_from_db(dbuf, &doi);
 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
 	ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
 
 	if (dd == NULL) {
 		dsl_dir_t *winner;
 
 		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
 		dd->dd_object = ddobj;
 		dd->dd_dbuf = dbuf;
 		dd->dd_pool = dp;
 
 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
 		dsl_prop_init(dd);
 
 		if (dsl_dir_is_zapified(dd)) {
 			err = zap_lookup(dp->dp_meta_objset,
 			    ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
 			    sizeof (uint64_t), 1, &dd->dd_crypto_obj);
 			if (err == 0) {
 				/* check for on-disk format errata */
 				if (dsl_dir_incompatible_encryption_version(
 				    dd)) {
 					dp->dp_spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
 				}
 			} else if (err != ENOENT) {
 				goto errout;
 			}
 		}
 
 		if (dsl_dir_phys(dd)->dd_parent_obj) {
 			/* The parent is held with the child itself as tag. */
 			err = dsl_dir_hold_obj(dp,
 			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
 			    &dd->dd_parent);
 			if (err != 0)
 				goto errout;
 			if (tail) {
 #ifdef ZFS_DEBUG
 				uint64_t foundobj;
 
 				/*
 				 * Cross-check the supplied name.  Note that
 				 * err is overwritten here, so in ZFS_DEBUG
 				 * builds a failed lookup reaches the err
 				 * check after this if/else.
 				 */
 				err = zap_lookup(dp->dp_meta_objset,
 				    dsl_dir_phys(dd->dd_parent)->
 				    dd_child_dir_zapobj, tail,
 				    sizeof (foundobj), 1, &foundobj);
 				ASSERT(err || foundobj == ddobj);
 #endif
 				(void) strlcpy(dd->dd_myname, tail,
 				    sizeof (dd->dd_myname));
 			} else {
 				err = zap_value_search(dp->dp_meta_objset,
 				    dsl_dir_phys(dd->dd_parent)->
 				    dd_child_dir_zapobj,
 				    ddobj, 0, dd->dd_myname,
 				    sizeof (dd->dd_myname));
 			}
 			if (err != 0)
 				goto errout;
 		} else {
 			(void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
 			    sizeof (dd->dd_myname));
 		}
 
 		if (dsl_dir_is_clone(dd)) {
 			dmu_buf_t *origin_bonus;
 			dsl_dataset_phys_t *origin_phys;
 
 			/*
 			 * We can't open the origin dataset, because
 			 * that would require opening this dsl_dir.
 			 * Just look at its phys directly instead.
 			 */
 			err = dmu_bonus_hold(dp->dp_meta_objset,
 			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
 			    &origin_bonus);
 			if (err != 0)
 				goto errout;
 			origin_phys = origin_bonus->db_data;
 			dd->dd_origin_txg =
 			    origin_phys->ds_creation_txg;
 			dmu_buf_rele(origin_bonus, FTAG);
 			if (dsl_dir_is_zapified(dd)) {
 				uint64_t obj;
 				err = zap_lookup(dp->dp_meta_objset,
 				    dd->dd_object, DD_FIELD_LIVELIST,
 				    sizeof (uint64_t), 1, &obj);
 				if (err == 0) {
 					err = dsl_dir_livelist_open(dd, obj);
 					if (err != 0)
 						goto errout;
 				} else if (err != ENOENT)
 					goto errout;
 			}
 		}
 
 		if (dsl_dir_is_zapified(dd)) {
 			/*
 			 * Best effort: if the entry is missing or unreadable
 			 * the timestamp simply stays zeroed.
 			 */
 			inode_timespec_t t = {0};
 			(void) zap_lookup(dp->dp_meta_objset, ddobj,
 			    DD_FIELD_SNAPSHOTS_CHANGED,
 			    sizeof (uint64_t),
 			    sizeof (inode_timespec_t) / sizeof (uint64_t),
 			    &t);
 			dd->dd_snap_cmtime = t;
 		}
 
 		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
 		    &dd->dd_dbuf);
 		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
 		if (winner != NULL) {
 			/* Lost the race: throw ours away and use theirs. */
 			dsl_dir_hold_obj_discard(dd);
 			dd = winner;
 		} else {
 			spa_open_ref(dp->dp_spa, dd);
 		}
 	}
 
 	/*
 	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
 	 * holds on the spa.  We need the open-to-close holds because
 	 * otherwise the spa_refcnt wouldn't change when we open a
 	 * dir which the spa also has open, so we could incorrectly
 	 * think it was OK to unload/export/destroy the pool.  We need
 	 * the instantiate-to-evict hold because the dsl_dir_t has a
 	 * pointer to the dd_pool, which has a pointer to the spa_t.
 	 */
 	spa_open_ref(dp->dp_spa, tag);
 	ASSERT3P(dd->dd_pool, ==, dp);
 	ASSERT3U(dd->dd_object, ==, ddobj);
 	ASSERT3P(dd->dd_dbuf, ==, dbuf);
 	*ddp = dd;
 	return (0);
 
 errout:
 	/* Only reached while building a new dd, before it was attached. */
 	dsl_dir_hold_obj_discard(dd);
 	dmu_buf_rele(dbuf, tag);
 	return (err);
 }
 
 /*
  * Release a hold taken with the given tag: closes the tag's reference on the
  * spa and then drops the tag's hold on the directory's bonus dbuf.
  */
 void
 dsl_dir_rele(dsl_dir_t *dd, const void *tag)
 {
 	spa_t *spa = dd->dd_pool->dp_spa;
 
 	dprintf_dd(dd, "%s\n", "");
 	spa_close(spa, tag);
 	dmu_buf_rele(dd->dd_dbuf, tag);
 }
 
 /*
  * Asynchronous counterpart of dsl_dir_rele(), used when the reference is
  * dropped from a taskq that is evicting dsl datasets and dirs.  The only
  * difference from a normal release is that the spa reference is closed with
  * the async API.
  */
 void
 dsl_dir_async_rele(dsl_dir_t *dd, const void *tag)
 {
 	spa_t *spa = dd->dd_pool->dp_spa;
 
 	dprintf_dd(dd, "%s\n", "");
 	spa_async_close(spa, tag);
 	dmu_buf_rele(dd->dd_dbuf, tag);
 }
 
 /*
  * Build the full name of dd in buf by recursing to the topmost ancestor and
  * appending each dd_myname, separated by "/".  buf must be at least
  * ZFS_MAX_DATASET_NAME_LEN bytes; exceeding that length trips a VERIFY.
  */
 void
 dsl_dir_name(dsl_dir_t *dd, char *buf)
 {
 	boolean_t take_lock;
 
 	if (dd->dd_parent == NULL) {
 		buf[0] = '\0';
 	} else {
 		dsl_dir_name(dd->dd_parent, buf);
 		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
 		    ZFS_MAX_DATASET_NAME_LEN);
 	}
 
 	/*
 	 * Treat dd_lock as recursive so that dprintf_dd() can be used
 	 * while it is already held: only take it if we don't own it.
 	 */
 	take_lock = !MUTEX_HELD(&dd->dd_lock);
 	if (take_lock)
 		mutex_enter(&dd->dd_lock);
 	VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
 	    <, ZFS_MAX_DATASET_NAME_LEN);
 	if (take_lock)
 		mutex_exit(&dd->dd_lock);
 }
 
 /*
  * Return the length of the name dsl_dir_name() would produce for dd (not
  * counting the terminating NUL) without doing any string concatenation.
  */
 int
 dsl_dir_namelen(dsl_dir_t *dd)
 {
 	boolean_t take_lock;
 	int len = 0;
 
 	/* Ancestors' names, plus one for the "/" that follows them. */
 	if (dd->dd_parent != NULL)
 		len = dsl_dir_namelen(dd->dd_parent) + 1;
 
 	/* Same conditional locking as in dsl_dir_name(). */
 	take_lock = !MUTEX_HELD(&dd->dd_lock);
 	if (take_lock)
 		mutex_enter(&dd->dd_lock);
 	len += strlen(dd->dd_myname);
 	if (take_lock)
 		mutex_exit(&dd->dd_lock);
 
 	return (len);
 }
 
 /*
  * Split the leading component off a dataset path.
  *
  * On success returns 0, copies the component into 'component' (which must
  * hold ZFS_MAX_DATASET_NAME_LEN bytes) and sets *nextp to:
  *   - NULL if path was the last component,
  *   - the character after the '/' if a '/' terminated the component,
  *   - the '@' itself if an '@' terminated the component.
  *
  * Errors (in which case *nextp is not written):
  *   ENOENT        path is NULL or empty
  *   EINVAL        malformed separators (see the checks below)
  *   ENAMETOOLONG  the component does not fit in ZFS_MAX_DATASET_NAME_LEN
  */
 static int
 getcomponent(const char *path, char *component, const char **nextp)
 {
 	char *sep;
 	size_t complen;
 
 	if (path == NULL || path[0] == '\0')
 		return (SET_ERROR(ENOENT));
 
 	/* This would be a good place to reserve some namespace... */
 	sep = strpbrk(path, "/@");
 
 	/* Two separators in a row are never valid. */
 	if (sep != NULL && (sep[1] == '/' || sep[1] == '@'))
 		return (SET_ERROR(EINVAL));
 
 	if (sep == NULL || sep == path) {
 		/*
 		 * Either there is no separator at all, or the path starts
 		 * with one.  In the latter case it must be an '@', must be
 		 * followed by something, and no further '/' or '@' may
 		 * appear after it.
 		 */
 		if (sep != NULL) {
 			if (sep[0] != '@' || sep[1] == '\0' ||
 			    strpbrk(path + 1, "/@") != NULL)
 				return (SET_ERROR(EINVAL));
 		}
 		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
 		*nextp = NULL;
 		return (0);
 	}
 
 	/* sep points strictly inside path, so the difference is positive. */
 	complen = sep - path;
 
 	switch (sep[0]) {
 	case '/':
 		if (complen >= ZFS_MAX_DATASET_NAME_LEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strlcpy(component, path, complen + 1);
 		/* The next component starts after the slash. */
 		sep++;
 		break;
 	case '@':
 		/*
 		 * If the next separator is an @, there had better not be
 		 * any more slashes.
 		 */
 		if (strchr(path, '/') != NULL)
 			return (SET_ERROR(EINVAL));
 		if (complen >= ZFS_MAX_DATASET_NAME_LEN)
 			return (SET_ERROR(ENAMETOOLONG));
 		(void) strlcpy(component, path, complen + 1);
 		break;
 	default:
 		panic("invalid p=%p", (void *)sep);
 	}
 
 	*nextp = sep;
 	return (0);
 }
 
 /*
  * Look up the dsl_dir_t for the dataset path 'name' and hold it with 'tag'.
  *
  * The first component of name must equal the pool's name (otherwise EXDEV).
  * Each following component is looked up in the current directory's child
  * ZAP; the walk stops at the first component that is not found there or
  * that begins with '@'.
  *
  * On success returns 0, stores the held directory in *ddp and, if tailp is
  * non-NULL, sets *tailp to the part of name that was not consumed (NULL if
  * the whole name was resolved).  (*tailp)[0] == '@' means that the last
  * component is a snapshot.
  *
  * On failure returns a non-zero errno and holds nothing.  ENOENT is returned
  * when more than one component is left over, or when any component is left
  * over and tailp == NULL.  Note that in this ENOENT case *tailp (if
  * non-NULL) is still written.
  *
  * This thread must hold the dp_config_rwlock for the pool.
  */
 int
 dsl_dir_hold(dsl_pool_t *dp, const char *name, const void *tag,
     dsl_dir_t **ddp, const char **tailp)
 {
 	char *buf;
 	const char *spaname, *next, *nextnext = NULL;
 	int err;
 	dsl_dir_t *dd;
 	uint64_t ddobj;
 
 	/* Scratch space for one path component at a time. */
 	buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	err = getcomponent(name, buf, &next);
 	if (err != 0)
 		goto error;
 
 	/* Make sure the name is in the specified pool. */
 	spaname = spa_name(dp->dp_spa);
 	if (strcmp(buf, spaname) != 0) {
 		err = SET_ERROR(EXDEV);
 		goto error;
 	}
 
 	ASSERT(dsl_pool_config_held(dp));
 
 	/* Start the walk at the pool's root directory. */
 	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 	if (err != 0) {
 		goto error;
 	}
 
 	/* Descend one component per iteration, always holding exactly one dd */
 	while (next != NULL) {
 		dsl_dir_t *child_dd;
 		err = getcomponent(next, buf, &nextnext);
 		if (err != 0)
 			break;
 		ASSERT(next[0] != '\0');
 		/* A snapshot component ends the directory walk. */
 		if (next[0] == '@')
 			break;
 		dprintf("looking up %s in obj%lld\n",
 		    buf, (longlong_t)dsl_dir_phys(dd)->dd_child_dir_zapobj);
 
 		err = zap_lookup(dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
 		    buf, sizeof (ddobj), 1, &ddobj);
 		if (err != 0) {
 			/* A missing child is not an error; it becomes the tail */
 			if (err == ENOENT)
 				err = 0;
 			break;
 		}
 
 		/* Hold the child before dropping the parent. */
 		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
 		if (err != 0)
 			break;
 		dsl_dir_rele(dd, tag);
 		dd = child_dd;
 		next = nextnext;
 	}
 
 	if (err != 0) {
 		dsl_dir_rele(dd, tag);
 		goto error;
 	}
 
 	/*
 	 * It's an error if there's more than one component left, or
 	 * tailp==NULL and there's any component left.
 	 */
 	if (next != NULL &&
 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 		/* bad path name */
 		dsl_dir_rele(dd, tag);
 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 		err = SET_ERROR(ENOENT);
 	}
 	if (tailp != NULL)
 		*tailp = next;
 	/* dd was already released above if err was set, so don't expose it */
 	if (err == 0)
 		*ddp = dd;
 error:
 	kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
 	return (err);
 }
 
 /*
  * If the counts are already initialized for this filesystem and its
  * descendants then do nothing, otherwise initialize the counts.
  *
  * The counts on this filesystem, and those below, may be uninitialized due to
  * either the use of a pre-existing pool which did not support the
  * filesystem/snapshot limit feature, or one in which the feature had not yet
  * been enabled.
  *
  * Recursively descend the filesystem tree and update the filesystem/snapshot
  * counts on each filesystem below, then update the cumulative count on the
  * current filesystem. If the filesystem already has a count set on it,
  * then we know that its counts, and the counts on the filesystems below it,
  * are already correct, so we don't have to update this filesystem.
  *
  * Must be called in syncing context with the pool config held and the
  * FS_SS_LIMIT feature active (all asserted).  Lookup and update failures
  * are fatal (VERIFY0).
  */
 static void
 dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	uint64_t my_fs_cnt = 0;
 	uint64_t my_ss_cnt = 0;
 	dsl_pool_t *dp = dd->dd_pool;
 	objset_t *os = dp->dp_meta_objset;
 	zap_cursor_t *zc;
 	zap_attribute_t *za;
 	dsl_dataset_t *ds;
 
 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
 	ASSERT(dsl_pool_config_held(dp));
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/* The counts are stored as ZAP entries on the dir object itself. */
 	dsl_dir_zapify(dd, tx);
 
 	/*
 	 * If the filesystem count has already been initialized then we
 	 * don't need to recurse down any further.
 	 */
 	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
 		return;
 
 	/* The cursor and attribute are heap-allocated, not on the stack. */
 	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
 	za = zap_attribute_alloc();
 
 	/* Iterate my child dirs */
 	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
 		dsl_dir_t *chld_dd;
 		uint64_t count;
 
 		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
 		    &chld_dd));
 
 		/*
 		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
 		 */
 		if (chld_dd->dd_myname[0] == '$') {
 			dsl_dir_rele(chld_dd, FTAG);
 			continue;
 		}
 
 		my_fs_cnt++;	/* count this child */
 
 		/* Make sure the child's own counts exist before reading them */
 		dsl_dir_init_fs_ss_count(chld_dd, tx);
 
 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
 		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
 		my_fs_cnt += count;
 		VERIFY0(zap_lookup(os, chld_dd->dd_object,
 		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
 		my_ss_cnt += count;
 
 		dsl_dir_rele(chld_dd, FTAG);
 	}
 	zap_cursor_fini(zc);
 	/* Count my snapshots (we counted children's snapshots above) */
 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
 	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
 
 	/* zc and za are reused for the walk over the snapshot-name ZAP. */
 	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 		/* Don't count temporary snapshots */
 		if (za->za_name[0] != '%')
 			my_ss_cnt++;
 	}
 	zap_cursor_fini(zc);
 
 	dsl_dataset_rele(ds, FTAG);
 
 	kmem_free(zc, sizeof (zap_cursor_t));
 	zap_attribute_free(za);
 
 	/* we're in a sync task, update counts */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
 	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
 	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
 	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
 }
 
 /*
  * Check half of the sync task issued by dsl_dir_activate_fs_ss_limit().
  * arg is the dataset name.  Returns:
  *   - the dsl_dataset_hold() error if the dataset cannot be held,
  *   - ENOTSUP if the FS_SS_LIMIT feature is not enabled on the pool,
  *   - EALREADY if the feature is active and this directory already has a
  *     filesystem count (nothing left to initialize),
  *   - 0 otherwise.
  */
 static int
 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
 {
 	const char *name = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	spa_t *spa = dp->dp_spa;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold(dp, name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		error = SET_ERROR(ENOTSUP);
 	} else if (spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT) &&
 	    dsl_dir_is_zapified(ds->ds_dir) &&
 	    zap_contains(dp->dp_meta_objset, ds->ds_dir->dd_object,
 	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
 		error = SET_ERROR(EALREADY);
 	}
 
 	/* Single release point for every outcome after a successful hold. */
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Sync half of the sync task issued by dsl_dir_activate_fs_ss_limit().
  * arg is the dataset name.  Activates the FS_SS_LIMIT feature if needed and
  * initializes the filesystem/snapshot counts from this dataset downward.
  */
 static void
 dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
 {
 	const char *name = arg;
 	dsl_dataset_t *ds;
 	spa_t *spa;
 
 	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), name, FTAG, &ds));
 	spa = dsl_dataset_get_spa(ds);
 
 	/*
 	 * A limit is being set while the feature is still inactive, so bump
 	 * the feature-active counter to activate it for the first time.
 	 * We are already in a sync task, so updating the MOS is fine.
 	 */
 	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT))
 		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
 
 	/*
 	 * With a non-UINT64_MAX limit in place the counts must be accurate:
 	 * walk down the tree from here and fill them in.
 	 */
 	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Make sure the feature is enabled and activate it if necessary.
  * Since we're setting a limit, ensure the on-disk counts are valid.
  * This is only called by the ioctl path when setting a limit value.
  *
  * We do not need to validate the new limit, since users who can change the
  * limit are also allowed to exceed the limit.
  */
 int
 dsl_dir_activate_fs_ss_limit(const char *ddname)
 {
 	int error;
 
 	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
 	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
 	    ZFS_SPACE_CHECK_RESERVED);
 
 	if (error == EALREADY)
 		error = 0;
 
 	return (error);
 }
 
 /*
  * Used to determine if the filesystem_limit or snapshot_limit should be
  * enforced. We allow the limit to be exceeded if the user has permission to
  * write the property value. We pass in the creds that we got in the open
  * context since we will always be the GZ root in syncing context. We also have
  * to handle the case where we are allowed to change the limit on the current
  * dataset, but there may be another limit in the tree above.
  *
  * We can never modify these two properties within a non-global zone. In
  * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
  * can't use that function since we are already holding the dp_config_rwlock.
  * In addition, we already have the dd and dealing with snapshots is simplified
  * in this code.
  */
 
 /*
  * Result of dsl_enforce_ds_ss_limits(), consumed by dsl_fs_ss_limit_check().
  */
 typedef enum {
 	/* compare count + delta against the limit at this directory */
 	ENFORCE_ALWAYS,
 	/* skip the check entirely; no ancestor is examined either */
 	ENFORCE_NEVER,
 	/* don't enforce at this directory, but keep checking its parents */
 	ENFORCE_ABOVE
 } enforce_res_t;
 
 /*
  * Decide how the filesystem/snapshot limit named by prop applies to dd for
  * the caller identified by cr/proc:
  *   ENFORCE_NEVER   (kernel only) secpolicy_zfs_proc() grants the caller
  *                   access,
  *   ENFORCE_ABOVE   the head dataset is not zoned and the caller has been
  *                   delegated permission on prop,
  *   ENFORCE_ALWAYS  everything else, including any failure to look things
  *                   up and (kernel only) callers outside the global zone.
  */
 static enforce_res_t
 dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
     cred_t *cr, proc_t *proc)
 {
 	enforce_res_t result = ENFORCE_ALWAYS;
 	uint64_t head_obj;
 	uint64_t zoned;
 	dsl_dataset_t *head;
 
 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
 
 #ifdef _KERNEL
 	if (crgetzoneid(cr) != GLOBAL_ZONEID)
 		return (ENFORCE_ALWAYS);
 
 	/*
 	 * The credentials being checked were saved from the user process,
 	 * which is not the current process.  secpolicy_zfs() can't be used
 	 * because (on Linux) it only works for the current process's cred.
 	 */
 	if (secpolicy_zfs_proc(cr, proc) == 0)
 		return (ENFORCE_NEVER);
 #else
 	(void) proc;
 #endif
 
 	head_obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	if (head_obj == 0)
 		return (ENFORCE_ALWAYS);
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 
 	if (dsl_dataset_hold_obj(dd->dd_pool, head_obj, FTAG, &head) != 0)
 		return (ENFORCE_ALWAYS);
 
 	/*
 	 * Only root can access zoned fs's from the GZ, so a zoned dataset
 	 * (or a failed "zoned" lookup) keeps the default of ENFORCE_ALWAYS.
 	 * Otherwise delegated permission on the property relaxes
 	 * enforcement at this level only.
 	 */
 	if (dsl_prop_get_ds(head, zfs_prop_to_name(ZFS_PROP_ZONED), 8, 1,
 	    &zoned, NULL) == 0 && zoned == 0 &&
 	    dsl_deleg_access_impl(head, zfs_prop_to_name(prop), cr) == 0)
 		result = ENFORCE_ABOVE;
 
 	dsl_dataset_rele(head, FTAG);
 	return (result);
 }
 
 /*
  * Check whether adding delta child filesystems (prop ==
  * ZFS_PROP_FILESYSTEM_LIMIT) or delta snapshots (prop ==
  * ZFS_PROP_SNAPSHOT_LIMIT) below dd would exceed a limit.
  *
  * The check walks up the tree via dd_parent and stops, returning 0, at the
  * given ancestor, at the first directory without an initialized count, or
  * past the root.  Returns EDQUOT if an enforced limit would be exceeded, or
  * the error from a failed count/limit lookup.
  */
 int
 dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
     dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	const char *count_field;
 	uint64_t cur_count, cur_limit;
 	enforce_res_t mode;
 	int err;
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
 	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
 
 	/*
 	 * A NULL cred_t marks a temporary snapshot; the snapshot limit is
 	 * not enforced for those.
 	 */
 	if (prop == ZFS_PROP_SNAPSHOT_LIMIT && cr == NULL)
 		return (0);
 
 	count_field = (prop == ZFS_PROP_SNAPSHOT_LIMIT) ?
 	    DD_FIELD_SNAPSHOT_COUNT : DD_FIELD_FILESYSTEM_COUNT;
 
 	/*
 	 * Callers allowed to change the limit are not bound by it (e.g. a
 	 * recursive snapshot taken by root in the global zone).  With
 	 * delegated permissions the caller may be exempt here while a limit
 	 * further up the tree still applies, hence the three-way result.
 	 */
 	mode = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
 	if (mode == ENFORCE_NEVER)
 		return (0);
 
 	/*
 	 * Nothing to check when:
 	 *  - the adjustment is 0 (e.g. renaming a dataset with no snapshots);
 	 *  - we reached the supplied ancestor (during rename this keeps the
 	 *    common ancestor from being over-counted);
 	 *  - this node is uninitialized, which means there is no limit here
 	 *    or above and its counts will not be touched.
 	 */
 	if (delta == 0 || ancestor == dd || !dsl_dir_is_zapified(dd))
 		return (0);
 
 	err = zap_lookup(mos, dd->dd_object, count_field,
 	    sizeof (cur_count), 1, &cur_count);
 	if (err == ENOENT)
 		return (0);
 	if (err != 0)
 		return (err);
 
 	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &cur_limit,
 	    NULL, B_FALSE);
 	if (err != 0)
 		return (err);
 
 	/* Is there a limit which we've hit? */
 	if (mode == ENFORCE_ALWAYS && (cur_count + delta) > cur_limit)
 		return (SET_ERROR(EDQUOT));
 
 	if (dd->dd_parent == NULL)
 		return (0);
 
 	return (dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor,
 	    cr, proc));
 }
 
 /*
  * Add delta (positive on create, negative on destroy) to the count stored
  * under prop (DD_FIELD_FILESYSTEM_COUNT or DD_FIELD_SNAPSHOT_COUNT) on dd
  * and on each of its ancestors, stopping at the first directory whose count
  * is uninitialized.  Must be called in syncing context with the pool config
  * held.
  */
 void
 dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
     dmu_tx_t *tx)
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	uint64_t cur;
 	int err;
 
 	ASSERT(dsl_pool_config_held(dd->dd_pool));
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
 	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
 
 	/*
 	 * Hidden ($FREE, $MOS & $ORIGIN) objsets are not included in the
 	 * filesystem accounting.
 	 */
 	if (dd->dd_myname[0] == '$' &&
 	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
 		return;
 
 	/* e.g. renaming a dataset with no snapshots adjusts by 0 */
 	if (delta == 0)
 		return;
 
 	/*
 	 * An uninitialized node ends the walk: its count is not valid and
 	 * must not be touched.  That happens when the feature has not been
 	 * activated yet or no limit exists on this part of the tree.
 	 */
 	if (!dsl_dir_is_zapified(dd))
 		return;
 	err = zap_lookup(mos, dd->dd_object, prop, sizeof (cur), 1, &cur);
 	if (err == ENOENT)
 		return;
 	VERIFY0(err);
 
 	cur += delta;
 	/* Use a signed verify to make sure we're not neg. */
 	VERIFY3S(cur, >=, 0);
 
 	VERIFY0(zap_update(mos, dd->dd_object, prop, sizeof (cur), 1, &cur,
 	    tx));
 
 	/* Propagate the same adjustment to the ancestors. */
 	if (dd->dd_parent != NULL)
 		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
 }
 
 /*
  * Allocate and initialize a new DSL directory object in the MOS and link it
  * under its parent pds as 'name' (or, when pds is NULL, into the pool
  * directory object as the root dataset).  Returns the new object number.
  * Allocation and linking failures are fatal (VERIFY0).
  */
 uint64_t
 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
     dmu_tx_t *tx)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dsl_dir_phys_t *phys;
 	dmu_buf_t *bonus;
 	uint64_t ddobj;
 
 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 
 	if (pds == NULL) {
 		/* it's the root dir */
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 	} else {
 		VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
 		    name, sizeof (uint64_t), 1, &ddobj, tx));
 	}
 
 	VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &bonus));
 	dmu_buf_will_dirty(bonus, tx);
 	phys = bonus->db_data;
 
 	phys->dd_creation_time = gethrestime_sec();
 	if (pds != NULL) {
 		phys->dd_parent_obj = pds->dd_object;
 
 		/* the parent chain gained one filesystem */
 		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
 	}
 
 	/* Every directory gets its own property and child-map ZAPs. */
 	phys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 	phys->dd_child_dir_zapobj = zap_create(mos,
 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 		phys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 
 	dmu_buf_rele(bonus, FTAG);
 
 	return (ddobj);
 }
 
 /*
  * A directory is a clone if it has an origin object and that origin is not
  * the pool's dp_origin_snap (when the pool has one).
  */
 boolean_t
 dsl_dir_is_clone(dsl_dir_t *dd)
 {
 	uint64_t origin_obj = dsl_dir_phys(dd)->dd_origin_obj;
 	dsl_dataset_t *pool_origin;
 
 	if (origin_obj == 0)
 		return (B_FALSE);
 
 	pool_origin = dd->dd_pool->dp_origin_snap;
 	if (pool_origin == NULL)
 		return (B_TRUE);
 
 	return (origin_obj != pool_origin->ds_object);
 }
 
 /* Return dd_used_bytes from the directory's phys. */
 uint64_t
 dsl_dir_get_used(dsl_dir_t *dd)
 {
 	const dsl_dir_phys_t *phys = dsl_dir_phys(dd);
 
 	return (phys->dd_used_bytes);
 }
 
 /* Return dd_compressed_bytes from the directory's phys. */
 uint64_t
 dsl_dir_get_compressed(dsl_dir_t *dd)
 {
 	const dsl_dir_phys_t *phys = dsl_dir_phys(dd);
 
 	return (phys->dd_compressed_bytes);
 }
 
 /* Return dd_quota from the directory's phys. */
 uint64_t
 dsl_dir_get_quota(dsl_dir_t *dd)
 {
 	const dsl_dir_phys_t *phys = dsl_dir_phys(dd);
 
 	return (phys->dd_quota);
 }
 
 /* Return dd_reserved from the directory's phys. */
 uint64_t
 dsl_dir_get_reservation(dsl_dir_t *dd)
 {
 	const dsl_dir_phys_t *phys = dsl_dir_phys(dd);
 
 	return (phys->dd_reserved);
 }
 
 /*
  * Return the compression ratio as a fixed point number, 100x the ratio
  * of uncompressed to compressed bytes.  A dir with no compressed bytes
  * reports 100.
  */
 uint64_t
 dsl_dir_get_compressratio(dsl_dir_t *dd)
 {
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	/* avoid dividing by zero */
 	if (ddp->dd_compressed_bytes == 0)
 		return (100);
 
 	return (ddp->dd_uncompressed_bytes * 100 / ddp->dd_compressed_bytes);
 }
 
 /* Return the dd_uncompressed_bytes field of dd's phys structure. */
 uint64_t
 dsl_dir_get_logicalused(dsl_dir_t *dd)
 {
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	return (ddp->dd_uncompressed_bytes);
 }
 
 /* Return the DD_USED_SNAP slot of dd's used breakdown. */
 uint64_t
 dsl_dir_get_usedsnap(dsl_dir_t *dd)
 {
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	return (ddp->dd_used_breakdown[DD_USED_SNAP]);
 }
 
 /* Return the DD_USED_HEAD slot of dd's used breakdown. */
 uint64_t
 dsl_dir_get_usedds(dsl_dir_t *dd)
 {
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	return (ddp->dd_used_breakdown[DD_USED_HEAD]);
 }
 
 /* Return the DD_USED_REFRSRV slot of dd's used breakdown. */
 uint64_t
 dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
 {
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	return (ddp->dd_used_breakdown[DD_USED_REFRSRV]);
 }
 
 /*
  * Return the space attributed to children: the sum of the DD_USED_CHILD
  * and DD_USED_CHILD_RSRV slots of dd's used breakdown.
  */
 uint64_t
 dsl_dir_get_usedchild(dsl_dir_t *dd)
 {
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	return (ddp->dd_used_breakdown[DD_USED_CHILD] +
 	    ddp->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 }
 
 /*
  * Write the name of dd's origin dataset (dd_origin_obj) into buf.  The
  * origin is held only while its name is copied; failing to hold it is
  * fatal (VERIFY0).
  *
  * NOTE(review): buf is presumably at least ZFS_MAX_DATASET_NAME_LEN
  * bytes, as in dsl_dir_stats() -- confirm for other callers.
  */
 void
 dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
 {
 	dsl_dataset_t *ds;
 	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
 	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
 
 	dsl_dataset_name(ds, buf);
 
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Look up the filesystem count stored in dd's ZAP and place it in
  * *count.  Returns ENOENT if dd has not been zapified, otherwise the
  * result of the ZAP lookup.
  */
 int
 dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
 {
 	if (!dsl_dir_is_zapified(dd))
 		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_object,
 	    DD_FIELD_FILESYSTEM_COUNT, sizeof (*count), 1, count));
 }
 
 /*
  * Look up the snapshot count stored in dd's ZAP and place it in *count.
  * Returns ENOENT if dd has not been zapified, otherwise the result of
  * the ZAP lookup.
  */
 int
 dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
 {
 	if (!dsl_dir_is_zapified(dd))
 		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_object,
 	    DD_FIELD_SNAPSHOT_COUNT, sizeof (*count), 1, count));
 }
 
 /*
  * Add this dir's statistics to nv: quota, reservation, logical used,
  * the used breakdown (when the dir tracks one), the filesystem and
  * snapshot counts (when they can be looked up) and, for clones, the
  * origin name.
  */
 void
 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 {
 	uint64_t count;
 
 	/* the space accounting values are read under dd_lock */
 	mutex_enter(&dd->dd_lock);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_get_quota(dd));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
 	    dsl_dir_get_reservation(dd));
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
 	    dsl_dir_get_logicalused(dd));
 	if ((dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) != 0) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
 		    dsl_dir_get_usedsnap(dd));
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
 		    dsl_dir_get_usedds(dd));
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
 		    dsl_dir_get_usedrefreserv(dd));
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
 		    dsl_dir_get_usedchild(dd));
 	}
 	mutex_exit(&dd->dd_lock);
 
 	/* the counts are only reported when the lookup succeeds */
 	if (dsl_dir_get_filesystem_count(dd, &count) == 0)
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count);
 	if (dsl_dir_get_snapshot_count(dd, &count) == 0)
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count);
 
 	if (dsl_dir_is_clone(dd)) {
 		char origin[ZFS_MAX_DATASET_NAME_LEN];
 
 		dsl_dir_get_origin(dd, origin);
 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, origin);
 	}
 }
 
 /*
  * Put dd on the pool's dirty-dir list for tx's txg.  When
  * txg_list_add() returns nonzero, an extra reference is taken on the
  * dir's dbuf; dsl_dir_sync() drops it again.
  */
 void
 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 
 	ASSERT(dsl_dir_phys(dd));
 
 	if (!txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg))
 		return;
 
 	/* up the hold count until we can be written out */
 	dmu_buf_add_ref(dd->dd_dbuf, dd);
 }
 
 /*
  * How much does the amount accounted for dd change if its usage goes
  * from `used' to `used + delta'?  The accounted amount is the larger of
  * the usage and the dir's reservation (dd_reserved).
  */
 static int64_t
 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 {
 	uint64_t before, after;
 
 	before = MAX(used, dsl_dir_phys(dd)->dd_reserved);
 	after = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
 
 	return (after - before);
 }
 
 /*
  * Syncing-context cleanup for a dirty dir: zero the space_towrite
  * estimate for tx's txg and drop the dbuf reference held on behalf of
  * dsl_dir_dirty().
  */
 void
 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	mutex_enter(&dd->dd_lock);
 	/* all temporary reservations for this txg must be gone by now */
 	ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", (u_longlong_t)tx->tx_txg,
 	    (u_longlong_t)dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
 	dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
 	mutex_exit(&dd->dd_lock);
 
 	/* release the hold from dsl_dir_dirty */
 	dmu_buf_rele(dd->dd_dbuf, dd);
 }
 
 /*
  * Sum the dd_space_towrite estimates over all TXG_SIZE slots.  The
  * caller must hold dd_lock.
  */
 static uint64_t
 dsl_dir_space_towrite(dsl_dir_t *dd)
 {
 	uint64_t total = 0;
 	int slot = 0;
 
 	ASSERT(MUTEX_HELD(&dd->dd_lock));
 
 	while (slot < TXG_SIZE) {
 		total += dd->dd_space_towrite[slot & TXG_MASK];
 		slot++;
 	}
 
 	return (total);
 }
 
 /*
  * How much space would dd have available if ancestor had delta applied
  * to it?  If ondiskonly is set, we're only interested in what's
  * on-disk, not estimated pending changes.
  *
  * The answer is computed recursively from the root down: each level
  * returns the lesser of what its parent can provide and what is left in
  * its own quota.  delta must be <= 0 (asserted at the ancestor).
  */
 uint64_t
 dsl_dir_space_available(dsl_dir_t *dd,
     dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
 {
 	uint64_t parentspace, myspace, quota, used;
 
 	/*
 	 * If there are no restrictions otherwise, assume we have
 	 * unlimited space available.
 	 */
 	quota = UINT64_MAX;
 	parentspace = UINT64_MAX;
 
 	/* the parent's figure is computed before we take our own dd_lock */
 	if (dd->dd_parent != NULL) {
 		parentspace = dsl_dir_space_available(dd->dd_parent,
 		    ancestor, delta, ondiskonly);
 	}
 
 	mutex_enter(&dd->dd_lock);
 	/* a dd_quota of 0 means no quota is set on this dir */
 	if (dsl_dir_phys(dd)->dd_quota != 0)
 		quota = dsl_dir_phys(dd)->dd_quota;
 	used = dsl_dir_phys(dd)->dd_used_bytes;
 	if (!ondiskonly)
 		used += dsl_dir_space_towrite(dd);
 
 	/* the root dir is additionally bounded by the pool's size */
 	if (dd->dd_parent == NULL) {
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
 		    ZFS_SPACE_CHECK_NORMAL);
 		quota = MIN(quota, poolsize);
 	}
 
 	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
 		/*
 		 * We have some space reserved, in addition to what our
 		 * parent gave us.
 		 */
 		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
 	}
 
 	/* apply the hypothetical (non-positive) delta at the ancestor */
 	if (dd == ancestor) {
 		ASSERT(delta <= 0);
 		ASSERT(used >= -delta);
 		used += delta;
 		if (parentspace != UINT64_MAX)
 			parentspace -= delta;
 	}
 
 	if (used > quota) {
 		/* over quota */
 		myspace = 0;
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
 		 * the space left in our quota
 		 */
 		myspace = MIN(parentspace, quota - used);
 	}
 
 	mutex_exit(&dd->dd_lock);
 
 	return (myspace);
 }
 
 /*
  * One entry in the list of temporary reservations made by
  * dsl_dir_tempreserve_space() and undone by dsl_dir_tempreserve_clear().
  */
 struct tempreserve {
 	list_node_t tr_node;	/* linkage on the reservation list */
 	dsl_dir_t *tr_ds;	/* dir charged, or NULL for the ARC entry */
 	uint64_t tr_size;	/* amount reserved by this entry */
 };
 
 /*
  * Reserve asize bytes in dd for tx's txg and then walk up through the
  * parents, reserving at each level whatever parent_delta() reports the
  * parent must account for.  Each successful level adds asize to
  * dd_tempreserved[] and appends a struct tempreserve to tr_list.
  *
  * Returns 0 on success.  On failure returns the error from
  * dsl_dataset_check_quota(), or EDQUOT / ENOSPC / ERESTART from the
  * quota checks below.  Reservations already recorded on tr_list are NOT
  * undone here; the caller is expected to clear the list.
  *
  * `first' is B_TRUE only for the dir the caller passed in; it enables
  * the per-dataset check against tx->tx_objset.
  */
 static int
 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
     boolean_t ignorequota, list_t *tr_list,
     dmu_tx_t *tx, boolean_t first)
 {
 	uint64_t txg;
 	uint64_t quota;
 	struct tempreserve *tr;
 	int retval;
 	uint64_t ext_quota;
 	uint64_t ref_rsrv;
 
 	/* re-entered via goto once per ancestor, in place of recursion */
 top_of_function:
 	txg = tx->tx_txg;
 	retval = EDQUOT;
 	ref_rsrv = 0;
 
 	ASSERT3U(txg, !=, 0);
 	ASSERT3S(asize, >, 0);
 
 	mutex_enter(&dd->dd_lock);
 
 	/*
 	 * Check against the dsl_dir's quota.  We don't add in the delta
 	 * when checking for over-quota because they get one free hit.
 	 */
 	uint64_t est_inflight = dsl_dir_space_towrite(dd);
 	for (int i = 0; i < TXG_SIZE; i++)
 		est_inflight += dd->dd_tempreserved[i];
 	uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
 
 	/*
 	 * On the first iteration, let dsl_dataset_check_quota() update
 	 * the used-on-disk and refreservation values for the dataset.
 	 * NOTE(review): the !netfree argument presumably also makes it
 	 * test whether allocating this space would exceed the dataset's
 	 * refquota -- confirm against dsl_dataset_check_quota().
 	 */
 	if (first && tx->tx_objset) {
 		int error;
 		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
 
 		error = dsl_dataset_check_quota(ds, !netfree,
 		    asize, est_inflight, &used_on_disk, &ref_rsrv);
 		if (error != 0) {
 			mutex_exit(&dd->dd_lock);
 			DMU_TX_STAT_BUMP(dmu_tx_quota);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this transaction will result in a net free of space,
 	 * we want to let it through.
 	 */
 	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0 ||
 	    (tx->tx_objset && dmu_objset_type(tx->tx_objset) == DMU_OST_ZVOL &&
 	    zvol_enforce_quotas == B_FALSE))
 		quota = UINT64_MAX;
 	else
 		quota = dsl_dir_phys(dd)->dd_quota;
 
 	/*
 	 * Adjust the quota against the actual pool size at the root
 	 * minus any outstanding deferred frees.
 	 * To ensure that it's possible to remove files from a full
 	 * pool without inducing transient overcommits, we throttle
 	 * netfree transactions against a quota that is slightly larger,
 	 * but still within the pool's allocation slop.  In cases where
 	 * we're very close to full, this will allow a steady trickle of
 	 * removes to get through.
 	 */
 	if (dd->dd_parent == NULL) {
 		uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
 		    (netfree) ?
 		    ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
 
 		if (avail < quota) {
 			/* the pool, not a quota, is now the limiting factor */
 			quota = avail;
 			retval = SET_ERROR(ENOSPC);
 		}
 	}
 
 	/*
 	 * If they are requesting more space, and our current estimate
 	 * is over quota, they get to try again unless the actual
 	 * on-disk is over quota and there are no pending changes
 	 * or deferred frees (which may free up space for us).
 	 */
 	/* in-flight estimates may overshoot the quota by up to 1/32 */
 	ext_quota = quota >> 5;
 	if (quota == UINT64_MAX)
 		ext_quota = 0;
 
 	if (used_on_disk >= quota) {
 		if (retval == ENOSPC && (used_on_disk - quota) <
 		    dsl_pool_deferred_space(dd->dd_pool)) {
 			retval = SET_ERROR(ERESTART);
 		}
 		/* Quota exceeded */
 		mutex_exit(&dd->dd_lock);
 		DMU_TX_STAT_BUMP(dmu_tx_quota);
 		return (retval);
 	} else if (used_on_disk + est_inflight >= quota + ext_quota) {
 		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 		    "quota=%lluK tr=%lluK\n",
 		    (u_longlong_t)used_on_disk>>10,
 		    (u_longlong_t)est_inflight>>10,
 		    (u_longlong_t)quota>>10, (u_longlong_t)asize>>10);
 		mutex_exit(&dd->dd_lock);
 		DMU_TX_STAT_BUMP(dmu_tx_quota);
 		return (SET_ERROR(ERESTART));
 	}
 
 	/* We need to up our estimated delta before dropping dd_lock */
 	dd->dd_tempreserved[txg & TXG_MASK] += asize;
 
 	/* how much of this reservation the parent has to account for */
 	uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
 	    asize - ref_rsrv);
 	mutex_exit(&dd->dd_lock);
 
 	/* record the reservation so that it can be cleared later */
 	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 	tr->tr_ds = dd;
 	tr->tr_size = asize;
 	list_insert_tail(tr_list, tr);
 
 	/* see if it's OK with our parent */
 	if (dd->dd_parent != NULL && parent_rsrv != 0) {
 		/*
 		 * Recurse on our parent without recursion. This has been
 		 * observed to be potentially large stack usage even within
 		 * the test suite. Largest seen stack was 7632 bytes on linux.
 		 */
 
 		dd = dd->dd_parent;
 		asize = parent_rsrv;
 		/* quota is ignored for a parent without a head dataset */
 		ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 		first = B_FALSE;
 		goto top_of_function;
 	}
 
 	return (0);
 }
 
 /*
  * Reserve space in this dsl_dir, to be used in this tx's txg.
  * After the space has been dirtied (and dsl_dir_willuse_space()
  * has been called), the reservation should be canceled, using
  * dsl_dir_tempreserve_clear().
  *
  * On success *tr_cookiep receives the cookie to hand to
  * dsl_dir_tempreserve_clear() (NULL when asize is 0).  On failure any
  * partial reservation is cleared here and *tr_cookiep is left untouched.
  */
 int
 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
     boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
 {
 	list_t *tr_list;
 	int err;
 
 	/* nothing to reserve */
 	if (asize == 0) {
 		*tr_cookiep = NULL;
 		return (0);
 	}
 
 	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 	list_create(tr_list, sizeof (struct tempreserve),
 	    offsetof(struct tempreserve, tr_node));
 	ASSERT3S(asize, >, 0);
 
 	err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
 	if (err == EAGAIN) {
 		/*
 		 * If arc_memory_throttle() detected that pageout
 		 * is running and we are low on memory, we delay new
 		 * non-pageout transactions to give pageout an
 		 * advantage.
 		 *
 		 * It is unfortunate to be delaying while the caller's
 		 * locks are held.
 		 */
 		txg_delay(dd->dd_pool, tx->tx_txg,
 		    MSEC2NSEC(10), MSEC2NSEC(10));
 		err = SET_ERROR(ERESTART);
 	} else if (err == 0) {
 		/* remember the ARC reservation (tr_ds stays NULL) */
 		struct tempreserve *arc_tr;
 
 		arc_tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 		arc_tr->tr_size = lsize;
 		list_insert_tail(tr_list, arc_tr);
 
 		/* now reserve in the dir and its ancestors */
 		err = dsl_dir_tempreserve_impl(dd, asize, netfree,
 		    B_FALSE, tr_list, tx, B_TRUE);
 	}
 
 	if (err == 0)
 		*tr_cookiep = tr_list;
 	else
 		dsl_dir_tempreserve_clear(tr_list, tx);
 
 	return (err);
 }
 
 /*
  * Clear a temporary reservation that we previously made with
  * dsl_dir_tempreserve_space().  A NULL cookie is a no-op.  Every entry
  * on the list is undone and freed, and then the list itself is freed.
  */
 void
 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
 {
 	int txgidx = tx->tx_txg & TXG_MASK;
 	list_t *tr_list = tr_cookie;
 	struct tempreserve *tr;
 
 	ASSERT3U(tx->tx_txg, !=, 0);
 
 	if (tr_cookie == NULL)
 		return;
 
 	for (tr = list_remove_head(tr_list); tr != NULL;
 	    tr = list_remove_head(tr_list)) {
 		if (tr->tr_ds == NULL) {
 			/* entries without a dir belong to the ARC */
 			arc_tempreserve_clear(tr->tr_size);
 		} else {
 			mutex_enter(&tr->tr_ds->dd_lock);
 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 			    tr->tr_size);
 			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 			mutex_exit(&tr->tr_ds->dd_lock);
 		}
 		kmem_free(tr, sizeof (struct tempreserve));
 	}
 
 	kmem_free(tr_list, sizeof (list_t));
 }
 
 /*
  * This should be called from open context when we think we're going to write
  * or free space, for example when dirtying data. Be conservative; it's okay
  * to write less space or free more, but we don't want to write more or free
  * less than the amount specified.
  *
  * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
  * version however it has been adjusted to use an iterative rather than
  * recursive algorithm to minimize stack usage.
  */
 void
 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 {
 	for (;;) {
 		uint64_t est_used;
 		int64_t rollup;
 
 		mutex_enter(&dd->dd_lock);
 		/* only growth is added to the to-write estimate */
 		if (space > 0)
 			dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 
 		est_used = dsl_dir_space_towrite(dd) +
 		    dsl_dir_phys(dd)->dd_used_bytes;
 		rollup = parent_delta(dd, est_used, space);
 		mutex_exit(&dd->dd_lock);
 
 		/* Make sure that we clean up dd_space_to* */
 		dsl_dir_dirty(dd, tx);
 
 		/* carry whatever the parent must account for upwards */
 		dd = dd->dd_parent;
 		space = rollup;
 		if (space == 0 || dd == NULL)
 			break;
 	}
 }
 
 /*
  * call from syncing context when we actually write/free space for this dd
  *
  * Applies the used/compressed/uncompressed deltas to dd's phys
  * structure, charges `used' to the `type' slot of the used breakdown
  * (when the dir tracks one), and passes the resulting accounted change
  * on to the parent.
  */
 void
 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(type < DD_USED_NUM);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	/*
 	 * dsl_dataset_set_refreservation_sync_impl() calls this with
 	 * dd_lock held, so that it can atomically update
 	 * ds->ds_reserved and the dsl_dir accounting, so that
 	 * dsl_dataset_check_quota() can see dataset and dir accounting
 	 * consistently.
 	 */
 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 	/* computed against the old dd_used_bytes, before it is updated */
 	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
 	/* a negative delta must not take a counter below zero */
 	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
 	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
 	    ddp->dd_uncompressed_bytes >= -uncompressed);
 	ddp->dd_used_bytes += used;
 	ddp->dd_uncompressed_bytes += uncompressed;
 	ddp->dd_compressed_bytes += compressed;
 
 	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		ASSERT(used >= 0 || ddp->dd_used_breakdown[type] >= -used);
 		ddp->dd_used_breakdown[type] += used;
 #ifdef ZFS_DEBUG
 		{
 			/* the breakdown slots must add up to dd_used_bytes */
 			dd_used_t t;
 			uint64_t u = 0;
 			for (t = 0; t < DD_USED_NUM; t++)
 				u += ddp->dd_used_breakdown[t];
 			ASSERT3U(u, ==, ddp->dd_used_bytes);
 		}
 #endif
 	}
 	if (needlock)
 		mutex_exit(&dd->dd_lock);
 
 	/*
 	 * The parent's usage changes by accounted_delta; in its breakdown
 	 * `used' goes to DD_USED_CHILD and the remainder to
 	 * DD_USED_CHILD_RSRV.
 	 */
 	if (dd->dd_parent != NULL) {
 		dsl_dir_diduse_transfer_space(dd->dd_parent,
 		    accounted_delta, compressed, uncompressed,
 		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 	}
 }
 
 /*
  * Move delta bytes from the oldtype slot to the newtype slot of dd's
  * used breakdown (a negative delta moves space the other way).  The
  * total dd_used_bytes is unchanged.  Nothing is done when delta is 0 or
  * the dir does not track a breakdown.  Syncing context only.
  */
 void
 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(oldtype < DD_USED_NUM);
 	ASSERT(newtype < DD_USED_NUM);
 
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 
 	/* nothing to move */
 	if (delta == 0)
 		return;
 	/* this dir keeps no per-type breakdown */
 	if ((ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) == 0)
 		return;
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	mutex_enter(&dd->dd_lock);
 	ASSERT(delta > 0 ?
 	    ddp->dd_used_breakdown[oldtype] >= delta :
 	    ddp->dd_used_breakdown[newtype] >= -delta);
 	ASSERT(ddp->dd_used_bytes >= ABS(delta));
 	ddp->dd_used_breakdown[newtype] += delta;
 	ddp->dd_used_breakdown[oldtype] -= delta;
 	mutex_exit(&dd->dd_lock);
 }
 
 /*
  * Combined "did use" and "transfer" for syncing context: apply the
  * used/compressed/uncompressed deltas to dd's totals and, in the used
  * breakdown, add `tonew' to the newtype slot and the remainder
  * (used - tonew) to the oldtype slot.  The resulting accounted change is
  * then propagated to the parent by calling this function again.
  */
 void
 dsl_dir_diduse_transfer_space(dsl_dir_t *dd, int64_t used,
     int64_t compressed, int64_t uncompressed, int64_t tonew,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(oldtype < DD_USED_NUM);
 	ASSERT(newtype < DD_USED_NUM);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	mutex_enter(&dd->dd_lock);
 	dsl_dir_phys_t *ddp = dsl_dir_phys(dd);
 	/* computed against the old dd_used_bytes, before it is updated */
 	accounted_delta = parent_delta(dd, ddp->dd_used_bytes, used);
 	/* a negative delta must not take a counter below zero */
 	ASSERT(used >= 0 || ddp->dd_used_bytes >= -used);
 	ASSERT(compressed >= 0 || ddp->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
 	    ddp->dd_uncompressed_bytes >= -uncompressed);
 	ddp->dd_used_bytes += used;
 	ddp->dd_uncompressed_bytes += uncompressed;
 	ddp->dd_compressed_bytes += compressed;
 
 	if (ddp->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		ASSERT(tonew - used <= 0 ||
 		    ddp->dd_used_breakdown[oldtype] >= tonew - used);
 		ASSERT(tonew >= 0 ||
 		    ddp->dd_used_breakdown[newtype] >= -tonew);
 		/* oldtype += used - tonew; newtype += tonew */
 		ddp->dd_used_breakdown[oldtype] -= tonew - used;
 		ddp->dd_used_breakdown[newtype] += tonew;
 #ifdef ZFS_DEBUG
 		{
 			/* the breakdown slots must add up to dd_used_bytes */
 			dd_used_t t;
 			uint64_t u = 0;
 			for (t = 0; t < DD_USED_NUM; t++)
 				u += ddp->dd_used_breakdown[t];
 			ASSERT3U(u, ==, ddp->dd_used_bytes);
 		}
 #endif
 	}
 	mutex_exit(&dd->dd_lock);
 
 	/*
 	 * The parent's usage changes by accounted_delta; in its breakdown
 	 * `used' goes to DD_USED_CHILD and the remainder to
 	 * DD_USED_CHILD_RSRV.
 	 */
 	if (dd->dd_parent != NULL) {
 		dsl_dir_diduse_transfer_space(dd->dd_parent,
 		    accounted_delta, compressed, uncompressed,
 		    used, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 	}
 }
 
 /*
  * Argument passed to the quota and reservation sync tasks
  * (dsl_dir_set_quota() / dsl_dir_set_reservation()).
  */
 typedef struct dsl_dir_set_qr_arg {
 	const char *ddsqra_name;	/* name of the dataset to modify */
 	zprop_source_t ddsqra_source;	/* source of the property value */
 	uint64_t ddsqra_value;		/* requested quota or reservation */
 } dsl_dir_set_qr_arg_t;
 
 /*
  * Check half of dsl_dir_set_quota().  Predicts the effective quota that
  * would result from the requested change and fails with ENOSPC when a
  * non-zero quota would be smaller than the dir's reservation or than
  * its current usage.  Returns 0 when the change is acceptable, or the
  * error from holding the dataset / predicting the property.
  */
 static int
 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
 	uint64_t towrite, newval;
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Name the property the same way dsl_dir_set_quota_sync() and the
 	 * reservation check do, rather than hard-coding the string.
 	 */
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_QUOTA),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	/* a predicted value of 0 is accepted without any space checks */
 	if (newval == 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	mutex_enter(&ds->ds_dir->dd_lock);
 	/*
 	 * If we are doing the preliminary check in open context, and
 	 * there are pending changes, then don't fail it, since the
 	 * pending changes could under-estimate the amount of space to be
 	 * freed up.
 	 */
 	towrite = dsl_dir_space_towrite(ds->ds_dir);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
 	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
 	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
 		error = SET_ERROR(ENOSPC);
 	}
 	mutex_exit(&ds->ds_dir->dd_lock);
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Sync half of dsl_dir_set_quota(): record the property change and
  * store the resulting value in the dir's dd_quota.
  */
 static void
 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t newval;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
 		/* set the property, then read back the effective value */
 		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 		    &ddsqra->ddsqra_value, tx);
 
 		VERIFY0(dsl_prop_get_int_ds(ds,
 		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
 	} else {
 		/* older pools: use the requested value and just log it */
 		newval = ddsqra->ddsqra_value;
 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
 		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
 	}
 
 	/* dd_quota lives in the dir's phys structure; dirty it first */
 	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 	mutex_enter(&ds->ds_dir->dd_lock);
 	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
 	mutex_exit(&ds->ds_dir->dd_lock);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Set the quota of the dir named ddname by running the quota check and
  * sync functions as a sync task.  Returns the sync task's result.
  */
 int
 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 {
 	dsl_dir_set_qr_arg_t ddsqra = {
 		.ddsqra_name = ddname,
 		.ddsqra_source = source,
 		.ddsqra_value = quota,
 	};
 
 	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
 	    dsl_dir_set_quota_sync, &ddsqra, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Check half of dsl_dir_set_reservation().  Only enforced in syncing
  * context: fails with ENOSPC when the reservation would grow the dir's
  * accounted space by more than is available, or when the new value
  * exceeds a non-zero quota.
  */
 static int
 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	uint64_t newval, used, avail;
 	int error;
 
 	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 	if (error != 0)
 		return (error);
 	dd = ds->ds_dir;
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
 	if (!dmu_tx_is_syncing(tx)) {
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	error = dsl_prop_predict(ds->ds_dir,
 	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 	if (error != 0) {
 		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
 	mutex_enter(&dd->dd_lock);
 	used = dsl_dir_phys(dd)->dd_used_bytes;
 	mutex_exit(&dd->dd_lock);
 
 	/* space available to us: from the parent, or from the pool */
 	if (dd->dd_parent) {
 		avail = dsl_dir_space_available(dd->dd_parent,
 		    NULL, 0, FALSE);
 	} else {
 		avail = dsl_pool_adjustedsize(dd->dd_pool,
 		    ZFS_SPACE_CHECK_NORMAL) - used;
 	}
 
 	/* only an increase in accounted space (max of used, reserved) can fail */
 	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
 		uint64_t delta = MAX(used, newval) -
 		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
 
 		if (delta > avail ||
 		    (dsl_dir_phys(dd)->dd_quota > 0 &&
 		    newval > dsl_dir_phys(dd)->dd_quota))
 			error = SET_ERROR(ENOSPC);
 	}
 
 	dsl_dataset_rele(ds, FTAG);
 	return (error);
 }
 
 /*
  * Store `value' as dd's reservation and charge the resulting change in
  * accounted space (the max of used and reserved) to the parent as
  * DD_USED_CHILD_RSRV.
  */
 void
 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
 {
 	uint64_t used;
 	int64_t delta;
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	mutex_enter(&dd->dd_lock);
 	used = dsl_dir_phys(dd)->dd_used_bytes;
 	/* delta is computed against the old reservation, then it is replaced */
 	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
 	dsl_dir_phys(dd)->dd_reserved = value;
 
 	/* note: the parent is updated while our own dd_lock is still held */
 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
 		    delta, 0, 0, tx);
 	}
 	mutex_exit(&dd->dd_lock);
 }
 
 /*
  * Sync half of dsl_dir_set_reservation(): record the property change
  * and apply the resulting value through
  * dsl_dir_set_reservation_sync_impl().
  */
 static void
 dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_set_qr_arg_t *ddsqra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t newval;
 
 	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
 		/* set the property, then read back the effective value */
 		dsl_prop_set_sync_impl(ds,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 		    &ddsqra->ddsqra_value, tx);
 
 		VERIFY0(dsl_prop_get_int_ds(ds,
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
 	} else {
 		/* older pools: use the requested value and just log it */
 		newval = ddsqra->ddsqra_value;
 		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
 		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
 		    (longlong_t)newval);
 	}
 
 	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Set the reservation of the dir named ddname by running the
  * reservation check and sync functions as a sync task.  Returns the
  * sync task's result.
  */
 int
 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation)
 {
 	dsl_dir_set_qr_arg_t ddsqra = {
 		.ddsqra_name = ddname,
 		.ddsqra_source = source,
 		.ddsqra_value = reservation,
 	};
 
 	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
 	    dsl_dir_set_reservation_sync, &ddsqra, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Return the nearest dir that lies on both ds1's and ds2's chain of
  * parents (either dir itself counts), or NULL if the chains share none.
  */
 static dsl_dir_t *
 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
 {
 	dsl_dir_t *outer, *inner;
 
 	for (outer = ds1; outer != NULL; outer = outer->dd_parent) {
 		for (inner = ds2; inner != NULL; inner = inner->dd_parent) {
 			if (inner == outer)
 				return (inner);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * If delta is applied to dd, how much of that delta would be applied to
  * ancestor?  Syncing context only.
  *
  * Walks from dd up to (but not including) ancestor, letting each level
  * reduce the delta to what its parent would have to account for.
  */
 static int64_t
 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
 {
 	while (dd != ancestor) {
 		mutex_enter(&dd->dd_lock);
 		delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes,
 		    delta);
 		mutex_exit(&dd->dd_lock);
 
 		dd = dd->dd_parent;
 	}
 
 	return (delta);
 }
 
 /* Argument passed to the rename sync task (see dsl_dir_rename()). */
 typedef struct dsl_dir_rename_arg {
 	const char *ddra_oldname;	/* current name of the dir */
 	const char *ddra_newname;	/* requested new name */
 	cred_t *ddra_cred;		/* CRED() of the caller */
 	proc_t *ddra_proc;		/* curproc of the caller */
 } dsl_dir_rename_arg_t;
 
 /*
  * Argument to dsl_valid_rename(): how a rename changes the length and
  * nesting depth of the renamed dataset's name.
  */
 typedef struct dsl_valid_rename_arg {
 	int char_delta;		/* strlen(newname) - strlen(oldname) */
 	int nest_delta;		/* depth(newname) - depth(oldname) */
 } dsl_valid_rename_arg_t;
 
 /*
  * dmu_objset_find_dp() callback used by the rename check: verify that
  * ds's name would still be short enough, and (when nesting grows) not
  * too deeply nested, after the deltas in arg are applied.  Returns
  * ENAMETOOLONG if either limit would be reached, 0 otherwise.
  */
 static int
 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	(void) dp;
 	dsl_valid_rename_arg_t *dvra = arg;
 	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
 	dsl_dataset_name(ds, namebuf);
 
 	ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
 	    <, ZFS_MAX_DATASET_NAME_LEN);
 	int newlen = strlen(namebuf) + dvra->char_delta;
 	int newdepth = get_dataset_depth(namebuf) + dvra->nest_delta;
 
 	/* the depth limit only matters when the rename adds nesting */
 	if (newlen >= ZFS_MAX_DATASET_NAME_LEN ||
 	    (dvra->nest_delta > 0 && newdepth >= zfs_max_dataset_nesting))
 		return (SET_ERROR(ENAMETOOLONG));
 
 	return (0);
 }
 
 /*
  * Check half of dsl_dir_rename().  Verifies that the source dir and the
  * new parent exist in the same pool, that the new name is free, that
  * the new parent is a filesystem, that all descendant names stay within
  * the length and nesting limits, and -- when the parent changes -- that
  * the move passes the encryption, descendant, space and count-limit
  * checks.  Returns 0 if the rename may proceed.
  *
  * Both dir holds are dropped on every path through the labels at the
  * end of the function.
  */
 static int
 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_rename_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd, *newparent;
 	dsl_valid_rename_arg_t dvra;
 	dsl_dataset_t *parentds;
 	objset_t *parentos;
 	const char *mynewname;
 	int error;
 
 	/* target dir should exist */
 	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
 	if (error != 0)
 		return (error);
 
 	/* new parent should exist */
 	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
 	    &newparent, &mynewname);
 	if (error != 0)
 		goto out_dd;
 
 	/* can't rename to different pool */
 	if (dd->dd_pool != newparent->dd_pool) {
 		error = SET_ERROR(EXDEV);
 		goto out_newparent;
 	}
 
 	/* new name should not already exist */
 	if (mynewname == NULL) {
 		error = SET_ERROR(EEXIST);
 		goto out_newparent;
 	}
 
 	/* can't rename below anything but filesystems (eg. no ZVOLs) */
 	error = dsl_dataset_hold_obj(newparent->dd_pool,
 	    dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
 	if (error != 0)
 		goto out_newparent;
 	error = dmu_objset_from_ds(parentds, &parentos);
 	if (error == 0 && dmu_objset_type(parentos) != DMU_OST_ZFS)
 		error = SET_ERROR(ZFS_ERR_WRONG_PARENT);
 	/* the parent dataset is only needed for the type check */
 	dsl_dataset_rele(parentds, FTAG);
 	if (error != 0)
 		goto out_newparent;
 
 	ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
 	    <, ZFS_MAX_DATASET_NAME_LEN);
 	ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
 	    <, ZFS_MAX_DATASET_NAME_LEN);
 	dvra.char_delta = strlen(ddra->ddra_newname)
 	    - strlen(ddra->ddra_oldname);
 	dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
 	    - get_dataset_depth(ddra->ddra_oldname);
 
 	/* if the name length is growing, validate child name lengths */
 	if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
 		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
 		    &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
 		if (error != 0)
 			goto out_newparent;
 	}
 
 	if (dmu_tx_is_syncing(tx) &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
 		/*
 		 * Although this is the check function and we don't
 		 * normally make on-disk changes in check functions,
 		 * we need to do that here.
 		 *
 		 * Ensure this portion of the tree's counts have been
 		 * initialized in case the new parent has limits set.
 		 */
 		dsl_dir_init_fs_ss_count(dd, tx);
 	}
 
 	if (newparent != dd->dd_parent) {
 		/* is there enough space? */
 		uint64_t myspace =
 		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
 		    dsl_dir_phys(dd)->dd_reserved);
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 		uint64_t fs_cnt = 0;
 		uint64_t ss_cnt = 0;
 
 		if (dsl_dir_is_zapified(dd)) {
 			int err;
 
 			/* a missing count (ENOENT) is treated as zero */
 			err = zap_lookup(os, dd->dd_object,
 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
 			    &fs_cnt);
 			if (err != ENOENT && err != 0) {
 				error = err;
 				goto out_newparent;
 			}
 
 			/*
 			 * have to add 1 for the filesystem itself that we're
 			 * moving
 			 */
 			fs_cnt++;
 
 			err = zap_lookup(os, dd->dd_object,
 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
 			    &ss_cnt);
 			if (err != ENOENT && err != 0) {
 				error = err;
 				goto out_newparent;
 			}
 		}
 
 		/* check for encryption errors */
 		error = dsl_dir_rename_crypt_check(dd, newparent);
 		if (error != 0) {
 			error = SET_ERROR(EACCES);
 			goto out_newparent;
 		}
 
 		/* no rename into our descendant */
 		if (closest_common_ancestor(dd, newparent) == dd) {
 			error = SET_ERROR(EINVAL);
 			goto out_newparent;
 		}
 
 		error = dsl_dir_transfer_possible(dd->dd_parent,
 		    newparent, fs_cnt, ss_cnt, myspace,
 		    ddra->ddra_cred, ddra->ddra_proc);
 		if (error != 0)
 			goto out_newparent;
 	}
 
 	/* every check passed */
 	error = 0;
 out_newparent:
 	dsl_dir_rele(newparent, FTAG);
 out_dd:
 	dsl_dir_rele(dd, FTAG);
 	return (error);
 }
 
 /*
  * Sync half of dsl_dir_rename().  When the parent changes, moves the
  * dir's filesystem/snapshot counts and space accounting from the old
  * parent to the new one.  Then re-links the dir: removes its entry from
  * the old parent's child-dir ZAP, updates its name and parent, and adds
  * it to the new parent's child-dir ZAP.
  */
 static void
 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_dir_rename_arg_t *ddra = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *dd, *newparent;
 	const char *mynewname;
 	objset_t *mos = dp->dp_meta_objset;
 
 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
 	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
 	    &mynewname));
 
 	ASSERT3P(mynewname, !=, NULL);
 
 	/* Log this before we change the name. */
 	spa_history_log_internal_dd(dd, "rename", tx,
 	    "-> %s", ddra->ddra_newname);
 
 	if (newparent != dd->dd_parent) {
 		objset_t *os = dd->dd_pool->dp_meta_objset;
 		uint64_t fs_cnt = 0;
 		uint64_t ss_cnt = 0;
 
 		/*
 		 * We already made sure the dd counts were initialized in the
 		 * check function.
 		 */
 		if (spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_FS_SS_LIMIT)) {
 			VERIFY0(zap_lookup(os, dd->dd_object,
 			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
 			    &fs_cnt));
 			/* add 1 for the filesystem itself that we're moving */
 			fs_cnt++;
 
 			VERIFY0(zap_lookup(os, dd->dd_object,
 			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
 			    &ss_cnt));
 		}
 
 		/* move the counts from the old parent to the new one */
 		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 		dsl_fs_ss_count_adjust(newparent, fs_cnt,
 		    DD_FIELD_FILESYSTEM_COUNT, tx);
 
 		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 		dsl_fs_ss_count_adjust(newparent, ss_cnt,
 		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
 		/* move our used space from the old parent to the new one */
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
 		    -dsl_dir_phys(dd)->dd_used_bytes,
 		    -dsl_dir_phys(dd)->dd_compressed_bytes,
 		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
 		    dsl_dir_phys(dd)->dd_used_bytes,
 		    dsl_dir_phys(dd)->dd_compressed_bytes,
 		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
 
 		/* likewise for the part of our reservation not yet used */
 		if (dsl_dir_phys(dd)->dd_reserved >
 		    dsl_dir_phys(dd)->dd_used_bytes) {
 			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
 			    dsl_dir_phys(dd)->dd_used_bytes;
 
 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
 			    -unused_rsrv, 0, 0, tx);
 			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
 			    unused_rsrv, 0, 0, tx);
 		}
 	}
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	/* remove from old parent zapobj */
 	VERIFY0(zap_remove(mos,
 	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
 	    dd->dd_myname, tx));
 
 	(void) strlcpy(dd->dd_myname, mynewname,
 	    sizeof (dd->dd_myname));
 	/* swap the hold on the old parent for one on the new parent */
 	dsl_dir_rele(dd->dd_parent, dd);
 	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
 	VERIFY0(dsl_dir_hold_obj(dp,
 	    newparent->dd_object, NULL, dd, &dd->dd_parent));
 
 	/* add to new parent zapobj */
 	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
 	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
 
 	/* TODO: A rename callback to avoid these layering violations. */
 	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
 	zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
 	    ddra->ddra_newname, B_TRUE);
 
 	dsl_prop_notify_all(dd);
 
 	dsl_dir_rele(newparent, FTAG);
 	dsl_dir_rele(dd, FTAG);
 }
 
 int
 dsl_dir_rename(const char *oldname, const char *newname)
 {
 	dsl_dir_rename_arg_t ddra;
 
 	ddra.ddra_oldname = oldname;
 	ddra.ddra_newname = newname;
 	ddra.ddra_cred = CRED();
 	ddra.ddra_proc = curproc;
 
 	return (dsl_sync_task(oldname,
 	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
 	    3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Check whether 'space' bytes, 'fs_cnt' filesystems and 'ss_cnt' snapshots
  * could be moved from 'sdd' to 'tdd'.
  *
  * Space is evaluated relative to the closest common ancestor of the two
  * dirs.  Returns ENOSPC when the space available to 'tdd' is smaller than
  * 'space', the error from dsl_fs_ss_limit_check() when either the
  * filesystem or the snapshot limit check fails (checked in that order),
  * and 0 otherwise.
  */
 int
 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
     uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
     cred_t *cr, proc_t *proc)
 {
 	dsl_dir_t *common = closest_common_ancestor(sdd, tdd);
 	int64_t delta = would_change(sdd, -space, common);
 	uint64_t room = dsl_dir_space_available(tdd, common, delta, FALSE);
 	int error;
 
 	if (room < space)
 		return (SET_ERROR(ENOSPC));
 
 	error = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
 	    common, cr, proc);
 	if (error == 0) {
 		error = dsl_fs_ss_limit_check(tdd, ss_cnt,
 		    ZFS_PROP_SNAPSHOT_LIMIT, common, cr, proc);
 	}
 
 	return (error);
 }
 
 /*
  * Return a copy of dd->dd_snap_cmtime, read while holding dd->dd_lock.
  */
 inode_timespec_t
 dsl_dir_snap_cmtime(dsl_dir_t *dd)
 {
 	inode_timespec_t cmtime;
 
 	mutex_enter(&dd->dd_lock);
 	cmtime = dd->dd_snap_cmtime;
 	mutex_exit(&dd->dd_lock);
 
 	return (cmtime);
 }
 
 /*
  * Stamp 'dd' with the current time as its snapshot change time.
  *
  * The in-core dd_snap_cmtime is always updated.  When the pool has
  * SPA_FEATURE_EXTENSIBLE_DATASET enabled, the dir is zapified and the same
  * timestamp is stored under DD_FIELD_SNAPSHOTS_CHANGED as an array of
  * uint64_t.  All of this happens under dd->dd_lock.
  */
 void
 dsl_dir_snap_cmtime_update(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	inode_timespec_t now;
 
 	gethrestime(&now);
 
 	mutex_enter(&dd->dd_lock);
 	dd->dd_snap_cmtime = now;
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_EXTENSIBLE_DATASET)) {
 		objset_t *meta_os = dd->dd_pool->dp_meta_objset;
 		uint64_t dir_obj = dd->dd_object;
 
 		dsl_dir_zapify(dd, tx);
 		VERIFY0(zap_update(meta_os, dir_obj,
 		    DD_FIELD_SNAPSHOTS_CHANGED, sizeof (uint64_t),
 		    sizeof (inode_timespec_t) / sizeof (uint64_t),
 		    &now, tx));
 	}
 	mutex_exit(&dd->dd_lock);
 }
 
 /*
  * Zapify the object backing 'dd' in the pool's meta objset, passing
  * DMU_OT_DSL_DIR as the object type to dmu_object_zapify().
  */
 void
 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
 {
 	dmu_object_zapify(dd->dd_pool->dp_meta_objset, dd->dd_object,
 	    DMU_OT_DSL_DIR, tx);
 }
 
 /*
  * Report whether the object behind dd->dd_dbuf has type
  * DMU_OTN_ZAP_METADATA.
  */
 boolean_t
 dsl_dir_is_zapified(dsl_dir_t *dd)
 {
 	dmu_object_info_t info;
 
 	dmu_object_info_from_db(dd->dd_dbuf, &info);
 
 	return (info.doi_type == DMU_OTN_ZAP_METADATA ? B_TRUE : B_FALSE);
 }
 
 /*
  * Open the livelist stored in MOS object 'obj' into dd->dd_livelist and,
  * on success, create the dd_pending_allocs / dd_pending_frees bplists.
  *
  * Asserts that SPA_FEATURE_LIVELIST is active.  Returns 0 on success or
  * the error from dsl_deadlist_open(), in which case the bplists are not
  * created.
  */
 int
 dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
 {
 	dsl_pool_t *dp = dd->dd_pool;
 	int error;
 
 	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LIVELIST));
 
 	error = dsl_deadlist_open(&dd->dd_livelist, dp->dp_meta_objset, obj);
 	if (error == 0) {
 		bplist_create(&dd->dd_pending_allocs);
 		bplist_create(&dd->dd_pending_frees);
 	}
 
 	return (error);
 }
 
 /*
  * Tear down the in-core livelist state of 'dd': close dd_livelist and
  * destroy the dd_pending_allocs and dd_pending_frees bplists.
  */
 void
 dsl_dir_livelist_close(dsl_dir_t *dd)
 {
 	dsl_deadlist_close(&dd->dd_livelist);
 	bplist_destroy(&dd->dd_pending_allocs);
 	bplist_destroy(&dd->dd_pending_frees);
 }
 
 /*
  * Remove the livelist of 'dd'.
  *
  * Does nothing if dd's livelist is not open.  Otherwise any pending
  * condense of this dir's livelist is cancelled, the in-core livelist is
  * closed and the DD_FIELD_LIVELIST entry is removed from the dir's ZAP.
  * If 'total' is set, the livelist object itself is freed and the
  * SPA_FEATURE_LIVELIST refcount is decremented.
  */
 void
 dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
 {
 	uint64_t obj;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	spa_t *spa = dp->dp_spa;
 	/* Snapshot (by value) of the condense request as of function entry. */
 	livelist_condense_entry_t to_condense = spa->spa_to_condense;
 
 	if (!dsl_deadlist_is_open(&dd->dd_livelist))
 		return;
 
 	/*
 	 * If the livelist being removed is set to be condensed, stop the
 	 * condense zthr and indicate the cancellation in the spa_to_condense
 	 * struct in case the condense no-wait synctask has already started
 	 */
 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
 	if (ll_condense_thread != NULL &&
 	    (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
 		/*
 		 * We use zthr_wait_cycle_done instead of zthr_cancel
 		 * because we don't want to destroy the zthr, just have
 		 * it skip its current task.
 		 */
 		spa->spa_to_condense.cancelled = B_TRUE;
 		zthr_wait_cycle_done(ll_condense_thread);
 		/*
 		 * If we've returned from zthr_wait_cycle_done without
 		 * clearing the to_condense data structure, it's either
 		 * because the no-wait synctask has started (which is
 		 * indicated by the 'syncing' field of to_condense), in
 		 * which case we can expect it to clear to_condense on its
 		 * own, or because we returned before the zthr ran. In the
 		 * latter case the checkfunc will now fail as
 		 * cancelled == B_TRUE, so we can safely NULL out ds,
 		 * allowing a different dir's livelist to be condensed.
 		 *
 		 * We can be sure that the to_condense struct will not
 		 * be repopulated at this stage because both this
 		 * function and dsl_livelist_try_condense execute in
 		 * syncing context.
 		 */
 		if ((spa->spa_to_condense.ds != NULL) &&
 		    !spa->spa_to_condense.syncing) {
 			dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
 			    spa);
 			spa->spa_to_condense.ds = NULL;
 		}
 	}
 
 	dsl_dir_livelist_close(dd);
 	/* Fetch the livelist object number before its ZAP entry is removed. */
 	VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
 	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
 	VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
 	    DD_FIELD_LIVELIST, tx));
 	if (total) {
 		dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
 		spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
 	}
 }
 
 /*
  * Determine whether 'activity' is still in progress for dataset 'ds' of
  * dir 'dd', storing the answer in *in_progress.
  *
  * The caller must hold dd->dd_activity_lock.  The hold on 'ds' belongs to
  * the caller: this function never takes one and therefore never releases
  * one, on any path.
  *
  * Returns 0 on success, or the error from the failing lookup, in which
  * case *in_progress is left untouched.  Panics on an unrecognized
  * activity.
  */
 static int
 dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
     zfs_wait_activity_t activity, boolean_t *in_progress)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&dd->dd_activity_lock));
 
 	switch (activity) {
 	case ZFS_WAIT_DELETEQ: {
 #ifdef _KERNEL
 		objset_t *os;
 		error = dmu_objset_from_ds(ds, &os);
 		if (error != 0)
 			break;
 
 		mutex_enter(&os->os_user_ptr_lock);
 		void *user = dmu_objset_get_user(os);
 		mutex_exit(&os->os_user_ptr_lock);
 		/*
 		 * Nothing to wait for unless this is a ZFS objset with a
 		 * user pointer set whose vfs is not flagged unmounted.
 		 */
 		if (dmu_objset_type(os) != DMU_OST_ZFS ||
 		    user == NULL || zfs_get_vfs_flag_unmounted(os)) {
 			*in_progress = B_FALSE;
 			return (0);
 		}
 
 		uint64_t readonly = B_FALSE;
 		error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
 		    NULL);
 
 		if (error != 0)
 			break;
 
 		/* Read-only datasets and non-writeable pools report idle. */
 		if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
 			*in_progress = B_FALSE;
 			return (0);
 		}
 
 		uint64_t count, unlinked_obj;
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 		    &unlinked_obj);
 		/*
 		 * Just report the error.  'ds' is held by our caller, so
 		 * releasing it here would drop a hold we never took.
 		 */
 		if (error != 0)
 			break;
 		error = zap_count(os, unlinked_obj, &count);
 
 		/* In progress while the unlinked set still has entries. */
 		if (error == 0)
 			*in_progress = (count != 0);
 		break;
 #else
 		/*
 		 * The delete queue is ZPL specific, and libzpool doesn't have
 		 * it. It doesn't make sense to wait for it.
 		 */
 		(void) ds;
 		*in_progress = B_FALSE;
 		break;
 #endif
 	}
 	default:
 		panic("unrecognized value for activity %d", activity);
 	}
 
 	return (error);
 }
 
 /*
  * Block until 'activity' is no longer in progress for 'ds'.
  *
  * The caller must hold dd->dd_activity_lock; it is the mutex passed to
  * cv_wait_sig() and is asserted held by dsl_dir_activity_in_progress().
  * The pool config lock is taken only around each progress check.
  *
  * *waited is set to B_TRUE if the activity was found in progress at least
  * once; it is never cleared here.
  *
  * Returns 0 once the activity is idle, the error from the progress check,
  * or EINTR if cv_wait_sig() returned 0 or the waiters were cancelled.
  */
 int
 dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
     boolean_t *waited)
 {
 	dsl_pool_t *pool = dd->dd_pool;
 	boolean_t busy;
 	int error;
 
 	for (;;) {
 		dsl_pool_config_enter(pool, FTAG);
 		error = dsl_dir_activity_in_progress(dd, ds, activity, &busy);
 		dsl_pool_config_exit(pool, FTAG);
 
 		if (error != 0 || !busy)
 			return (error);
 
 		*waited = B_TRUE;
 
 		if (cv_wait_sig(&dd->dd_activity_cv,
 		    &dd->dd_activity_lock) == 0 ||
 		    dd->dd_activity_cancelled)
 			return (SET_ERROR(EINTR));
 	}
 }
 
 /*
  * Cancel every thread waiting on dd's activity cv: mark the dir as
  * cancelled, wake all waiters, and then block (uninterruptibly) on the
  * same cv until dd_activity_waiters has dropped to zero.
  */
 void
 dsl_dir_cancel_waiters(dsl_dir_t *dd)
 {
 	mutex_enter(&dd->dd_activity_lock);
 	dd->dd_activity_cancelled = B_TRUE;
 	cv_broadcast(&dd->dd_activity_cv);
 	/* cv_wait() drops dd_activity_lock while sleeping. */
 	while (dd->dd_activity_waiters > 0)
 		cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
 	mutex_exit(&dd->dd_activity_lock);
 }
 
 #if defined(_KERNEL)
 /* Symbols exported from this file; kernel builds only. */
 EXPORT_SYMBOL(dsl_dir_set_quota);
 EXPORT_SYMBOL(dsl_dir_set_reservation);
 #endif
 
-/* CSTYLED */
 /* Read-write (ZMOD_RW) integer tunable: strict ZVOL quota enforcement. */
 ZFS_MODULE_PARAM(zfs, , zvol_enforce_quotas, INT, ZMOD_RW,
 	"Enable strict ZVOL quota enforcement");
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 19fa76931b6e..3eba4cb35cc6 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -1,5348 +1,5347 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2016 Gary Mills
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright 2019 Joyent, Inc.
  */
 
 #include <sys/dsl_scan.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/range_tree.h>
 #include <sys/dbuf.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
 /*
  * Grand theory statement on scan queue sorting
  *
  * Scanning is implemented by recursively traversing all indirection levels
  * in an object and reading all blocks referenced from said objects. This
  * results in us approximately traversing the object from lowest logical
  * offset to the highest. For best performance, we would want the logical
  * blocks to be physically contiguous. However, this is frequently not the
  * case with pools given the allocation patterns of copy-on-write filesystems.
  * So instead, we put the I/Os into a reordering queue and issue them in a
  * way that will most benefit physical disks (LBA-order).
  *
  * Queue management:
  *
  * Ideally, we would want to scan all metadata and queue up all block I/O
  * prior to starting to issue it, because that allows us to do an optimal
  * sorting job. This can however consume large amounts of memory. Therefore
  * we continuously monitor the size of the queues and constrain them to 5%
  * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
  * limit, we clear out a few of the largest extents at the head of the queues
  * to make room for more scanning. Hopefully, these extents will be fairly
  * large and contiguous, allowing us to approach sequential I/O throughput
  * even without a fully sorted tree.
  *
  * Metadata scanning takes place in dsl_scan_visit(), which is called from
  * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
  * metadata on the pool, or we need to make room in memory because our
  * queues are too large, dsl_scan_visit() is postponed and
  * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
  * that metadata scanning and queued I/O issuing are mutually exclusive. This
  * allows us to provide maximum sequential I/O throughput for the majority of
  * I/Os issued since sequential I/O performance is significantly negatively
  * impacted if it is interleaved with random I/O.
  *
  * Implementation Notes
  *
  * One side effect of the queued scanning algorithm is that the scanning code
  * needs to be notified whenever a block is freed. This is needed to allow
  * the scanning code to remove these I/Os from the issuing queue. Additionally,
  * we do not attempt to queue gang blocks to be issued sequentially since this
  * is very hard to do and would have an extremely limited performance benefit.
  * Instead, we simply issue gang I/Os as soon as we find them using the legacy
  * algorithm.
  *
  * Backwards compatibility
  *
  * This new algorithm is backwards compatible with the legacy on-disk data
  * structures (and therefore does not require a new feature flag).
  * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
  * will stop scanning metadata (in logical order) and wait for all outstanding
  * sorted I/O to complete. Once this is done, we write out a checkpoint
  * bookmark, indicating that we have scanned everything logically before it.
  * If the pool is imported on a machine without the new sorting algorithm,
  * the scan simply resumes from the last checkpoint using the legacy algorithm.
  */
 
 /*
  * Signature of a scan callback.  The scan_funcs[] table in this file holds
  * one of these per scan function.
  */
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
 
 /* Prototypes for file-local helpers that are used before their definition. */
 static int scan_ds_queue_compare(const void *a, const void *b);
 static int scan_prefetch_queue_compare(const void *a, const void *b);
 static void scan_ds_queue_clear(dsl_scan_t *scn);
 static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
 static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
     uint64_t *txg);
 static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
 static uint64_t dsl_scan_count_data_disks(spa_t *spa);
 static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
 
 /* Defined outside this file. */
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
 
 /*
  * 'zpool status' uses bytes processed per pass to report throughput and
  * estimate time remaining.  We define a pass to start when the scanning
  * phase completes for a sequential resilver.  Optionally, this value
  * may be used to reset the pass statistics every N txgs to provide an
  * estimated completion time based on currently observed performance.
  */
 static uint_t zfs_scan_report_txgs = 0;
 
 /*
  * By default zfs will check to ensure it is not over the hard memory
  * limit before each txg. If finer-grained control of this is needed
  * this value can be set to 1 to enable checking before scanning each
  * block.
  */
 static int zfs_scan_strict_mem_lim = B_FALSE;
 
 /*
  * Maximum number of bytes in flight per leaf vdev. We attempt
  * to strike a balance here between keeping the vdev queues full of I/Os
  * at all times and not overflowing the queues to cause long latency,
  * which would cause long txg sync times. No matter what, we will not
  * overload the drives with I/O, since that is protected by
  * zfs_vdev_scrub_max_active.
  */
 static uint64_t zfs_scan_vdev_limit = 16 << 20;
 
 /* NOTE(review): meaning of the strategy values is not visible here. */
 static uint_t zfs_scan_issue_strategy = 0;
 
 /* don't queue & sort zios, go direct */
 static int zfs_scan_legacy = B_FALSE;
 static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
 
 /*
  * fill_weight is non-tunable at runtime, so we copy it at module init from
  * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
  * break queue sorting.
  */
 static uint_t zfs_scan_fill_weight = 3;
 static uint64_t fill_weight;
 
 /* See dsl_scan_should_clear() for details on the memory limit tunables */
 static const uint64_t zfs_scan_mem_lim_min = 16 << 20;	/* bytes */
 static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;	/* bytes */
 
 
 /*
  * fraction of physmem, expressed as a divisor (the default of 20 is the
  * 5% quoted in the theory statement at the top of this file)
  */
 static uint_t zfs_scan_mem_lim_fact = 20;
 
 /* fraction of mem lim above (presumably also a divisor -- confirm) */
 static uint_t zfs_scan_mem_lim_soft_fact = 20;
 
 /* minimum milliseconds to scrub per txg */
 static uint_t zfs_scrub_min_time_ms = 1000;
 
 /* minimum milliseconds to obsolete per txg */
 static uint_t zfs_obsolete_min_time_ms = 500;
 
 /* minimum milliseconds to free per txg */
 static uint_t zfs_free_min_time_ms = 1000;
 
 /* minimum milliseconds to resilver per txg */
 static uint_t zfs_resilver_min_time_ms = 3000;
 
 static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
 int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
 static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 /* max number of blocks to free in a single TXG */
 static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
 /* max number of dedup blocks to free in a single TXG */
 static uint64_t zfs_max_async_dedup_frees = 100000;
 
 /* set to disable resilver deferring */
 static int zfs_resilver_disable_defer = B_FALSE;
 
 /* Don't defer a resilver if the one in progress only got this far: */
 static uint_t zfs_resilver_defer_percent = 10;
 
 /*
  * We wait a few txgs after importing a pool to begin scanning so that
  * the import / mounting code isn't held up by scrub / resilver IO.
  * Unfortunately, it is a bit difficult to determine exactly how long
  * this will take since userspace will trigger fs mounts asynchronously
  * and the kernel will create zvol minors asynchronously. As a result,
  * the value provided here is a bit arbitrary, but represents a
  * reasonable estimate of how many txgs it will take to finish fully
  * importing a pool.
  */
 #define	SCAN_IMPORT_WAIT_TXGS 		5
 
 /* True if the scan function of 'scn' is either a scrub or a resilver. */
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
 
 /* True if the scan function of 'scn' is a scrub. */
 #define	DSL_SCAN_IS_SCRUB(scn)		\
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB)
 
 /*
  * Enable/disable the processing of the free_bpobj object.
  */
 static int zfs_free_bpobj_enabled = 1;
 
 /* Error blocks to be scrubbed in one txg. */
 static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
 
 /*
  * Callback table indexed by scan function; the order has to match
  * pool_scan_type.  Slot 0 has no callback.
  */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
 	NULL,
 	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
 	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
 };
 
 /* In core node for the scn->scn_queue. Represents a dataset to be scanned */
 typedef struct {
 	uint64_t	sds_dsobj;	/* dataset object number */
 	uint64_t	sds_txg;	/* txg stored alongside the dataset */
 	avl_node_t	sds_node;	/* link into scn->scn_queue */
 } scan_ds_t;
 
 /*
  * This controls what conditions are placed on dsl_scan_sync_state():
  * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
  * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
  * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
  *	write out the scn_phys_cached version.
  * See dsl_scan_sync_state for details.
  */
 typedef enum {
 	SYNC_OPTIONAL,	/* write scn_phys only if no queues are pending */
 	SYNC_MANDATORY,	/* always write scn_phys */
 	SYNC_CACHED	/* fall back to scn_phys_cached if queues pending */
 } state_sync_type_t;
 
 /*
  * This struct represents the minimum information needed to reconstruct a
  * zio for sequential scanning. This is useful because many of these will
  * accumulate in the sequential IO queues before being issued, so saving
  * memory matters here.
  */
 typedef struct scan_io {
 	/* fields from blkptr_t */
 	uint64_t		sio_blk_prop;
 	uint64_t		sio_phys_birth;
 	uint64_t		sio_birth;
 	zio_cksum_t		sio_cksum;
 	/* number of entries in sio_dva[]; also selects the sio_cache slot */
 	uint32_t		sio_nr_dvas;
 
 	/* fields from zio_t */
 	uint32_t		sio_flags;
 	zbookmark_phys_t	sio_zb;
 
 	/* members for queue sorting */
 	union {
 		avl_node_t	sio_addr_node; /* link into issuing queue */
 		list_node_t	sio_list_node; /* link for issuing to disk */
 	} sio_nodes;
 
 	/*
 	 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
 	 * depending on how many were in the original bp. Only the
 	 * first DVA is really used for sorting and issuing purposes.
 	 * The other DVAs (if provided) simply exist so that the zio
 	 * layer can find additional copies to repair from in the
 	 * event of an error. This array must go at the end of the
 	 * struct to allow this for the variable number of elements.
 	 */
 	dva_t			sio_dva[];
 } scan_io_t;
 
 /*
  * Accessors for the first DVA of a scan_io_t (sio_dva[0]).
  * SIO_GET_END_OFFSET is the offset plus the allocated size of that DVA.
  */
 #define	SIO_SET_OFFSET(sio, x)		DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
 #define	SIO_SET_ASIZE(sio, x)		DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
 #define	SIO_GET_OFFSET(sio)		DVA_GET_OFFSET(&(sio)->sio_dva[0])
 #define	SIO_GET_ASIZE(sio)		DVA_GET_ASIZE(&(sio)->sio_dva[0])
 #define	SIO_GET_END_OFFSET(sio)		\
 	(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
 /* Bytes used by 'sio': the fixed header plus its sio_nr_dvas DVAs. */
 #define	SIO_GET_MUSED(sio)		\
 	(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
 
 /* Scan I/O queue state kept for one top-level vdev (see q_vd). */
 struct dsl_scan_io_queue {
 	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
 	vdev_t		*q_vd; /* top-level vdev that this queue represents */
 	zio_t		*q_zio; /* scn_zio_root child for waiting on IO */
 
 	/* trees used for sorting I/Os and extents of I/Os */
 	range_tree_t	*q_exts_by_addr;
 	zfs_btree_t	q_exts_by_size;
 	avl_tree_t	q_sios_by_addr;
 	uint64_t	q_sio_memused;
 	uint64_t	q_last_ext_addr;
 
 	/* members for zio rate limiting */
 	uint64_t	q_maxinflight_bytes;
 	uint64_t	q_inflight_bytes;
 	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
 
 	/* per txg statistics */
 	uint64_t	q_total_seg_size_this_txg;
 	uint64_t	q_segs_this_txg;
 	uint64_t	q_total_zio_size_this_txg;
 	uint64_t	q_zios_this_txg;
 };
 
 /* private data for dsl_scan_prefetch_cb() */
 typedef struct scan_prefetch_ctx {
 	zfs_refcount_t spc_refcnt;	/* refcount for memory management */
 	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
 	boolean_t spc_root;		/* is this prefetch for an objset? */
 	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
 	uint16_t spc_datablkszsec;	/* dn_datablkszsec of current dnode */
 } scan_prefetch_ctx_t;
 
 /*
  * private data for dsl_scan_prefetch(); one of these is queued on
  * scn->scn_prefetch_queue per block pointer to prefetch
  */
 typedef struct scan_prefetch_issue_ctx {
 	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
 	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
 	blkptr_t spic_bp;		/* bp to prefetch */
 	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
 } scan_prefetch_issue_ctx_t;
 
 /* Prototypes for file-local helpers that are used before their definition. */
 static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
 static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
     scan_io_t *sio);
 
 static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
 static void scan_io_queues_destroy(dsl_scan_t *scn);
 
 /*
  * One kmem cache per possible DVA count: sio_cache[n - 1] serves scan_io_t
  * objects that carry n trailing DVAs.
  */
 static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
 
 /*
  * Return 'sio' to the kmem cache it was allocated from.  The cache is
  * chosen by sio->sio_nr_dvas, which must therefore be set and lie in the
  * range [1, SPA_DVAS_PER_BP].
  */
 static void
 sio_free(scan_io_t *sio)
 {
 	uint32_t ndvas = sio->sio_nr_dvas;
 
 	ASSERT3U(ndvas, >, 0);
 	ASSERT3U(ndvas, <=, SPA_DVAS_PER_BP);
 
 	kmem_cache_free(sio_cache[ndvas - 1], sio);
 }
 
 /* It is up to the caller to set sio->sio_nr_dvas for freeing */
 static scan_io_t *
 sio_alloc(unsigned short nr_dvas)
 {
 	ASSERT3U(nr_dvas, >, 0);
 	ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
 }
 
 void
 scan_init(void)
 {
 	/*
 	 * This is used in ext_size_compare() to weight segments
 	 * based on how sparse they are. This cannot be changed
 	 * mid-scan and the tree comparison functions don't currently
 	 * have a mechanism for passing additional context to the
 	 * compare functions. Thus we store this value globally and
 	 * we only allow it to be set at module initialization time
 	 */
 	fill_weight = zfs_scan_fill_weight;
 
 	for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
 		char name[36];
 
 		(void) snprintf(name, sizeof (name), "sio_cache_%d", i);
 		sio_cache[i] = kmem_cache_create(name,
 		    (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
 		    0, NULL, NULL, NULL, NULL, NULL, 0);
 	}
 }
 
 /*
  * Destroy every scan_io_t kmem cache created by scan_init().
  */
 void
 scan_fini(void)
 {
 	int slot = 0;
 
 	while (slot < SPA_DVAS_PER_BP) {
 		kmem_cache_destroy(sio_cache[slot]);
 		slot++;
 	}
 }
 
 static inline boolean_t
 dsl_scan_is_running(const dsl_scan_t *scn)
 {
 	return (scn->scn_phys.scn_state == DSS_SCANNING);
 }
 
 /*
  * True when the pool's scan is running and its function is
  * POOL_SCAN_RESILVER.
  */
 boolean_t
 dsl_scan_resilvering(dsl_pool_t *dp)
 {
 	const dsl_scan_t *scn = dp->dp_scan;
 
 	if (!dsl_scan_is_running(scn))
 		return (B_FALSE);
 
 	return (scn->scn_phys.scn_func == POOL_SCAN_RESILVER);
 }
 
 /*
  * Rebuild a block pointer from 'sio'.
  *
  * '*bp' is zeroed first, then the properties, birth txgs, checksum and the
  * sio's DVAs (sio_nr_dvas of them, in sio order) are copied in.  blk_fill
  * is always set to 1.
  */
 static inline void
 sio2bp(const scan_io_t *sio, blkptr_t *bp)
 {
 	ASSERT3U(sio->sio_nr_dvas, >, 0);
 	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
 
 	memset(bp, 0, sizeof (*bp));
 
 	bp->blk_prop = sio->sio_blk_prop;
 	BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
 	BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
 	/* we always only work with data pointers */
 	bp->blk_fill = 1;
 	bp->blk_cksum = sio->sio_cksum;
 
 	memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t));
 }
 
 /*
  * Capture the parts of 'bp' needed to reissue it later into 'sio'.
  *
  * All DVAs of the bp are copied so that the self-healing code can fall
  * back to alternate copies if the first one is corrupted.  They are
  * rotated so that the DVA at index 'dva_i' of the bp lands in sio_dva[0],
  * since that is the primary one we want to issue.
  */
 static inline void
 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
 {
 	const uint32_t ndvas = BP_GET_NDVAS(bp);
 
 	sio->sio_nr_dvas = ndvas;
 	sio->sio_blk_prop = bp->blk_prop;
 	sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
 	sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
 	sio->sio_cksum = bp->blk_cksum;
 
 	for (uint32_t slot = 0; slot < ndvas; slot++)
 		sio->sio_dva[slot] = bp->blk_dva[(dva_i + slot) % ndvas];
 }
 
 /*
  * Allocate and initialize the in-core scan state (dp->dp_scan) for pool
  * 'dp', loading any persisted scan and error-scrub state from the MOS.
  *
  * 'txg' is recorded as the restart txg whenever the persisted state shows
  * that the scan has to be restarted from the beginning.
  *
  * Returns 0 on success (including the case where no scan state exists on
  * disk), or the error from a failed ZAP lookup.  EOVERFLOW is returned,
  * with spa_errata set, when the on-disk layout carries the signature of
  * #2094 and cannot be converted in place.  On an error return dp->dp_scan
  * stays allocated.
  */
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
 	int err;
 	dsl_scan_t *scn;
 	spa_t *spa = dp->dp_spa;
 	uint64_t f;
 
 	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
 	scn->scn_dp = dp;
 
 	/*
 	 * It's possible that we're resuming a scan after a reboot so
 	 * make sure that the scan_async_destroying flag is initialized
 	 * appropriately.
 	 */
 	ASSERT(!scn->scn_async_destroying);
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 
 	/*
 	 * Calculate the max number of in-flight bytes for pool-wide
 	 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
 	 * Limits for the issuing phase are done per top-level vdev and
 	 * are handled separately.
 	 */
 	scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
 	    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
 	mutex_init(&scn->scn_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
 	    sizeof (scan_prefetch_issue_ctx_t),
 	    offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
 
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    "scrub_func", sizeof (uint64_t), 1, &f);
 	if (err == 0) {
 		/*
 		 * There was an old-style scrub in progress.  Restart a
 		 * new-style scrub from the beginning.
 		 */
 		scn->scn_restart_txg = txg;
 		zfs_dbgmsg("old-style scrub was in progress for %s; "
 		    "restarting new-style scrub in txg %llu",
 		    spa->spa_name,
 		    (u_longlong_t)scn->scn_restart_txg);
 
 		/*
 		 * Load the queue obj from the old location so that it
 		 * can be freed by dsl_scan_done().
 		 */
 		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    "scrub_queue", sizeof (uint64_t), 1,
 		    &scn->scn_phys.scn_queue_obj);
 	} else {
 		/* Persisted error-scrub state is optional: ENOENT is fine. */
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
 		    ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
 
 		if (err != 0 && err != ENOENT)
 			return (err);
 
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys);
 
 		/*
 		 * Detect if the pool contains the signature of #2094.  If it
 		 * does properly update the scn->scn_phys structure and notify
 		 * the administrator by setting an errata for the pool.
 		 */
 		if (err == EOVERFLOW) {
 			uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
 			VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
 			VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
 			    (23 * sizeof (uint64_t)));
 
 			/* Retry, reading one extra trailing integer. */
 			err = zap_lookup(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
 			    sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
 			if (err == 0) {
 				uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
 
 				if (overflow & ~DSL_SCAN_FLAGS_MASK ||
 				    scn->scn_async_destroying) {
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
 					return (EOVERFLOW);
 				}
 
 				/* The extra integer holds the scan flags. */
 				memcpy(&scn->scn_phys, zaptmp,
 				    SCAN_PHYS_NUMINTS * sizeof (uint64_t));
 				scn->scn_phys.scn_flags = overflow;
 
 				/*
 				 * Flag the scrub errata only when the last
 				 * scan finished or was canceled.
 				 */
 				if (scn->scn_phys.scn_state == DSS_FINISHED ||
 				    scn->scn_phys.scn_state == DSS_CANCELED)
 					spa->spa_errata =
 					    ZPOOL_ERRATA_ZOL_2094_SCRUB;
 			}
 		}
 
 		if (err == ENOENT)
 			return (0);
 		else if (err)
 			return (err);
 
 		/*
 		 * We might be restarting after a reboot, so jump the issued
 		 * counter to how far we've scanned. We know we're consistent
 		 * up to here.
 		 */
 		scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
 		    scn->scn_phys.scn_skipped;
 
 		if (dsl_scan_is_running(scn) &&
 		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
 			/*
 			 * A new-type scrub was in progress on an old
 			 * pool, and the pool was accessed by old
 			 * software.  Restart from the beginning, since
 			 * the old software may have changed the pool in
 			 * the meantime.
 			 */
 			scn->scn_restart_txg = txg;
 			zfs_dbgmsg("new-style scrub for %s was modified "
 			    "by old software; restarting in txg %llu",
 			    spa->spa_name,
 			    (u_longlong_t)scn->scn_restart_txg);
 		} else if (dsl_scan_resilvering(dp)) {
 			/*
 			 * If a resilver is in progress and there are already
 			 * errors, restart it instead of finishing this scan and
 			 * then restarting it. If there haven't been any errors
 			 * then remember that the incore DTL is valid.
 			 */
 			if (scn->scn_phys.scn_errors > 0) {
 				scn->scn_restart_txg = txg;
 				zfs_dbgmsg("resilver can't excise DTL_MISSING "
 				    "when finished; restarting on %s in txg "
 				    "%llu",
 				    spa->spa_name,
 				    (u_longlong_t)scn->scn_restart_txg);
 			} else {
 				/* it's safe to excise DTL when finished */
 				spa->spa_scrub_started = B_TRUE;
 			}
 		}
 	}
 
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
 	/* reload the queue into the in-core state */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 
 		/* Each entry: name is the dataset object, value is its txg. */
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			scan_ds_queue_insert(scn,
 			    zfs_strtonum(za->za_name, NULL),
 			    za->za_first_integer);
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 	}
 
 	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
 
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	return (0);
 }
 
 /*
  * Tear down and free the in-core scan state of 'dp', if there is any:
  * destroy the taskq (when present), empty and destroy both queues, destroy
  * the queue lock, free the dsl_scan_t and clear dp->dp_scan.
  */
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 
 	if (scn == NULL)
 		return;
 
 	if (scn->scn_taskq != NULL)
 		taskq_destroy(scn->scn_taskq);
 
 	scan_ds_queue_clear(scn);
 	avl_destroy(&scn->scn_queue);
 	mutex_destroy(&scn->scn_queue_lock);
 	scan_ds_prefetch_queue_clear(scn);
 	avl_destroy(&scn->scn_prefetch_queue);
 
 	kmem_free(scn, sizeof (dsl_scan_t));
 	dp->dp_scan = NULL;
 }
 
 /*
  * Returns true when a restart has been requested (scn_restart_txg is
  * non-zero) and the requested txg is not later than the txg of tx.
  */
 static boolean_t
 dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	boolean_t requested = (scn->scn_restart_txg != 0);
 
 	return (requested && scn->scn_restart_txg <= tx->tx_txg);
 }
 
 /*
  * Returns true when either the pool's scan has a pending restart txg or the
  * SPA_ASYNC_RESILVER bit is set in the spa's async task mask.  The async
  * mask is only consulted when no restart txg is pending.
  */
 boolean_t
 dsl_scan_resilver_scheduled(dsl_pool_t *dp)
 {
 	boolean_t scheduled;
 
 	scheduled = (dp->dp_scan != NULL && dp->dp_scan->scn_restart_txg != 0);
 	if (!scheduled) {
 		scheduled =
 		    ((spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER) != 0);
 	}
 	return (scheduled);
 }
 
 /*
  * Returns true when the pool's scan state is DSS_SCANNING and the scan
  * function is POOL_SCAN_SCRUB.
  */
 boolean_t
 dsl_scan_scrubbing(const dsl_pool_t *dp)
 {
 	const dsl_scan_phys_t *phys = &dp->dp_scan->scn_phys;
 	boolean_t scanning = (phys->scn_state == DSS_SCANNING);
 	boolean_t is_scrub = (phys->scn_func == POOL_SCAN_SCRUB);
 
 	return (scanning && is_scrub);
 }
 
 /*
  * Returns true when the pool's error scrub state is DSS_ERRORSCRUBBING and
  * its function is POOL_SCAN_ERRORSCRUB.
  */
 boolean_t
 dsl_errorscrubbing(const dsl_pool_t *dp)
 {
 	const dsl_errorscrub_phys_t *ephys = &dp->dp_scan->errorscrub_phys;
 	boolean_t active = (ephys->dep_state == DSS_ERRORSCRUBBING);
 	boolean_t is_errorscrub = (ephys->dep_func == POOL_SCAN_ERRORSCRUB);
 
 	return (active && is_errorscrub);
 }
 
 /*
  * Returns true when an error scrub is in progress on scn's pool and its
  * dep_paused_flags field is non-zero.
  */
 boolean_t
 dsl_errorscrub_is_paused(const dsl_scan_t *scn)
 {
 	boolean_t active = dsl_errorscrubbing(scn->scn_dp);
 
 	return (active && scn->errorscrub_phys.dep_paused_flags != 0);
 }
 
 /*
  * Returns true when a scrub is in progress on scn's pool and the
  * DSF_SCRUB_PAUSED flag is set in scn_phys.scn_flags.
  */
 boolean_t
 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 {
 	boolean_t scrubbing = dsl_scan_scrubbing(scn->scn_dp);
 
 	return (scrubbing &&
 	    (scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) != 0);
 }
 
 /*
  * Serialize the in-core error scrub cursor into errorscrub_phys and write
  * the whole errorscrub_phys record to the DMU_POOL_ERRORSCRUB entry of the
  * pool directory object.
  */
 static void
 dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	uint64_t cursor = zap_cursor_serialize(&scn->errorscrub_cursor);
 
 	scn->errorscrub_phys.dep_cursor = cursor;
 
 	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRORSCRUB,
 	    sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS,
 	    &scn->errorscrub_phys, tx));
 }
 
 /*
  * Sync-task callback that starts an error scrub.  arg points to the
  * pool_scan_func_t to run.  Resets errorscrub_phys, positions the error
  * scrub cursor at the start of spa_errlog_last, dirties the vdev config,
  * posts ESC_ZFS_ERRORSCRUB_START, persists the new state and logs the
  * setup to pool history.
  */
 static void
 dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	pool_scan_func_t func = *(pool_scan_func_t *)arg;
 
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT(!dsl_errorscrubbing(dp));
 	ASSERT(func > POOL_SCAN_NONE && func < POOL_SCAN_FUNCS);
 
 	/* Start from a clean record, then fill in the fields for this run. */
 	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
 	scn->errorscrub_phys.dep_func = func;
 	scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING;
 	scn->errorscrub_phys.dep_start_time = gethrestime_sec();
 	scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa);
 	scn->errorscrub_phys.dep_examined = 0;
 	scn->errorscrub_phys.dep_errors = 0;
 	scn->errorscrub_phys.dep_cursor = 0;
 
 	/* dep_cursor is 0 here, so the cursor starts at the beginning. */
 	zap_cursor_init_serialized(&scn->errorscrub_cursor,
 	    spa->spa_meta_objset, spa->spa_errlog_last,
 	    scn->errorscrub_phys.dep_cursor);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
 
 	dsl_errorscrub_sync_state(scn, tx);
 
 	spa_history_log_internal(spa, "error scrub setup", tx,
 	    "func=%u mintxg=%u maxtxg=%llu",
 	    func, 0, (u_longlong_t)tx->tx_txg);
 }
 
 /*
  * Sync-task check callback for starting an error scrub.
  *
  * Returns EBUSY when a scan or an error scrub is already running,
  * ECANCELED when the last error log is empty (spa_get_last_errlog_size()
  * reports 0), and 0 otherwise.  Both error returns go through SET_ERROR()
  * so they are reported the same way as the other error returns in this
  * file.
  */
 static int
 dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
 	if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) {
 		return (SET_ERROR(EBUSY));
 	}
 
 	if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) {
 		return (SET_ERROR(ECANCELED));
 	}
 	return (0);
 }
 
 /*
  * Writes out a persistent dsl_scan_phys_t record to the pool directory.
  * Because we can be running in the block sorting algorithm, we do not always
  * want to write out the record, only when it is "safe" to do so. This safety
  * condition is achieved by making sure that the sorting queues are empty
  * (scn_queues_pending == 0). When this condition is not true, the sync'd state
  * is inconsistent with how much actual scanning progress has been made. The
  * kind of sync to be performed is specified by the sync_type argument. If the
  * sync is optional, we only sync if the queues are empty. If the sync is
  * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
  * third possible state is a "cached" sync. This is done in response to:
  * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
  *	destroyed, so we wouldn't be able to restart scanning from it.
  * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
  *	superseded by a newer snapshot.
  * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
  *	swapped with its clone.
  * In all cases, a cached sync simply rewrites the last record we've written,
  * just slightly modified. For the modifications that are performed to the
  * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
  * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
  */
 static void
 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	/* A mandatory sync requires that no queued I/O is pending. */
 	ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
 	if (scn->scn_queues_pending == 0) {
 		/*
 		 * Debug check that every top-level vdev's scan I/O queue is
 		 * empty.  The index is uint64_t, matching the other
 		 * vdev_children loops in this file, to avoid a
 		 * signed/unsigned comparison.
 		 */
 		for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children;
 		    i++) {
 			vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 			dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
 
 			if (q == NULL)
 				continue;
 
 			mutex_enter(&vd->vdev_scan_io_queue_lock);
 			ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
 			ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
 			    NULL);
 			ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
 			mutex_exit(&vd->vdev_scan_io_queue_lock);
 		}
 
 		/* Rewrite the on-disk dataset queue, then the phys record. */
 		if (scn->scn_phys.scn_queue_obj != 0)
 			scan_ds_queue_sync(scn, tx);
 		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys, tx));
 
 		/* Remember what was written for later SYNC_CACHED updates. */
 		memcpy(&scn->scn_phys_cached, &scn->scn_phys,
 		    sizeof (scn->scn_phys));
 
 		if (scn->scn_checkpointing)
 			zfs_dbgmsg("finish scan checkpoint for %s",
 			    spa->spa_name);
 
 		scn->scn_checkpointing = B_FALSE;
 		scn->scn_last_checkpoint = ddi_get_lbolt();
 	} else if (sync_type == SYNC_CACHED) {
 		/* Queues are busy: rewrite the last-written (cached) copy. */
 		VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
 		    &scn->scn_phys_cached, tx));
 	}
 }
 
 /*
  * Sync-task check callback for starting a scan.  Returns EBUSY when a scan
  * is already running, a rebuild is active on the root vdev, or an error
  * scrub is in progress; returns 0 otherwise.
  */
 int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	vdev_t *root = scn->scn_dp->dp_spa->spa_root_vdev;
 
 	(void) arg;
 
 	if (dsl_scan_is_running(scn))
 		return (SET_ERROR(EBUSY));
 	if (vdev_rebuild_active(root))
 		return (SET_ERROR(EBUSY));
 	if (dsl_errorscrubbing(scn->scn_dp))
 		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
 
 /*
  * Sync-task callback that initializes a new scan.
  *
  * arg points to a setup_sync_arg_t carrying the scan function (func) and
  * the requested txg range (txgstart, txgend; a txgend of 0 means "up to the
  * txg of tx").  The function resets scn_phys, erases the persisted error
  * scrub state, fills in the new scan parameters, creates the on-disk queue
  * ZAP object, writes the initial state with a mandatory sync, and logs the
  * setup to pool history.
  */
 void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
 	setup_sync_arg_t *setup_sync_arg = (setup_sync_arg_t *)arg;
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	dmu_object_type_t ot = 0;
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!dsl_scan_is_running(scn));
 	ASSERT3U(setup_sync_arg->func, >, POOL_SCAN_NONE);
 	ASSERT3U(setup_sync_arg->func, <, POOL_SCAN_FUNCS);
 	memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
 
 	/*
 	 * If we are starting a fresh scrub, we erase the error scrub
 	 * information from disk.
 	 */
 	memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
 	dsl_errorscrub_sync_state(scn, tx);
 
 	/* Record the parameters of the new scan. */
 	scn->scn_phys.scn_func = setup_sync_arg->func;
 	scn->scn_phys.scn_state = DSS_SCANNING;
 	scn->scn_phys.scn_min_txg = setup_sync_arg->txgstart;
 	if (setup_sync_arg->txgend == 0) {
 		scn->scn_phys.scn_max_txg = tx->tx_txg;
 	} else {
 		scn->scn_phys.scn_max_txg = setup_sync_arg->txgend;
 	}
 	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
 	scn->scn_phys.scn_start_time = gethrestime_sec();
 	scn->scn_phys.scn_errors = 0;
 	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
 
 	/* Reset the in-core bookkeeping and the scan statistics. */
 	scn->scn_issued_before_pass = 0;
 	scn->scn_restart_txg = 0;
 	scn->scn_done_txg = 0;
 	scn->scn_last_checkpoint = 0;
 	scn->scn_checkpointing = B_FALSE;
 	spa_scan_stat_init(spa);
 	vdev_scan_stat_init(spa->spa_root_vdev);
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
 
 		/* rewrite all disk labels */
 		vdev_config_dirty(spa->spa_root_vdev);
 
 		/*
 		 * vdev_resilver_needed() is handed pointers to the min/max
 		 * txg fields; its result selects which start event is posted.
 		 */
 		if (vdev_resilver_needed(spa->spa_root_vdev,
 		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
 			nvlist_t *aux = fnvlist_alloc();
 			fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 			    "healing");
 			spa_event_notify(spa, NULL, aux,
 			    ESC_ZFS_RESILVER_START);
 			nvlist_free(aux);
 		} else {
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
 		}
 
 		spa->spa_scrub_started = B_TRUE;
 		/*
 		 * If this is an incremental scrub, limit the DDT scrub phase
 		 * to just the auto-ditto class (for correctness); the rest
 		 * of the scrub should go faster using top-down pruning.
 		 */
 		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
 			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 
 		/*
 		 * When starting a resilver clear any existing rebuild state.
 		 * This is required to prevent stale rebuild status from
 		 * being reported when a rebuild is run, then a resilver and
 		 * finally a scrub.  In which case only the scrub status
 		 * should be reported by 'zpool status'.
 		 */
 		if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
 			vdev_t *rvd = spa->spa_root_vdev;
 			for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 				vdev_t *vd = rvd->vdev_child[i];
 				vdev_rebuild_clear_sync(
 				    (void *)(uintptr_t)vd->vdev_id, tx);
 			}
 		}
 	}
 
 	/* back to the generic stuff */
 
 	/*
 	 * Keep dp_blkstats allocated (with zab_type zeroed) only while
 	 * zfs_scan_blkstats is set; free it otherwise.
 	 */
 	if (zfs_scan_blkstats) {
 		if (dp->dp_blkstats == NULL) {
 			dp->dp_blkstats =
 			    vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
 		}
 		memset(&dp->dp_blkstats->zab_type, 0,
 		    sizeof (dp->dp_blkstats->zab_type));
 	} else {
 		if (dp->dp_blkstats) {
 			vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 			dp->dp_blkstats = NULL;
 		}
 	}
 
 	/*
 	 * Pools older than SPA_VERSION_DSL_SCRUB get a DMU_OT_ZAP_OTHER
 	 * queue object; otherwise ot stays 0 and DMU_OT_SCAN_QUEUE is used.
 	 */
 	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
 
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
 	/* Seed the cached copy used by SYNC_CACHED state writes. */
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys));
 
 	ddt_walk_init(spa, scn->scn_phys.scn_max_txg);
 
 	dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
 	spa_history_log_internal(spa, "scan setup", tx,
 	    "func=%u mintxg=%llu maxtxg=%llu",
 	    setup_sync_arg->func, (u_longlong_t)scn->scn_phys.scn_min_txg,
 	    (u_longlong_t)scn->scn_phys.scn_max_txg);
 }
 
 /*
  * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
  * error scrub or resilver. Can also be called to resume a paused scrub or
  * error scrub.
  *
  * A non-zero txgstart or txgend is only accepted for POOL_SCAN_SCRUB; any
  * other function fails with EINVAL.  When a paused scrub or error scrub is
  * successfully resumed, ECANCELED is returned instead of starting a new
  * scan.  All locally generated errors are reported through SET_ERROR().
  */
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func, uint64_t txgstart,
     uint64_t txgend)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 	setup_sync_arg_t setup_sync_arg;
 
 	if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Purge all vdev caches and probe all devices.  We do this here
 	 * rather than in sync context because this requires a writer lock
 	 * on the spa_config lock, which we can't do from sync context.  The
 	 * spa_scrub_reopen flag indicates that vdev_open() should not
 	 * attempt to start another scrub.
 	 */
 	spa_vdev_state_enter(spa, SCL_NONE);
 	spa->spa_scrub_reopen = B_TRUE;
 	vdev_reopen(spa->spa_root_vdev);
 	spa->spa_scrub_reopen = B_FALSE;
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
 	/* A resilver is only scheduled here; it is not set up directly. */
 	if (func == POOL_SCAN_RESILVER) {
 		dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
 		return (0);
 	}
 
 	if (func == POOL_SCAN_ERRORSCRUB) {
 		if (dsl_errorscrub_is_paused(dp->dp_scan)) {
 			/*
 			 * got error scrub start cmd, resume paused error scrub.
 			 */
 			int err = dsl_scrub_set_pause_resume(scn->scn_dp,
 			    POOL_SCRUB_NORMAL);
 			if (err == 0) {
 				spa_event_notify(spa, NULL, NULL,
 				    ESC_ZFS_ERRORSCRUB_RESUME);
 				return (SET_ERROR(ECANCELED));
 			}
 			return (SET_ERROR(err));
 		}
 
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
 		    &func, 0, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
 		/* got scrub start cmd, resume paused scrub */
 		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
 		    POOL_SCRUB_NORMAL);
 		if (err == 0) {
 			spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
 			return (SET_ERROR(ECANCELED));
 		}
 		return (SET_ERROR(err));
 	}
 
 	setup_sync_arg.func = func;
 	setup_sync_arg.txgstart = txgstart;
 	setup_sync_arg.txgend = txgend;
 
 	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
 	    dsl_scan_setup_sync, &setup_sync_arg, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Finish (complete == B_TRUE) or cancel (complete == B_FALSE) the current
  * error scrub: post the finish event on completion, log the outcome to pool
  * history, record the final state and end time, rotate the error log and
  * release the error scrub cursor.
  */
 static void
 dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	const char *outcome;
 
 	if (complete) {
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
 		outcome = "error scrub done";
 	} else {
 		outcome = "error scrub canceled";
 	}
 	spa_history_log_internal(spa, outcome, tx,
 	    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 
 	if (complete)
 		scn->errorscrub_phys.dep_state = DSS_FINISHED;
 	else
 		scn->errorscrub_phys.dep_state = DSS_CANCELED;
 
 	spa->spa_scrub_active = B_FALSE;
 	spa_errlog_rotate(spa);
 	scn->errorscrub_phys.dep_end_time = gethrestime_sec();
 	zap_cursor_fini(&scn->errorscrub_cursor);
 
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
 
 	ASSERT(!dsl_errorscrubbing(scn->scn_dp));
 }
 
 /*
  * Finish (complete == B_TRUE) or cancel (complete == B_FALSE) the current
  * scan.
  *
  * Always removes leftover old-style scrub entries from the pool directory,
  * frees the on-disk queue object, empties the in-core dataset queues and
  * clears the paused flag.  If a scan was actually running, it additionally
  * tears down the sorted-scan queues, records the final state, wakes
  * waiters, logs the outcome, and for scrub/resilver scans reassesses DTLs,
  * posts finish events and kicks off any follow-up async work.
  */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
 	/* Pool directory entries written by the old-style scrub code. */
 	static const char *old_names[] = {
 		"scrub_bookmark",
 		"scrub_ddt_bookmark",
 		"scrub_ddt_class_max",
 		"scrub_queue",
 		"scrub_min_txg",
 		"scrub_max_txg",
 		"scrub_func",
 		"scrub_errors",
 		NULL
 	};
 
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int i;
 
 	/* Remove any remnants of an old-style scrub. */
 	for (i = 0; old_names[i]; i++) {
 		(void) zap_remove(dp->dp_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
 	}
 
 	/* Release the on-disk queue object and the in-core queues. */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		VERIFY0(dmu_object_free(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, tx));
 		scn->scn_phys.scn_queue_obj = 0;
 	}
 	scan_ds_queue_clear(scn);
 	scan_ds_prefetch_queue_clear(scn);
 
 	scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 
 	/*
 	 * If we were "restarted" from a stopped state, don't bother
 	 * with anything else.
 	 */
 	if (!dsl_scan_is_running(scn)) {
 		ASSERT(!scn->scn_is_sorted);
 		return;
 	}
 
 	/* Tear down the sorted-scan I/O queues and their taskq. */
 	if (scn->scn_is_sorted) {
 		scan_io_queues_destroy(scn);
 		scn->scn_is_sorted = B_FALSE;
 
 		if (scn->scn_taskq != NULL) {
 			taskq_destroy(scn->scn_taskq);
 			scn->scn_taskq = NULL;
 		}
 	}
 
 	scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
 
 	spa_notify_waiters(spa);
 
 	/*
 	 * Log the outcome.  Only a completed scrub records its max txg as
 	 * the pool's last scrubbed txg.
 	 */
 	if (dsl_scan_restarting(scn, tx)) {
 		spa_history_log_internal(spa, "scan aborted, restarting", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	} else if (!complete) {
 		spa_history_log_internal(spa, "scan cancelled", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 	} else {
 		spa_history_log_internal(spa, "scan done", tx,
 		    "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 		if (DSL_SCAN_IS_SCRUB(scn)) {
 			VERIFY0(zap_update(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_LAST_SCRUBBED_TXG,
 			    sizeof (uint64_t), 1,
 			    &scn->scn_phys.scn_max_txg, tx));
 			spa->spa_scrubbed_last_txg = scn->scn_phys.scn_max_txg;
 		}
 	}
 
 	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
 		spa->spa_scrub_active = B_FALSE;
 
 		/*
 		 * If the scrub/resilver completed, update all DTLs to
 		 * reflect this.  Whether it succeeded or not, vacate
 		 * all temporary scrub DTLs.
 		 *
 		 * As the scrub does not currently support traversing
 		 * data that have been freed but are part of a checkpoint,
 		 * we don't mark the scrub as done in the DTLs as faults
 		 * may still exist in those vdevs.
 		 */
 		if (complete &&
 		    !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
 
 			/*
 			 * A non-zero scn_min_txg is reported as a finished
 			 * resilver; otherwise as a finished scrub.
 			 */
 			if (scn->scn_phys.scn_min_txg) {
 				nvlist_t *aux = fnvlist_alloc();
 				fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
 				    "healing");
 				spa_event_notify(spa, NULL, aux,
 				    ESC_ZFS_RESILVER_FINISH);
 				nvlist_free(aux);
 			} else {
 				spa_event_notify(spa, NULL, NULL,
 				    ESC_ZFS_SCRUB_FINISH);
 			}
 		} else {
 			vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
 			    0, B_TRUE, B_FALSE);
 		}
 		spa_errlog_rotate(spa);
 
 		/*
 		 * Don't clear flag until after vdev_dtl_reassess to ensure that
 		 * DTL_MISSING will get updated when possible.
 		 */
 		spa->spa_scrub_started = B_FALSE;
 
 		/*
 		 * We may have finished replacing a device.
 		 * Let the async thread assess this and handle the detach.
 		 */
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		/*
 		 * Clear any resilver_deferred flags in the config.
 		 * If there are drives that need resilvering, kick
 		 * off an asynchronous request to start resilver.
 		 * vdev_clear_resilver_deferred() may update the config
 		 * before the resilver can restart. In the event of
 		 * a crash during this period, the spa loading code
 		 * will find the drives that need to be resilvered
 		 * and start the resilver then.
 		 */
 		if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
 		    vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
 			spa_history_log_internal(spa,
 			    "starting deferred resilver", tx, "errors=%llu",
 			    (u_longlong_t)spa_approx_errlog_size(spa));
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 		}
 
 		/* Clear recent error events (i.e. duplicate events tracking) */
 		if (complete)
 			zfs_ereport_clear(spa, NULL);
 	}
 
 	scn->scn_phys.scn_end_time = gethrestime_sec();
 
 	/* Reset the ZOL_2094 scrub erratum if it was set on this pool. */
 	if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
 		spa->spa_errata = 0;
 
 	ASSERT(!dsl_scan_is_running(scn));
 }
 
 /*
  * Sync-task check callback for pausing/resuming an error scrub.  arg points
  * to the requested pool_scrub_cmd_t.
  *
  * Returns ENOENT when asked to pause while no error scrub is in progress,
  * EBUSY when the error scrub is already paused, ENOTSUP for any command
  * other than POOL_SCRUB_PAUSE or POOL_SCRUB_NORMAL, and 0 otherwise.
  */
 static int
 dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_scan_t *scn = dp->dp_scan;
 	pool_scrub_cmd_t cmd = *(pool_scrub_cmd_t *)arg;
 
 	switch (cmd) {
 	case POOL_SCRUB_PAUSE:
 		/*
 		 * can't pause an error scrub when there is no in-progress
 		 * error scrub.
 		 */
 		if (!dsl_errorscrubbing(dp))
 			return (SET_ERROR(ENOENT));
 
 		/* can't pause a paused error scrub */
 		if (dsl_errorscrub_is_paused(scn))
 			return (SET_ERROR(EBUSY));
 		return (0);
 	case POOL_SCRUB_NORMAL:
 		return (0);
 	default:
 		return (SET_ERROR(ENOTSUP));
 	}
 }
 
 /*
  * Sync-task callback that pauses or resumes an error scrub.  arg points to
  * the pool_scrub_cmd_t.  On pause it records the pause time, sets the
  * paused flag, persists the state and posts ESC_ZFS_ERRORSCRUB_PAUSED.  On
  * resume (only if currently paused) it accounts the paused time, clears the
  * flag, re-initializes the cursor from the saved position and persists the
  * state.
  */
 static void
 dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	pool_scrub_cmd_t *cmdp = arg;
 
 	if (*cmdp == POOL_SCRUB_PAUSE) {
 		spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
 		scn->errorscrub_phys.dep_paused_flags = B_TRUE;
 		dsl_errorscrub_sync_state(scn, tx);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
 		return;
 	}
 
 	ASSERT3U(*cmdp, ==, POOL_SCRUB_NORMAL);
 	if (!dsl_errorscrub_is_paused(scn))
 		return;
 
 	/*
 	 * Accumulate the time spent paused in this pass so that the error
 	 * scrub rate shown by 'zpool status' can be adjusted for it.
 	 */
 	spa->spa_scan_pass_errorscrub_spent_paused +=
 	    gethrestime_sec() - spa->spa_scan_pass_errorscrub_pause;
 
 	spa->spa_scan_pass_errorscrub_pause = 0;
 	scn->errorscrub_phys.dep_paused_flags = B_FALSE;
 
 	/* Pick the walk back up at the serialized cursor position. */
 	zap_cursor_init_serialized(&scn->errorscrub_cursor,
 	    spa->spa_meta_objset, spa->spa_errlog_last,
 	    scn->errorscrub_phys.dep_cursor);
 
 	dsl_errorscrub_sync_state(scn, tx);
 }
 
 /*
  * Sync-task check callback for canceling an error scrub.  Returns ENOENT
  * when no error scrub is in progress, 0 otherwise.
  */
 static int
 dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	int err = 0;
 
 	(void) arg;
 
 	/* can't cancel an error scrub when there is none in progress */
 	if (!dsl_errorscrubbing(scn->scn_dp))
 		err = SET_ERROR(ENOENT);
 
 	return (err);
 }
 
 /*
  * Sync-task callback that cancels the running error scrub: marks it done
  * as not complete, persists the resulting state and posts
  * ESC_ZFS_ERRORSCRUB_ABORT.
  */
 static void
 dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scan = dmu_tx_pool(tx)->dp_scan;
 
 	(void) arg;
 
 	dsl_errorscrub_done(scan, B_FALSE, tx);
 	dsl_errorscrub_sync_state(scan, tx);
 
 	spa_event_notify(scan->scn_dp->dp_spa, NULL, NULL,
 	    ESC_ZFS_ERRORSCRUB_ABORT);
 }
 
 /*
  * Sync-task check callback for canceling a scan.  Returns ENOENT when no
  * scan is running, 0 otherwise.
  */
 static int
 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 	int err = 0;
 
 	(void) arg;
 
 	if (!dsl_scan_is_running(scn))
 		err = SET_ERROR(ENOENT);
 
 	return (err);
 }
 
 /*
  * Sync-task callback that cancels the running scan: marks it done as not
  * complete, writes the state with a mandatory sync and posts
  * ESC_ZFS_SCRUB_ABORT.
  */
 static void
 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_scan_t *scan = dmu_tx_pool(tx)->dp_scan;
 
 	(void) arg;
 
 	dsl_scan_done(scan, B_FALSE, tx);
 	dsl_scan_sync_state(scan, tx, SYNC_MANDATORY);
 
 	spa_event_notify(scan->scn_dp->dp_spa, NULL, NULL,
 	    ESC_ZFS_SCRUB_ABORT);
 }
 
 /*
  * Cancel whatever is running on the pool: the error scrub if one is in
  * progress, otherwise the regular scan.  Returns the result of the
  * corresponding sync task.
  */
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
 	if (!dsl_errorscrubbing(dp)) {
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_scan_cancel_check, dsl_scan_cancel_sync,
 		    NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (dsl_sync_task(spa_name(dp->dp_spa),
 	    dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
 	    NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Sync-task check callback for pausing/resuming a scrub.  arg points to the
  * requested pool_scrub_cmd_t.
  *
  * Returns ENOENT when asked to pause while no scrub is in progress, EBUSY
  * when the scrub is already paused, ENOTSUP for any command other than
  * POOL_SCRUB_PAUSE or POOL_SCRUB_NORMAL, and 0 otherwise.
  */
 static int
 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_scan_t *scn = dp->dp_scan;
 	pool_scrub_cmd_t cmd = *(pool_scrub_cmd_t *)arg;
 
 	switch (cmd) {
 	case POOL_SCRUB_PAUSE:
 		/* can't pause a scrub when there is no in-progress scrub */
 		if (!dsl_scan_scrubbing(dp))
 			return (SET_ERROR(ENOENT));
 
 		/* can't pause a paused scrub */
 		if (dsl_scan_is_paused_scrub(scn))
 			return (SET_ERROR(EBUSY));
 		return (0);
 	case POOL_SCRUB_NORMAL:
 		return (0);
 	default:
 		return (SET_ERROR(ENOTSUP));
 	}
 }
 
 /*
  * Sync-task callback that pauses or resumes a scrub.  arg points to the
  * pool_scrub_cmd_t.  The DSF_SCRUB_PAUSED flag is changed in both scn_phys
  * and scn_phys_cached, and the state is written with a SYNC_CACHED sync.
  */
 static void
 dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	pool_scrub_cmd_t *cmdp = arg;
 
 	if (*cmdp == POOL_SCRUB_PAUSE) {
 		/* Record when the pause began and flag the scrub as paused. */
 		spa->spa_scan_pass_scrub_pause = gethrestime_sec();
 		scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
 		scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
 		dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
 		spa_notify_waiters(spa);
 		return;
 	}
 
 	ASSERT3U(*cmdp, ==, POOL_SCRUB_NORMAL);
 	if (!dsl_scan_is_paused_scrub(scn))
 		return;
 
 	/*
 	 * Accumulate the time spent paused in this pass so that the scrub
 	 * rate shown by 'zpool status' can be adjusted for it.
 	 */
 	spa->spa_scan_pass_scrub_spent_paused +=
 	    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
 	spa->spa_scan_pass_scrub_pause = 0;
 
 	scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 	scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * Pause or resume (per cmd) whichever is active on the pool: the error
  * scrub when one is in progress, otherwise the regular scrub.  The matching
  * check callback decides whether the request makes sense; its error, or the
  * sync task's result, is returned.
  */
 int
 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
 {
 	if (!dsl_errorscrubbing(dp)) {
 		return (dsl_sync_task(spa_name(dp->dp_spa),
 		    dsl_scrub_pause_resume_check,
 		    dsl_scrub_pause_resume_sync, &cmd, 3,
 		    ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (dsl_sync_task(spa_name(dp->dp_spa),
 	    dsl_errorscrub_pause_resume_check,
 	    dsl_errorscrub_pause_resume_sync, &cmd, 3,
 	    ZFS_SPACE_CHECK_RESERVED));
 }
 
 
 /*
  * Start a new scan, or restart an existing one, by recording the txg at
  * which the restart should happen in scn_restart_txg.
  *
  * When txg is 0, a transaction is assigned (waiting if necessary) and its
  * txg is used; otherwise the caller-supplied txg is stored as is.
  */
 void
 dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
 {
 	if (txg == 0) {
 		dmu_tx_t *tx;
 		tx = dmu_tx_create_dd(dp->dp_mos_dir);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 		txg = dmu_tx_get_txg(tx);
 		dp->dp_scan->scn_restart_txg = txg;
 		dmu_tx_commit(tx);
 	} else {
 		dp->dp_scan->scn_restart_txg = txg;
 	}
 	/* %llu takes an unsigned argument, so cast to u_longlong_t. */
 	zfs_dbgmsg("restarting resilver for %s at txg=%llu",
 	    dp->dp_spa->spa_name, (u_longlong_t)txg);
 }
 
 /*
  * Free the block described by bp in the given txg by handing it to
  * zio_free() on the pool's spa.
  */
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
 	zio_free(dp->dp_spa, txg, bp);
 }
 
 /*
  * Issue a free of the block described by bpp as a child of pio, inheriting
  * pio's I/O flags, without waiting for it.  Asserts that the caller is in
  * the pool's sync context.
  */
 void
 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
 {
 	zio_t *free_zio;
 
 	ASSERT(dsl_pool_sync_context(dp));
 
 	free_zio = zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags);
 	zio_nowait(free_zio);
 }
 
 /*
  * Comparator for scan_ds_t nodes: orders by sds_dsobj and returns -1, 0 or
  * 1.
  */
 static int
 scan_ds_queue_compare(const void *a, const void *b)
 {
 	const scan_ds_t *lhs = a;
 	const scan_ds_t *rhs = b;
 
 	if (lhs->sds_dsobj == rhs->sds_dsobj)
 		return (0);
 
 	return (lhs->sds_dsobj < rhs->sds_dsobj ? -1 : 1);
 }
 
 /*
  * Remove and free every node of scn->scn_queue.  The tree itself is not
  * destroyed here.
  */
 static void
 scan_ds_queue_clear(dsl_scan_t *scn)
 {
 	void *cookie = NULL;
 	scan_ds_t *node;
 
 	for (;;) {
 		node = avl_destroy_nodes(&scn->scn_queue, &cookie);
 		if (node == NULL)
 			break;
 		kmem_free(node, sizeof (scan_ds_t));
 	}
 }
 
 /*
  * Look up dsobj in scn->scn_queue.  Returns whether it is present; when it
  * is and txg is not NULL, *txg receives the entry's sds_txg.
  */
 static boolean_t
 scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
 {
 	scan_ds_t key;
 	scan_ds_t *found;
 	boolean_t present;
 
 	/* Only the key field is consulted by the comparator. */
 	key.sds_dsobj = dsobj;
 	found = avl_find(&scn->scn_queue, &key, NULL);
 
 	present = (found != NULL);
 	if (present && txg != NULL)
 		*txg = found->sds_txg;
 
 	return (present);
 }
 
 /*
  * Add a (dsobj, txg) entry to scn->scn_queue.  The entry must not already
  * exist; this is enforced with VERIFY3P.
  */
 static void
 scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
 {
 	avl_index_t where;
 	scan_ds_t *entry = kmem_zalloc(sizeof (scan_ds_t), KM_SLEEP);
 
 	entry->sds_txg = txg;
 	entry->sds_dsobj = dsobj;
 
 	/* The lookup also yields the insertion point for avl_insert(). */
 	VERIFY3P(avl_find(&scn->scn_queue, entry, &where), ==, NULL);
 	avl_insert(&scn->scn_queue, entry, where);
 }
 
 /*
  * Remove the entry for dsobj from scn->scn_queue and free it.  The entry
  * must exist; this is enforced with VERIFY.
  */
 static void
 scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
 {
 	scan_ds_t key;
 	scan_ds_t *victim;
 
 	key.sds_dsobj = dsobj;
 	victim = avl_find(&scn->scn_queue, &key, NULL);
 	VERIFY(victim != NULL);
 
 	avl_remove(&scn->scn_queue, victim);
 	kmem_free(victim, sizeof (scan_ds_t));
 }
 
 /*
  * Rewrite the on-disk dataset queue from the in-core one: free the current
  * queue object, create a fresh ZAP object in its place and add one
  * (dsobj -> txg) entry per node of scn->scn_queue.
  */
 static void
 scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	dmu_object_type_t ot;
 	scan_ds_t *sds;
 
 	/* Older pool versions use the generic ZAP object type. */
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_SCAN_QUEUE;
 	else
 		ot = DMU_OT_ZAP_OTHER;
 
 	ASSERT0(scn->scn_queues_pending);
 	ASSERT(scn->scn_phys.scn_queue_obj != 0);
 
 	VERIFY0(dmu_object_free(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, tx));
 	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
 	    DMU_OT_NONE, 0, tx);
 
 	sds = avl_first(&scn->scn_queue);
 	while (sds != NULL) {
 		VERIFY0(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
 		    sds->sds_txg, tx));
 		sds = AVL_NEXT(&scn->scn_queue, sds);
 	}
 }
 
 /*
  * Computes the memory limit state that we're currently in. A sorted scan
  * needs quite a bit of memory to hold the sorting queue, so we need to
  * reasonably constrain the size so it doesn't impact overall system
  * performance. We compute two limits:
  * 1) Hard memory limit: if the amount of memory used by the sorting
  *	queues on a pool gets above this value, we stop the metadata
  *	scanning portion and start issuing the queued up and sorted
  *	I/Os to reduce memory usage.
  *	This limit is calculated as a fraction of physmem (by default 5%).
  *	We constrain the lower bound of the hard limit to an absolute
  *	minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
  *	the upper bound to 5% of the total pool size - no chance we'll
  *	ever need that much memory, but just to keep the value in check.
  * 2) Soft memory limit: once we hit the hard memory limit, we start
  *	issuing I/O to reduce queue memory usage, but we don't want to
  *	completely empty out the queues, since we might be able to find I/Os
  *	that will fill in the gaps of our non-sequential IOs at some point
  *	in the future. So we stop the issuing of I/Os once the amount of
  *	memory used drops below the soft limit (at which point we stop issuing
  *	I/O and start scanning metadata again).
  *
  *	This limit is calculated by subtracting a fraction of the hard
  *	limit from the hard limit. By default this fraction is 5%, so
  *	the soft limit is 95% of the hard limit. We cap the size of the
  *	difference between the hard and soft limits at an absolute
  *	maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
  *	sufficient to not cause too frequent switching between the
  *	metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
  *	worth of queues is about 1.2 GiB of on-pool data, so scanning
  *	that should take at least a decent fraction of a second).
  */
 static boolean_t
 dsl_scan_should_clear(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t alloc, mlim_hard, mlim_soft, mused;
 
 	/* Space allocated across the normal, special and dedup classes. */
 	alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	alloc += metaslab_class_get_alloc(spa_special_class(spa));
 	alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
 
 	/*
 	 * Hard limit: a fraction of physical memory, at least
 	 * zfs_scan_mem_lim_min, but never more than 1/20 of the allocated
 	 * space.  Soft limit: the hard limit minus a fraction of it, with
 	 * the gap capped at zfs_scan_mem_lim_soft_max.
 	 */
 	mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
 	    zfs_scan_mem_lim_min);
 	mlim_hard = MIN(mlim_hard, alloc / 20);
 	mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
 	    zfs_scan_mem_lim_soft_max);
 
 	/* Sum the estimated queue memory of every top-level vdev. */
 	mused = 0;
 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
 		vdev_t *tvd = rvd->vdev_child[i];
 		dsl_scan_io_queue_t *queue;
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		queue = tvd->vdev_scan_io_queue;
 		if (queue != NULL) {
 			/*
 			 * # of extents in exts_by_addr = # in exts_by_size.
 			 * B-tree efficiency is ~75%, but can be as low as 50%.
 			 */
 			mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
 			    ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
 			    3 / 2) + queue->q_sio_memused;
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 
 	/* %llu takes an unsigned argument, so cast to u_longlong_t. */
 	dprintf("current scan memory usage: %llu bytes\n",
 	    (u_longlong_t)mused);
 
 	if (mused == 0)
 		ASSERT0(scn->scn_queues_pending);
 
 	/*
 	 * If we are above our hard limit, we need to clear out memory.
 	 * If we are below our soft limit, we need to accumulate sequential IOs.
 	 * Otherwise, we should keep doing whatever we are currently doing.
 	 */
 	if (mused >= mlim_hard)
 		return (B_TRUE);
 	else if (mused < mlim_soft)
 		return (B_FALSE);
 	else
 		return (scn->scn_clearing);
 }
 
 /*
  * Decide whether the scan should suspend at this point.  zb is the bookmark
  * being visited, or NULL when called without one (the DDT bookmark is then
  * the one reported in the debug message).
  *
  * Returns B_TRUE when the scan is already suspending or one of the suspend
  * conditions below holds; in the latter case the resume bookmark is
  * recorded in scn_phys.scn_bookmark (when zb is given) and scn_suspending
  * is set.  Returns B_FALSE otherwise.
  */
 static boolean_t
 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/* we never skip user/group accounting objects */
 	if (zb && (int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	if (scn->scn_suspending)
 		return (B_TRUE); /* we're already suspending */
 
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 and objset blocks. */
 	if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL))
 		return (B_FALSE);
 
 	/*
 	 * We suspend if:
 	 *  - we have scanned for at least the minimum time (default 1 sec
 	 *    for scrub, 3 sec for resilver), and either we have sufficient
 	 *    dirty data that we are starting to write more quickly
 	 *    (default 30%), someone is explicitly waiting for this txg
 	 *    to complete, or we have used up all of the time in the txg
 	 *    timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 *  or
 	 *  - the scan queue has reached its memory use limit
 	 *  or
 	 *  - ddt_walk_ready() reports that the DDT walk is not ready
 	 */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	uint64_t dirty_min_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
 	    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
 
 	if ((NSEC2MSEC(scan_time_ns) > mintime &&
 	    (scn->scn_dp->dp_dirty_total >= dirty_min_bytes ||
 	    txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa) ||
 	    (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) ||
 	    !ddt_walk_ready(scn->scn_dp->dp_spa)) {
 		/*
 		 * %llx takes an unsigned argument, so the values below are
 		 * cast to u_longlong_t.
 		 */
 		if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
 			dprintf("suspending at first available bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (u_longlong_t)zb->zb_objset,
 			    (u_longlong_t)zb->zb_object,
 			    (u_longlong_t)zb->zb_level,
 			    (u_longlong_t)zb->zb_blkid);
 			/* Resume from the start of this objset. */
 			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
 			    zb->zb_objset, 0, 0, 0);
 		} else if (zb != NULL) {
 			dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
 			    (u_longlong_t)zb->zb_objset,
 			    (u_longlong_t)zb->zb_object,
 			    (u_longlong_t)zb->zb_level,
 			    (u_longlong_t)zb->zb_blkid);
 			scn->scn_phys.scn_bookmark = *zb;
 		} else {
 #ifdef ZFS_DEBUG
 			dsl_scan_phys_t *scnp = &scn->scn_phys;
 			dprintf("suspending at DDT bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (u_longlong_t)scnp->scn_ddt_bookmark.ddb_class,
 			    (u_longlong_t)scnp->scn_ddt_bookmark.ddb_type,
 			    (u_longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
 			    (u_longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 #endif
 		}
 		scn->scn_suspending = B_TRUE;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Decide whether the error scrub should stop doing work in the current txg.
  *
  * zb, when non-NULL, is only used to report the current position in the
  * debug message; this function does not record it anywhere.
  *
  * Returns B_TRUE if the caller should suspend, B_FALSE to keep going.
  */
 static boolean_t
 dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 {
 	/*
 	 * We suspend if:
 	 *  - we have scrubbed for at least the minimum time (default 1 sec
 	 *    for error scrub), and either someone is explicitly waiting for
 	 *    this txg to complete, or we have used up all of the time in the
 	 *    txg timeout (default 5 sec).
 	 *  or
 	 *  - the spa is shutting down because this pool is being exported
 	 *    or the machine is rebooting.
 	 */
 	uint64_t curr_time_ns = gethrtime();
 	uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time;
 	uint64_t sync_time_ns = curr_time_ns -
 	    scn->scn_dp->dp_spa->spa_sync_starttime;
 	/*
 	 * Held as uint_t, the type dsl_scan_check_suspend() uses for the
 	 * same tunable.  A signed copy would be sign-extended in the 64-bit
 	 * unsigned comparison below if the value ever exceeded INT_MAX.
 	 */
 	uint_t mintime = zfs_scrub_min_time_ms;
 
 	if ((NSEC2MSEC(error_scrub_time_ns) > mintime &&
 	    (txg_sync_waiting(scn->scn_dp) ||
 	    NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
 	    spa_shutting_down(scn->scn_dp->dp_spa)) {
 		if (zb) {
 			dprintf("error scrub suspending at bookmark "
 			    "%llx/%llx/%llx/%llx\n",
 			    (longlong_t)zb->zb_objset,
 			    (longlong_t)zb->zb_object,
 			    (longlong_t)zb->zb_level,
 			    (longlong_t)zb->zb_blkid);
 		}
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Argument bundle that dsl_scan_zil() hands to its zil_parse() callbacks,
  * dsl_scan_zil_block() and dsl_scan_zil_record().
  *
  * dsl_scan_zil() fills this in with a positional initializer, so the order
  * of the members matters.
  */
 typedef struct zil_scan_arg {
 	dsl_pool_t	*zsa_dp;	/* pool passed to dsl_scan_zil() */
 	zil_header_t	*zsa_zh;	/* ZIL header passed to dsl_scan_zil() */
 } zil_scan_arg_t;
 
 /*
  * Block callback given to zil_parse() by dsl_scan_zil(): pass one log block
  * to the scan function selected by scn_phys.scn_func.
  *
  * arg is the zil_scan_arg_t built by dsl_scan_zil().  Holes and blocks born
  * at or before scn_cur_min_txg are skipped, as are blocks of an unclaimed
  * log that were born at or after spa_min_claim_txg().  Always returns 0.
  */
 static int
 dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
     uint64_t claim_txg)
 {
 	zil_scan_arg_t *scan_arg = arg;
 	dsl_pool_t *dp = scan_arg->zsa_dp;
 	dsl_scan_t *scn = dp->dp_scan;
 	zil_header_t *log_hdr = scan_arg->zsa_zh;
 	zbookmark_phys_t zb;
 
 	(void) zilog;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * A single block ("stubby") may have been allocated long ago.  It
 	 * is on disk even though it was never claimed, so it is still
 	 * visited (although a scrub has nothing to do to it); younger
 	 * unclaimed blocks are not.
 	 */
 	if (claim_txg == 0) {
 		if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
 			return (0);
 	}
 
 	SET_BOOKMARK(&zb, log_hdr->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	return (0);
 }
 
 /*
  * Record callback given to zil_parse() by dsl_scan_zil(): for TX_WRITE
  * records, pass the block pointer embedded in the record to the scan
  * function selected by scn_phys.scn_func.
  *
  * arg is the zil_scan_arg_t built by dsl_scan_zil().  Always returns 0.
  */
 static int
 dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
     uint64_t claim_txg)
 {
 	zil_scan_arg_t *scan_arg = arg;
 	const lr_write_t *lr;
 	const blkptr_t *bp;
 	dsl_pool_t *dp;
 	dsl_scan_t *scn;
 	zbookmark_phys_t zb;
 
 	(void) zilog;
 
 	/* Only TX_WRITE records are examined. */
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	lr = (const lr_write_t *)lrc;
 	bp = &lr->lr_blkptr;
 	dp = scan_arg->zsa_dp;
 	scn = dp->dp_scan;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 	if (BP_IS_HOLE(bp))
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return (0);
 
 	/*
 	 * The birth txg can be below claim_txg when this record's txg has
 	 * already been synced while the log block still holds other
 	 * records that have not.
 	 */
 	if (claim_txg == 0)
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 		return (0);
 
 	ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 	SET_BOOKMARK(&zb,
 	    scan_arg->zsa_zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    lr->lr_foid, ZB_ZIL_LEVEL,
 	    lr->lr_offset / BP_GET_LSIZE(bp));
 
 	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 	return (0);
 }
 
 /*
  * Walk the intent log described by zh, feeding its blocks and records to
  * dsl_scan_zil_block() and dsl_scan_zil_record() through zil_parse().
  */
 static void
 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
 {
 	zil_scan_arg_t scan_arg = { .zsa_dp = dp, .zsa_zh = zh };
 	const uint64_t claim_txg = zh->zh_claim_txg;
 	zilog_t *zilog;
 
 	ASSERT(spa_writeable(dp->dp_spa));
 
 	/*
 	 * Only blocks that have been claimed but not yet replayed are of
 	 * interest (or, in read-only mode, blocks that *would* be claimed),
 	 * so a log with a zero claim txg is not parsed at all.
 	 */
 	if (claim_txg != 0) {
 		zilog = zil_alloc(dp->dp_meta_objset, zh);
 
 		(void) zil_parse(zilog, dsl_scan_zil_block,
 		    dsl_scan_zil_record, &scan_arg, claim_txg, B_FALSE);
 
 		zil_free(zilog);
 	}
 }
 
 /*
  * Comparator for scan_prefetch_issue_ctx_t entries.  Entries are ordered by
  * their bookmarks, so that the AVL tree is sorted in the order in which the
  * blocks will be needed.
  */
 static int
 scan_prefetch_queue_compare(const void *a, const void *b)
 {
 	const scan_prefetch_issue_ctx_t *lhs = a;
 	const scan_prefetch_issue_ctx_t *rhs = b;
 
 	return (zbookmark_compare(
 	    lhs->spic_spc->spc_datablkszsec, lhs->spic_spc->spc_indblkshift,
 	    rhs->spic_spc->spc_datablkszsec, rhs->spic_spc->spc_indblkshift,
 	    &lhs->spic_zb, &rhs->spic_zb));
 }
 
 /*
  * Drop the reference that tag holds on a prefetch context.  When the
  * reference count reaches zero the context is destroyed and freed.
  */
 static void
 scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag)
 {
 	if (zfs_refcount_remove(&spc->spc_refcnt, tag) != 0)
 		return;
 
 	/* That was the last reference. */
 	zfs_refcount_destroy(&spc->spc_refcnt);
 	kmem_free(spc, sizeof (*spc));
 }
 
 /*
  * Allocate a prefetch context for scn holding one reference owned by tag.
  *
  * With a dnode, the context records that dnode's dn_datablkszsec and
  * dn_indblkshift.  With dnp == NULL both are zero and the context is marked
  * as a root context.
  */
 static scan_prefetch_ctx_t *
 scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag)
 {
 	const boolean_t is_root = (dnp == NULL) ? B_TRUE : B_FALSE;
 	scan_prefetch_ctx_t *ctx;
 
 	ctx = kmem_alloc(sizeof (*ctx), KM_SLEEP);
 	zfs_refcount_create(&ctx->spc_refcnt);
 	zfs_refcount_add(&ctx->spc_refcnt, tag);
 
 	ctx->spc_scn = scn;
 	ctx->spc_datablkszsec = is_root ? 0 : dnp->dn_datablkszsec;
 	ctx->spc_indblkshift = is_root ? 0 : dnp->dn_indblkshift;
 	ctx->spc_root = is_root;
 
 	return (ctx);
 }
 
 /*
  * Take an additional reference on a prefetch context on behalf of tag.
  * The matching release is scan_prefetch_ctx_rele().
  */
 static void
 scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag)
 {
 	(void) zfs_refcount_add(&spc->spc_refcnt, tag);
 }
 
 /*
  * Empty the scan's prefetch queue under spa_scrub_lock, dropping the
  * context reference each queued entry holds and freeing the entry.
  */
 static void
 scan_ds_prefetch_queue_clear(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *entry = NULL;
 	void *cookie = NULL;
 
 	mutex_enter(&spa->spa_scrub_lock);
 	for (;;) {
 		entry = avl_destroy_nodes(&scn->scn_prefetch_queue, &cookie);
 		if (entry == NULL)
 			break;
 
 		scan_prefetch_ctx_rele(entry->spic_spc, scn);
 		kmem_free(entry, sizeof (*entry));
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Decide whether a prefetch for bookmark zb should be skipped, judged
  * against the scan's prefetch bookmark.
  *
  * Returns B_TRUE (skip) when zb is in a different objset than the prefetch
  * bookmark, or when zbookmark_subtree_completed() reports the subtree at zb
  * as already done.  Objects with a negative object number are never
  * skipped.
  */
 static boolean_t
 dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *resume_zb = &spc->spc_scn->scn_prefetch_bookmark;
 	dnode_phys_t shape;
 	dnode_phys_t *dnp;
 
 	if (zb->zb_objset != resume_zb->zb_objset)
 		return (B_TRUE);
 	if ((int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	/*
 	 * Only these two dnode fields are filled in; a root context passes
 	 * no dnode at all.
 	 */
 	shape.dn_datablkszsec = spc->spc_datablkszsec;
 	shape.dn_indblkshift = spc->spc_indblkshift;
 	dnp = spc->spc_root ? NULL : &shape;
 
 	return (zbookmark_subtree_completed(dnp, zb, resume_zb) ?
 	    B_TRUE : B_FALSE);
 }
 
 /*
  * Queue block bp (at bookmark zb) for prefetching by the prefetch thread.
  *
  * Nothing is queued when prefetching is disabled, when the block is
  * redacted, a hole, or born at or before scn_cur_min_txg, when it is a
  * level-0 block that is neither a dnode nor an objset block, or when
  * dsl_scan_check_prefetch_resume() says it can be skipped.  Each queued
  * entry holds its own reference on spc, tagged with the scan.
  */
 static void
 dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = spc->spc_scn;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *entry;
 	avl_index_t where;
 
 	if (zfs_no_scrub_prefetch)
 		return;
 	if (BP_IS_REDACTED(bp))
 		return;
 
 	if (BP_IS_HOLE(bp))
 		return;
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
 		return;
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 	    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 		return;
 
 	if (dsl_scan_check_prefetch_resume(spc, zb))
 		return;
 
 	scan_prefetch_ctx_add_ref(spc, scn);
 	entry = kmem_alloc(sizeof (*entry), KM_SLEEP);
 	entry->spic_spc = spc;
 	entry->spic_bp = *bp;
 	entry->spic_zb = *zb;
 
 	/*
 	 * Put the IO on the queue of blocks to prefetch, which lets the
 	 * blocks the main traversal thread needs first be prioritized.
 	 */
 	mutex_enter(&spa->spa_scrub_lock);
 	if (avl_find(&scn->scn_prefetch_queue, entry, &where) == NULL) {
 		avl_insert(&scn->scn_prefetch_queue, entry, where);
 		cv_broadcast(&spa->spa_scrub_io_cv);
 	} else {
 		/* already queued: undo the allocation and the reference */
 		kmem_free(entry, sizeof (*entry));
 		scan_prefetch_ctx_rele(spc, scn);
 	}
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Queue prefetches for every block pointer held directly in dnode dnp: the
  * dn_nblkptr regular block pointers and, when DNODE_FLAG_SPILL_BLKPTR is
  * set, the spill block pointer.  objset and object form the bookmark
  * position of the dnode.
  */
 static void
 dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	const boolean_t has_spill =
 	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? B_TRUE : B_FALSE;
 	scan_prefetch_ctx_t *spc;
 	zbookmark_phys_t zb;
 
 	/* Nothing to prefetch for a dnode without any block pointer. */
 	if (dnp->dn_nblkptr == 0 && !has_spill)
 		return;
 
 	SET_BOOKMARK(&zb, objset, object, 0, 0);
 
 	/* Temporary reference; queued entries take their own. */
 	spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
 
 	for (int slot = 0; slot < dnp->dn_nblkptr; slot++) {
 		blkptr_t *child = &dnp->dn_blkptr[slot];
 
 		zb.zb_level = BP_GET_LEVEL(child);
 		zb.zb_blkid = slot;
 		dsl_scan_prefetch(spc, child, &zb);
 	}
 
 	if (has_spill) {
 		zb.zb_level = 0;
 		zb.zb_blkid = DMU_SPILL_BLKID;
 		dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
 	}
 
 	scan_prefetch_ctx_rele(spc, FTAG);
 }
 
 /*
  * Completion callback for the prefetch reads issued by
  * dsl_scan_prefetch_thread() (it is the callback passed to arc_read() there).
  *
  * It gives back the in-flight bytes accounted for this block and, unless
  * the read produced no buffer or prefetching is being stopped, queues
  * prefetches for what the block refers to: the child block pointers of an
  * indirect block, the dnodes of a dnode block, or the dnodes embedded in an
  * objset block.
  *
  * private is the scan_prefetch_ctx_t the read was issued with.  The
  * reference held on it under the scan's tag is dropped before returning.
  */
 static void
 dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *private)
 {
 	(void) zio;
 	scan_prefetch_ctx_t *spc = private;
 	dsl_scan_t *scn = spc->spc_scn;
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	/* broadcast that the IO has completed for rate limiting purposes */
 	mutex_enter(&spa->spa_scrub_lock);
 	ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
 	spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 
 	/* if there was an error or we are done prefetching, just cleanup */
 	if (buf == NULL || scn->scn_prefetch_stop)
 		goto out;
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* Indirect block: its contents are an array of block pointers. */
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		zbookmark_phys_t czb;
 
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			/* child i sits one level down, at blkid * epb + i */
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1, zb->zb_blkid * epb + i);
 			dsl_scan_prefetch(spc, cbp, &czb);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* Dnode block: its contents are an array of dnodes. */
 		dnode_phys_t *cdnp;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		/* each step advances by dn_extra_slots + 1 entries */
 		for (i = 0, cdnp = buf->b_data; i < epb;
 		    i += cdnp->dn_extra_slots + 1,
 		    cdnp += cdnp->dn_extra_slots + 1) {
 			dsl_scan_prefetch_dnode(scn, cdnp,
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* Objset block: prefetch from the dnodes embedded in it. */
 		objset_phys_t *osp = buf->b_data;
 
 		dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
 		    zb->zb_objset, DMU_META_DNODE_OBJECT);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
 				dsl_scan_prefetch_dnode(scn,
 				    &osp->os_projectused_dnode, zb->zb_objset,
 				    DMU_PROJECTUSED_OBJECT);
 			}
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_groupused_dnode, zb->zb_objset,
 			    DMU_GROUPUSED_OBJECT);
 			dsl_scan_prefetch_dnode(scn,
 			    &osp->os_userused_dnode, zb->zb_objset,
 			    DMU_USERUSED_OBJECT);
 		}
 	}
 
 out:
 	/* release the buffer (if any), then this read's context reference */
 	if (buf != NULL)
 		arc_buf_destroy(buf, private);
 	scan_prefetch_ctx_rele(spc, scn);
 }
 
 /*
  * Body of the scan prefetch thread; arg is the dsl_scan_t.
  *
  * Until scn_prefetch_stop is set, repeatedly take the first entry of
  * scn_prefetch_queue and issue an asynchronous arc_read() for it, with
  * dsl_scan_prefetch_cb() as the completion callback.  The thread waits on
  * spa_scrub_io_cv while the queue is empty or spa_scrub_inflight has
  * reached scn_maxinflight_bytes.  On the way out, entries still on the
  * queue are released and freed.
  */
 static void
 dsl_scan_prefetch_thread(void *arg)
 {
 	dsl_scan_t *scn = arg;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	scan_prefetch_issue_ctx_t *spic;
 
 	/* loop until we are told to stop */
 	while (!scn->scn_prefetch_stop) {
 		arc_flags_t flags = ARC_FLAG_NOWAIT |
 		    ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
 		int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 
 		mutex_enter(&spa->spa_scrub_lock);
 
 		/*
 		 * Wait until we have an IO to issue and are not above our
 		 * maximum in flight limit.
 		 */
 		while (!scn->scn_prefetch_stop &&
 		    (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
 		    spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		}
 
 		/* recheck if we should stop since we waited for the cv */
 		if (scn->scn_prefetch_stop) {
 			mutex_exit(&spa->spa_scrub_lock);
 			break;
 		}
 
 		/* remove the prefetch IO from the tree */
 		spic = avl_first(&scn->scn_prefetch_queue);
 		/* charged here; dsl_scan_prefetch_cb() subtracts it again */
 		spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
 		avl_remove(&scn->scn_prefetch_queue, spic);
 
 		mutex_exit(&spa->spa_scrub_lock);
 
 		if (BP_IS_PROTECTED(&spic->spic_bp)) {
 			ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
 			    BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
 			ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		/* We don't need data L1 buffer since we do not prefetch L0. */
 		blkptr_t *bp = &spic->spic_bp;
 		if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
 		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
 			flags |= ARC_FLAG_NO_BUF;
 
 		/* issue the prefetch asynchronously */
 		(void) arc_read(scn->scn_zio_root, spa, bp,
 		    dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
 		    zio_flags, &flags, &spic->spic_zb);
 
 		/*
 		 * Only the queue entry is freed here.  Its context reference
 		 * is not dropped: the context went to arc_read() as the
 		 * callback argument and dsl_scan_prefetch_cb() releases it.
 		 */
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 
 	ASSERT(scn->scn_prefetch_stop);
 
 	/* free any prefetches we didn't get to complete */
 	mutex_enter(&spa->spa_scrub_lock);
 	while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
 		avl_remove(&scn->scn_prefetch_queue, spic);
 		scan_prefetch_ctx_rele(spic->spic_spc, scn);
 		kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
 	}
 	ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * While the scan is resuming from a saved bookmark, decide whether the
  * block at zb can be skipped.
  *
  * Returns B_TRUE when zbookmark_subtree_completed() reports that zb and
  * everything below it were already visited.  Otherwise returns B_FALSE,
  * and clears the saved bookmark once the resume point has been reached or
  * passed.
  */
 static boolean_t
 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *resume_zb = &scn->scn_phys.scn_bookmark;
 
 	/* Not resuming: nothing to skip. */
 	if (ZB_IS_ZERO(resume_zb))
 		return (B_FALSE);
 
 	/* User/group accounting objects (obj < 0) are never skipped. */
 	if ((int64_t)zb->zb_object < 0)
 		return (B_FALSE);
 
 	/*
 	 * This bp and everything below it was already visited in a prior
 	 * txg sync, so there is no need to do it again.
 	 */
 	if (zbookmark_subtree_completed(dnp, zb, resume_zb))
 		return (B_TRUE);
 
 	/*
 	 * Once the block we are resuming from has been found or passed,
 	 * zero the bookmark to signal that checking for suspension may
 	 * start again.
 	 */
 	if (zbookmark_subtree_tbd(dnp, zb, resume_zb)) {
 		dprintf("resuming at %llx/%llx/%llx/%llx\n",
 		    (longlong_t)zb->zb_objset,
 		    (longlong_t)zb->zb_object,
 		    (longlong_t)zb->zb_level,
 		    (longlong_t)zb->zb_blkid);
 		memset(resume_zb, 0, sizeof (*zb));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Forward declarations.  dsl_scan_recurse() below calls both of these
  * functions, and both of them lead back into dsl_scan_recurse().
  */
 static void dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx);
 inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
     dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Visit everything referenced by the block bp points to.
  *
  * Indirect, dnode and objset blocks are read synchronously and their
  * children visited.  Any other block is not read here; its block pointer
  * is only sanity checked.
  *
  * Returns 0 on success.  Returns nonzero if the read failed, or if the
  * dnode or the block pointer failed validation; scn_phys.scn_errors is
  * incremented in each of those cases.
  */
 inline __attribute__((always_inline)) static int
 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
     dnode_phys_t *dnp, const blkptr_t *bp,
     const zbookmark_phys_t *zb, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	spa_t *spa = dp->dp_spa;
 	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
 	int err;
 
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * There is an unlikely case of encountering dnodes with contradicting
 	 * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag in files created
 	 * or modified before commit 4254acb was merged. As it is not possible
 	 * to know which of the two is correct, report an error.
 	 */
 	if (dnp != NULL &&
 	    dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
 		scn->scn_phys.scn_errors++;
 		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* Indirect block: visit each child block pointer. */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
 			zbookmark_phys_t czb;
 
 			/* child i sits one level down, at blkid * epb + i */
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			dsl_scan_visitbp(cbp, &czb, dnp,
 			    ds, scn, ostype, tx);
 		}
 		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* Dnode block: visit each dnode stored in it. */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		dnode_phys_t *cdnp;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 		arc_buf_t *buf;
 
 		/* protected dnode blocks are read with ZIO_FLAG_RAW */
 		if (BP_IS_PROTECTED(bp)) {
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 		/* each step advances by dn_extra_slots + 1 entries */
 		for (i = 0, cdnp = buf->b_data; i < epb;
 		    i += cdnp->dn_extra_slots + 1,
 		    cdnp += cdnp->dn_extra_slots + 1) {
 			dsl_scan_visitdnode(scn, ds, ostype,
 			    cdnp, zb->zb_blkid * epb + i, tx);
 		}
 
 		arc_buf_destroy(buf, &buf);
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* Objset block: visit the dnodes embedded in it. */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
 		if (err) {
 			scn->scn_phys.scn_errors++;
 			return (err);
 		}
 
 		osp = buf->b_data;
 
 		/* from here on the objset's own os_type is used as ostype */
 		dsl_scan_visitdnode(scn, ds, osp->os_type,
 		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
 
 		if (OBJSET_BUF_HAS_USERUSED(buf)) {
 			/*
 			 * We also always visit user/group/project accounting
 			 * objects, and never skip them, even if we are
 			 * suspending. This is necessary so that the
 			 * space deltas from this txg get integrated.
 			 */
 			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
 				dsl_scan_visitdnode(scn, ds, osp->os_type,
 				    &osp->os_projectused_dnode,
 				    DMU_PROJECTUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_groupused_dnode,
 			    DMU_GROUPUSED_OBJECT, tx);
 			dsl_scan_visitdnode(scn, ds, osp->os_type,
 			    &osp->os_userused_dnode,
 			    DMU_USERUSED_OBJECT, tx);
 		}
 		arc_buf_destroy(buf, &buf);
 	} else if (!zfs_blkptr_verify(spa, bp,
 	    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		/*
 		 * Sanity check the block pointer contents, this is handled
 		 * by arc_read() for the cases above.
 		 */
 		scn->scn_phys.scn_errors++;
 		spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
 		return (SET_ERROR(EINVAL));
 	}
 
 	return (0);
 }
 
 /*
  * Visit every block pointer held directly in dnode dnp: each of its
  * dn_nblkptr block pointers (bookmarked at level dn_nlevels - 1) and, when
  * DNODE_FLAG_SPILL_BLKPTR is set, its spill block pointer.
  *
  * The bookmark objset is ds->ds_object, or 0 when ds is NULL.
  */
 inline __attribute__((always_inline)) static void
 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
     dmu_objset_type_t ostype, dnode_phys_t *dnp,
     uint64_t object, dmu_tx_t *tx)
 {
 	const uint64_t objset = (ds != NULL) ? ds->ds_object : 0;
 	zbookmark_phys_t czb;
 
 	for (int j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		dsl_scan_visitbp(&dnp->dn_blkptr[j],
 		    &czb, dnp, ds, scn, ostype, tx);
 	}
 
 	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
 		return;
 
 	SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
 	dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
 	    &czb, dnp, ds, scn, ostype, tx);
 }
 
 /*
  * Visit one block pointer: check for suspend/resume, recurse into what it
  * references via dsl_scan_recurse(), and finally hand the block itself to
  * the scan function selected by scn_phys.scn_func.
  *
  * The arguments are in this order because mdb can only print the
  * first 5; we want them to be useful.
  */
 static void
 dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
     dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
     dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 
 	/* stop here if the scan is (or now becomes) suspended */
 	if (dsl_scan_check_suspend(scn, zb))
 		return;
 
 	/* skip subtrees that were finished before the scan was resumed */
 	if (dsl_scan_check_resume(scn, dnp, zb))
 		return;
 
 	/* counts every bp that got past the suspend/resume checks */
 	scn->scn_visited_this_txg++;
 
 	if (BP_IS_HOLE(bp)) {
 		scn->scn_holes_this_txg++;
 		return;
 	}
 
 	if (BP_IS_REDACTED(bp)) {
 		ASSERT(dsl_dataset_feature_is_active(ds,
 		    SPA_FEATURE_REDACTED_DATASETS));
 		return;
 	}
 
 	/*
 	 * Check if this block contradicts any filesystem flags.
 	 */
 	spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS;
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
 	if (f != SPA_FEATURE_NONE)
 		ASSERT(dsl_dataset_feature_is_active(ds, f));
 
 	/* born at or before the current minimum txg: count it and stop */
 	if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
 		scn->scn_lt_min_this_txg++;
 		return;
 	}
 
 	/* on a recursion error the scan function is not called for this bp */
 	if (dsl_scan_recurse(scn, ds, ostype, dnp, bp, zb, tx) != 0)
 		return;
 
 	/*
 	 * If dsl_scan_ddt() has already visited this block, it will have
 	 * already done any translations or scrubbing, so don't call the
 	 * callback again.
 	 */
 	if (ddt_class_contains(dp->dp_spa,
 	    scn->scn_phys.scn_ddt_class_max, bp)) {
 		scn->scn_ddt_contained_this_txg++;
 		return;
 	}
 
 	/*
 	 * If this block is from the future (after cur_max_txg), then we
 	 * are doing this on behalf of a deleted snapshot, and we will
 	 * revisit the future block on the next pass of this dataset.
 	 * Don't scan it now unless we need to because something
 	 * under it was modified.
 	 */
 	if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
 		scn->scn_gt_max_this_txg++;
 		return;
 	}
 
 	/* the return value of the scan function is not checked here */
 	scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
 }
 
 /*
  * Start (or continue) traversing an objset from its root block pointer bp.
  *
  * The root bookmark uses ds->ds_object as the objset, or DMU_META_OBJSET
  * when ds is NULL.  The prefetch bookmark is primed first: from the saved
  * scan bookmark if there is one, otherwise at the start of this objset.
  */
 static void
 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
 	scan_prefetch_ctx_t *root_spc;
 	zbookmark_phys_t zb;
 
 	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
 		/* resuming: prefetch picks up from the saved position */
 		scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
 	} else {
 		SET_BOOKMARK(&scn->scn_prefetch_bookmark,
 		    zb.zb_objset, 0, 0, 0);
 	}
 
 	scn->scn_objsets_visited_this_txg++;
 
 	/* Queue the root block for prefetch before walking it. */
 	root_spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
 	dsl_scan_prefetch(root_spc, bp, &zb);
 	scan_prefetch_ctx_rele(root_spc, FTAG);
 
 	dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
 
 	dprintf_ds(ds, "finished scan%s", "");
 }
 
 /*
  * Fix up the bookmark in scn_phys when the dataset it points at is being
  * destroyed.  Nothing is done if the bookmark refers to another dataset.
  *
  * For a snapshot the bookmark is moved to the next snapshot and
  * DSF_VISIT_DS_AGAIN is set.  For any other dataset the bookmark is reset
  * to ZB_DESTROYED_OBJSET.
  */
 static void
 ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
 {
 	zbookmark_phys_t *bookmark = &scn_phys->scn_bookmark;
 
 	if (bookmark->zb_objset != ds->ds_object)
 		return;
 
 	if (!ds->ds_is_snapshot) {
 		SET_BOOKMARK(bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0);
 		zfs_dbgmsg("destroying ds %llu on %s; currently "
 		    "traversing; reset bookmark to -1,0,0,0",
 		    (u_longlong_t)ds->ds_object,
 		    ds->ds_dir->dd_pool->dp_spa->spa_name);
 		return;
 	}
 
 	/*
 	 * Note:
 	 *  - scn_cur_{min,max}_txg stays the same.
 	 *  - Setting the flag is not really necessary if
 	 *    scn_cur_max_txg == scn_max_txg, because there is nothing
 	 *    after this snapshot that we care about.  It is set anyway
 	 *    and then ignored when the dataset is retraversed in
 	 *    dsl_scan_visitds().
 	 */
 	bookmark->zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj;
 	zfs_dbgmsg("destroying ds %llu on %s; currently "
 	    "traversing; reset zb_objset to %llu",
 	    (u_longlong_t)ds->ds_object,
 	    ds->ds_dir->dd_pool->dp_spa->spa_name,
 	    (u_longlong_t)dsl_dataset_phys(ds)->ds_next_snap_obj);
 	scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
 }
 
 /*
  * Invoked when a dataset is destroyed. We need to make sure that:
  *
  * 1) If it is the dataset that was currently being scanned, we update
  *	the dsl_scan_phys_t (both the live and the cached copy) and mark
  *	the objset reference in it as destroyed.
  * 2) It is removed from the work queue (both the in-memory queue and the
  *	on-disk queue object), if it was present.
  *
  * If the dataset was actually a snapshot, instead of marking the dataset
  * as destroyed, we instead substitute the next snapshot in line.
  *
  * Does nothing if no scan is running.
  */
 void
 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	/* fix up the bookmark in both copies of the scan state */
 	ds_destroyed_scn_phys(ds, &scn->scn_phys);
 	ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
 
 	/* in-memory queue: a snapshot is replaced by its next snapshot */
 	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
 		scan_ds_queue_remove(scn, ds->ds_object);
 		if (ds->ds_is_snapshot)
 			scan_ds_queue_insert(scn,
 			    dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
 	}
 
 	/* on-disk queue: the same replacement, applied to the ZAP object */
 	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
 	    ds->ds_object, &mintxg) == 0) {
 		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		if (ds->ds_is_snapshot) {
 			/*
 			 * We keep the same mintxg; it could be >
 			 * ds_creation_txg if the previous snapshot was
 			 * deleted too.
 			 */
 			VERIFY(zap_add_int_key(dp->dp_meta_objset,
 			    scn->scn_phys.scn_queue_obj,
 			    dsl_dataset_phys(ds)->ds_next_snap_obj,
 			    mintxg, tx) == 0);
 			zfs_dbgmsg("destroying ds %llu on %s; in queue; "
 			    "replacing with %llu",
 			    (u_longlong_t)ds->ds_object,
 			    dp->dp_spa->spa_name,
 			    (u_longlong_t)dsl_dataset_phys(ds)->
 			    ds_next_snap_obj);
 		} else {
 			zfs_dbgmsg("destroying ds %llu on %s; in queue; "
 			    "removing",
 			    (u_longlong_t)ds->ds_object,
 			    dp->dp_spa->spa_name);
 		}
 	}
 
 	/*
 	 * dsl_scan_sync() should be called after this, and should sync
 	 * out our changed state, but just to be safe, do it here.
 	 */
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * If scn_bookmark points at ds, repoint it at ds's previous snapshot
  * (ds_prev_snap_obj) and log the change.  Otherwise do nothing.
  */
 static void
 ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
 {
 	uint64_t prev_obj;
 
 	if (scn_bookmark->zb_objset != ds->ds_object)
 		return;
 
 	prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	scn_bookmark->zb_objset = prev_obj;
 	zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; "
 	    "reset zb_objset to %llu",
 	    (u_longlong_t)ds->ds_object,
 	    ds->ds_dir->dd_pool->dp_spa->spa_name,
 	    (u_longlong_t)prev_obj);
 }
 
 /*
  * Called when a dataset is snapshotted.  Wherever the running scan refers
  * to ds (the scan bookmark, the in-memory work queue, the on-disk work
  * queue), that reference is switched to ds's ds_prev_snap_obj, i.e. the
  * newly created snapshot.  Does nothing if no scan is running.
  */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg;
 	int lookup_err;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
 	/* Retarget both the live and the cached copy of the bookmark. */
 	ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
 	ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
 
 	/* In-memory work queue. */
 	if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
 		uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 
 		scan_ds_queue_remove(scn, ds->ds_object);
 		scan_ds_queue_insert(scn, prev_obj, mintxg);
 	}
 
 	/* On-disk work queue. */
 	lookup_err = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg);
 	if (lookup_err == 0) {
 		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
 		VERIFY(zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj,
 		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
 		zfs_dbgmsg("snapshotting ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
 	}
 
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * If scn_bookmark points at one of the two swapped datasets, repoint it at
  * the other one and log the change.  Otherwise do nothing.  ds1 is checked
  * first.
  */
 static void
 ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
     zbookmark_phys_t *scn_bookmark)
 {
 	dsl_dataset_t *from;
 	dsl_dataset_t *to;
 
 	if (scn_bookmark->zb_objset == ds1->ds_object) {
 		from = ds1;
 		to = ds2;
 	} else if (scn_bookmark->zb_objset == ds2->ds_object) {
 		from = ds2;
 		to = ds1;
 	} else {
 		return;
 	}
 
 	scn_bookmark->zb_objset = to->ds_object;
 	zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; "
 	    "reset zb_objset to %llu",
 	    (u_longlong_t)from->ds_object,
 	    from->ds_dir->dd_pool->dp_spa->spa_name,
 	    (u_longlong_t)to->ds_object);
 }
 
 /*
  * Called when an origin dataset and its clone are swapped.  If we were
  * currently traversing the dataset, we need to switch to traversing the
  * newly promoted clone.
  *
  * The swap is applied in three places: the scan bookmark (live and cached
  * copies), the in-memory work queue, and the on-disk work queue.  Does
  * nothing if no scan is running.
  */
 void
 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	uint64_t mintxg1, mintxg2;
 	boolean_t ds1_queued, ds2_queued;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
 	ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
 
 	/*
 	 * Handle the in-memory scan queue.
 	 */
 	ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
 	ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
 
 	/* Sanity checking. */
 	if (ds1_queued) {
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 	if (ds2_queued) {
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 
 	if (ds1_queued && ds2_queued) {
 		/*
 		 * If both are queued, we don't need to do anything.
 		 * The swapping code below would not handle this case correctly,
 		 * since we can't insert ds2 if it is already there. That's
 		 * because scan_ds_queue_insert() prohibits a duplicate insert
 		 * and panics.
 		 */
 	} else if (ds1_queued) {
 		/* only ds1 is queued: queue ds2 in its place, same mintxg */
 		scan_ds_queue_remove(scn, ds1->ds_object);
 		scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
 	} else if (ds2_queued) {
 		/* only ds2 is queued: queue ds1 in its place, same mintxg */
 		scan_ds_queue_remove(scn, ds2->ds_object);
 		scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
 	}
 
 	/*
 	 * Handle the on-disk scan queue.
 	 * The on-disk state is an out-of-date version of the in-memory state,
 	 * so the in-memory and on-disk values for ds1_queued and ds2_queued may
 	 * be different. Therefore we need to apply the swap logic to the
 	 * on-disk state independently of the in-memory state.
 	 */
 	ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
 	ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
 	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
 
 	/* Sanity checking. */
 	if (ds1_queued) {
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 	if (ds2_queued) {
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
 		ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
 	}
 
 	if (ds1_queued && ds2_queued) {
 		/*
 		 * If both are queued, we don't need to do anything.
 		 * Alternatively, we could check for EEXIST from
 		 * zap_add_int_key() and back out to the original state, but
 		 * that would be more work than checking for this case upfront.
 		 */
 	} else if (ds1_queued) {
 		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
 		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
 		zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds1->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)ds2->ds_object);
 	} else if (ds2_queued) {
 		VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
 		VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
 		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
 		zfs_dbgmsg("clone_swap ds %llu on %s; in queue; "
 		    "replacing with %llu",
 		    (u_longlong_t)ds2->ds_object,
 		    dp->dp_spa->spa_name,
 		    (u_longlong_t)ds1->ds_object);
 	}
 
 	/* write out the updated scan state */
 	dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
 /*
  * Dataset-walk callback used by dsl_scan_visitds().  'arg' points to the
  * object number of an origin snapshot.  If the dsl_dir of 'hds' originates
  * from that snapshot, follow ds_prev_snap_obj back from the head until we
  * reach the dataset whose previous snapshot is the origin, and add that
  * dataset to the in-memory scan queue.
  *
  * Returns 0 on success (including when 'hds' is unrelated to the origin),
  * otherwise the error from dsl_dataset_hold_obj().
  */
 static int
 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	const uint64_t origin = *(uint64_t *)arg;
 	dsl_scan_t *scn = dp->dp_scan;
 	dsl_dataset_t *cur;
 	int error;
 
 	/* Not a clone of the requested origin; nothing to enqueue. */
 	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != origin)
 		return (0);
 
 	error = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &cur);
 	if (error != 0)
 		return (error);
 
 	for (;;) {
 		uint64_t prevobj = dsl_dataset_phys(cur)->ds_prev_snap_obj;
 		dsl_dataset_t *prev;
 
 		if (prevobj == origin)
 			break;
 
 		/*
 		 * Step one snapshot back.  The current hold is dropped
 		 * whether or not the hold on the predecessor succeeded.
 		 */
 		error = dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev);
 		dsl_dataset_rele(cur, FTAG);
 		if (error != 0)
 			return (error);
 		cur = prev;
 	}
 
 	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, cur->ds_object,
 	    dsl_dataset_phys(cur)->ds_prev_snap_txg);
 	mutex_exit(&scn->scn_queue_lock);
 
 	dsl_dataset_rele(cur, FTAG);
 	return (0);
 }
 
 /*
  * Visit a single dataset on behalf of the scan: traverse the ZIL of a head
  * dataset (when applicable), walk the dataset's root block pointer, and then
  * add the datasets that descend from it (its next snapshot and any clones)
  * to the scan work queue.
  *
  * Nothing further is queued when the scan is suspending.  If
  * DSF_VISIT_DS_AGAIN was raised during the pass, only this dataset is
  * re-queued.  The hold taken on the dataset is always released on return.
  */
 static void
 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 
 	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 
 	if (scn->scn_phys.scn_cur_min_txg >=
 	    scn->scn_phys.scn_max_txg) {
 		/*
 		 * This can happen if this snapshot was created after the
 		 * scan started, and we already completed a previous snapshot
 		 * that was created after the scan started.  This snapshot
 		 * only references blocks with:
 		 *
 		 *	birth < our ds_creation_txg
 		 *	cur_min_txg is no less than ds_creation_txg.
 		 *	We have already visited these blocks.
 		 * or
 		 *	birth > scn_max_txg
 		 *	The scan requested not to visit these blocks.
 		 *
 		 * Subsequent snapshots (and clones) can reference our
 		 * blocks, or blocks with even higher birth times.
 		 * Therefore we do not need to visit them either,
 		 * so we do not add them to the work queue.
 		 *
 		 * Note that checking for cur_min_txg >= cur_max_txg
 		 * is not sufficient, because in that case we may need to
 		 * visit subsequent snapshots.  This happens when min_txg > 0,
 		 * which raises cur_min_txg.  In this case we will visit
 		 * this dataset but skip all of its blocks, because the
 		 * rootbp's birth time is < cur_min_txg.  Then we will
 		 * add the next snapshots/clones to the work queue.
 		 */
 		char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 		dsl_dataset_name(ds, dsname);
 		zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
 		    "cur_min_txg (%llu) >= max_txg (%llu)",
 		    (u_longlong_t)dsobj, dsname,
 		    (u_longlong_t)scn->scn_phys.scn_cur_min_txg,
 		    (u_longlong_t)scn->scn_phys.scn_max_txg);
 		/* The size passed to kmem_free() must match the allocation. */
 		kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 		goto out;
 	}
 
 	/*
 	 * Only the ZIL in the head (non-snapshot) is valid. Even though
 	 * snapshots can have ZIL block pointers (which may be the same
 	 * BP as in the head), they must be ignored. In addition, $ORIGIN
 	 * doesn't have a objset (i.e. its ds_bp is a hole) so we don't
 	 * need to look for a ZIL in it either. So we traverse the ZIL here,
 	 * rather than in scan_recurse(), because the regular snapshot
 	 * block-sharing rules don't apply to it.
 	 */
 	if (!dsl_dataset_is_snapshot(ds) &&
 	    (dp->dp_origin_snap == NULL ||
 	    ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
 		objset_t *os;
 		if (dmu_objset_from_ds(ds, &os) != 0) {
 			goto out;
 		}
 		dsl_scan_zil(dp, &os->os_zil_header);
 	}
 
 	/*
 	 * Iterate over the bps in this ds.
 	 */
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
 	    "suspending=%u",
 	    (u_longlong_t)dsobj, dsname,
 	    (u_longlong_t)scn->scn_phys.scn_cur_min_txg,
 	    (u_longlong_t)scn->scn_phys.scn_cur_max_txg,
 	    (int)scn->scn_suspending);
 	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 	if (scn->scn_suspending)
 		goto out;
 
 	/*
 	 * We've finished this pass over this dataset.
 	 */
 
 	/*
 	 * If we did not completely visit this dataset, do another pass.
 	 */
 	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
 		zfs_dbgmsg("incomplete pass on %s; visiting again",
 		    dp->dp_spa->spa_name);
 		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
 		scan_ds_queue_insert(scn, ds->ds_object,
 		    scn->scn_phys.scn_cur_max_txg);
 		goto out;
 	}
 
 	/*
 	 * Add descendant datasets to work queue.
 	 */
 	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
 		scan_ds_queue_insert(scn,
 		    dsl_dataset_phys(ds)->ds_next_snap_obj,
 		    dsl_dataset_phys(ds)->ds_creation_txg);
 	}
 	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
 		boolean_t usenext = B_FALSE;
 		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
 			uint64_t count;
 			/*
 			 * A bug in a previous version of the code could
 			 * cause upgrade_clones_cb() to not set
 			 * ds_next_snap_obj when it should, leading to a
 			 * missing entry.  Therefore we can only use the
 			 * next_clones_obj when its count is correct.
 			 */
 			int err = zap_count(dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
 			if (err == 0 &&
 			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
 				usenext = B_TRUE;
 		}
 
 		if (usenext) {
 			/* Enqueue every clone listed in the next-clones ZAP. */
 			zap_cursor_t zc;
 			zap_attribute_t *za = zap_attribute_alloc();
 			for (zap_cursor_init(&zc, dp->dp_meta_objset,
 			    dsl_dataset_phys(ds)->ds_next_clones_obj);
 			    zap_cursor_retrieve(&zc, za) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				scan_ds_queue_insert(scn,
 				    zfs_strtonum(za->za_name, NULL),
 				    dsl_dataset_phys(ds)->ds_creation_txg);
 			}
 			zap_cursor_fini(&zc);
 			zap_attribute_free(za);
 		} else {
 			/*
 			 * The ZAP cannot be trusted; fall back to searching
 			 * every dataset for clones of this one.
 			 */
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_clones_cb, &ds->ds_object,
 			    DS_FIND_CHILDREN));
 		}
 	}
 
 out:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * Dataset-walk callback used by dsl_scan_visit().  Starting from 'hds',
  * follow ds_prev_snap_obj back to the oldest dataset in the chain and add it
  * to the in-memory scan queue.  If the chain turns out to belong to a clone
  * (the predecessor's ds_next_snap_obj does not point back at us), nothing is
  * enqueued.
  *
  * Returns 0 on success, otherwise the error from dsl_dataset_hold_obj().
  */
 static int
 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	(void) arg;
 	dsl_scan_t *scn = dp->dp_scan;
 	dsl_dataset_t *cur;
 	uint64_t prevobj;
 	int error;
 
 	error = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &cur);
 	if (error != 0)
 		return (error);
 
 	while ((prevobj = dsl_dataset_phys(cur)->ds_prev_snap_obj) != 0) {
 		dsl_dataset_t *prev;
 		boolean_t is_clone;
 
 		error = dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev);
 		if (error != 0) {
 			dsl_dataset_rele(cur, FTAG);
 			return (error);
 		}
 
 		/*
 		 * If this is a clone, we don't need to worry about it for now.
 		 */
 		is_clone = (dsl_dataset_phys(prev)->ds_next_snap_obj !=
 		    cur->ds_object);
 		dsl_dataset_rele(cur, FTAG);
 		if (is_clone) {
 			dsl_dataset_rele(prev, FTAG);
 			return (0);
 		}
 		cur = prev;
 	}
 
 	mutex_enter(&scn->scn_queue_lock);
 	scan_ds_queue_insert(scn, cur->ds_object,
 	    dsl_dataset_phys(cur)->ds_prev_snap_txg);
 	mutex_exit(&scn->scn_queue_lock);
 
 	dsl_dataset_rele(cur, FTAG);
 	return (0);
 }
 
 /*
  * Hand the block pointers described by one dedup table entry to the active
  * scan function.  Each physical variant of the entry whose birth txg is
  * non-zero and no later than scn_max_txg is turned into a blkptr_t and
  * passed to scan_funcs[] with a zeroed bookmark.  Does nothing when no scan
  * is running or when the scan has already recorded a done txg.
  */
 void
 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
     ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
 {
 	(void) tx;
 	zbookmark_phys_t zb = { 0 };
 	blkptr_t bp;
 
 	if (!dsl_scan_is_running(scn))
 		return;
 
 	/*
 	 * This function is special because it is the only thing
 	 * that can add scan_io_t's to the vdev scan queues from
 	 * outside dsl_scan_sync(). For the most part this is ok
 	 * as long as it is called from within syncing context.
 	 * However, dsl_scan_sync() expects that no new sio's will
 	 * be added between when all the work for a scan is done
 	 * and the next txg when the scan is actually marked as
 	 * completed. This check ensures we do not issue new sio's
 	 * during this period.
 	 */
 	if (scn->scn_done_txg != 0)
 		return;
 
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 		uint64_t birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v);
 
 		/* Skip unused variants and blocks born after the scan range. */
 		if (birth != 0 && birth <= scn->scn_phys.scn_max_txg) {
 			ddt_bp_create(checksum, &ddlwe->ddlwe_key,
 			    &ddlwe->ddlwe_phys, v, &bp);
 
 			scn->scn_visited_this_txg++;
 			scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp,
 			    &zb);
 		}
 	}
 }
 
 /*
  * Scrub/dedup interaction.
  *
  * If there are N references to a deduped block, we don't want to scrub it
  * N times -- ideally, we should scrub it exactly once.
  *
  * We leverage the fact that the dde's replication class (ddt_class_t)
  * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
  * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
  *
  * To prevent excess scrubbing, the scrub begins by walking the DDT
  * to find all blocks with refcnt > 1, and scrubs each of these once.
  * Since there are two replication classes which contain blocks with
  * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
  * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
  *
  * There would be nothing more to say if a block's refcnt couldn't change
  * during a scrub, but of course it can so we must account for changes
  * in a block's replication class.
  *
  * Here's an example of what can occur:
  *
  * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
  * when visited during the top-down scrub phase, it will be scrubbed twice.
  * This negates our scrub optimization, but is otherwise harmless.
  *
  * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
  * on each visit during the top-down scrub phase, it will never be scrubbed.
  * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
  * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
  * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
  * while a scrub is in progress, it scrubs the block right then.
  */
 static void
 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
 	ddt_lightweight_entry_t ddlwe = {0};
 	uint64_t n = 0;
 	int error;
 
 	/*
 	 * Walk the DDT from the persistent bookmark until the walk fails,
 	 * we move past the highest class requested, or we must suspend.
 	 */
 	for (;;) {
 		ddt_t *ddt;
 
 		error = ddt_walk(spa, ddb, &ddlwe);
 		if (error != 0)
 			break;
 
 		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
 			break;
 		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
 		    (longlong_t)ddb->ddb_class,
 		    (longlong_t)ddb->ddb_type,
 		    (longlong_t)ddb->ddb_checksum,
 		    (longlong_t)ddb->ddb_cursor);
 
 		/* There should be no pending changes to the dedup table */
 		ddt = spa->spa_ddt[ddb->ddb_checksum];
 		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
 
 		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx);
 		n++;
 
 		if (dsl_scan_check_suspend(scn, NULL))
 			break;
 	}
 
 	if (error == EAGAIN) {
 		/* EAGAIN from the walk is not treated as a failure. */
 		dsl_scan_check_suspend(scn, NULL);
 		error = 0;
 
 		zfs_dbgmsg("waiting for ddt to become ready for scan "
 		    "on %s with class_max = %u; suspending=%u",
 		    spa->spa_name,
 		    (int)scn->scn_phys.scn_ddt_class_max,
 		    (int)scn->scn_suspending);
 	} else {
 		zfs_dbgmsg("scanned %llu ddt entries on %s with "
 		    "class_max = %u; suspending=%u", (longlong_t)n,
 		    spa->spa_name,
 		    (int)scn->scn_phys.scn_ddt_class_max,
 		    (int)scn->scn_suspending);
 	}
 
 	ASSERT(error == 0 || error == ENOENT);
 	ASSERT(error != ENOENT ||
 	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
 /*
  * Return the highest txg the scan should visit in 'ds': the scan-wide
  * scn_max_txg, additionally capped at the creation txg for a snapshot.
  */
 static uint64_t
 dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
 	dsl_scan_t *scn = ds->ds_dir->dd_pool->dp_scan;
 	uint64_t scan_max = scn->scn_phys.scn_max_txg;
 
 	if (!ds->ds_is_snapshot)
 		return (scan_max);
 
 	return (MIN(scan_max, dsl_dataset_phys(ds)->ds_creation_txg));
 }
 
 /*
  * Drive one pass of the scan traversal: first the DDT phase (while the DDT
  * bookmark is still within the requested classes), then either the MOS and
  * origin or the dataset we were suspended in, and finally every dataset on
  * the in-memory queue.  Returns early whenever the scan is suspending; when
  * the queue drains, the bookmark's objset is set to ZB_DESTROYED_OBJSET.
  */
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	uint64_t resume_obj;
 
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_ddt(scn, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	resume_obj = scn->scn_phys.scn_bookmark.zb_objset;
 	if (resume_obj == DMU_META_OBJSET) {
 		/* First do the MOS & ORIGIN */
 
 		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
 		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
 		dsl_scan_visit_rootbp(scn, NULL,
 		    &dp->dp_meta_rootbp, tx);
 		if (scn->scn_suspending)
 			return;
 
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_DSL_SCRUB) {
 			dsl_scan_visitds(scn,
 			    dp->dp_origin_snap->ds_object, tx);
 		} else {
 			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 			    enqueue_cb, NULL, DS_FIND_CHILDREN));
 		}
 		ASSERT(!scn->scn_suspending);
 	} else if (resume_obj != ZB_DESTROYED_OBJSET) {
 		/*
 		 * If we were suspended, continue from here. Note if the
 		 * ds we were suspended on was deleted, the zb_objset may
 		 * be -1, so we will skip this and find a new objset
 		 * below.
 		 */
 		dsl_scan_visitds(scn, resume_obj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/*
 	 * In case we suspended right at the end of the ds, zero the
 	 * bookmark so we don't think that we're still trying to resume.
 	 */
 	memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t));
 
 	/*
 	 * Keep pulling things out of the dataset avl queue. Updates to the
 	 * persistent zap-object-as-queue happen only at checkpoints.
 	 */
 	for (;;) {
 		scan_ds_t *head = avl_first(&scn->scn_queue);
 		dsl_dataset_t *ds;
 		uint64_t dsobj, queued_txg, floor_txg;
 
 		if (head == NULL)
 			break;
 
 		/* copy out what we need, then dequeue and free the entry */
 		dsobj = head->sds_dsobj;
 		queued_txg = head->sds_txg;
 		scan_ds_queue_remove(scn, dsobj);
 
 		/* set up min / max txg */
 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
 		floor_txg = (queued_txg != 0) ? queued_txg :
 		    dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		scn->scn_phys.scn_cur_min_txg =
 		    MAX(scn->scn_phys.scn_min_txg, floor_txg);
 		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
 		dsl_dataset_rele(ds, FTAG);
 
 		dsl_scan_visitds(scn, dsobj, tx);
 		if (scn->scn_suspending)
 			return;
 	}
 
 	/* No more objsets to fetch, we're done */
 	scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
 	ASSERT0(scn->scn_suspending);
 }
 
 /*
  * Sum (ndisks - nparity) over every top-level vdev of the pool that is not
  * a log, spare, or l2cache device.
  */
 static uint64_t
 dsl_scan_count_data_disks(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t data_disks = 0;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (!(tvd->vdev_islog || tvd->vdev_isspare ||
 		    tvd->vdev_isl2cache)) {
 			data_disks += vdev_get_ndisks(tvd) -
 			    vdev_get_nparity(tvd);
 		}
 	}
 
 	return (data_disks);
 }
 
 /*
  * Account one issued zio in the queue's per-txg statistics: add the
  * allocated size of every DVA in 'bp' to the running byte total and bump
  * the zio count.
  */
 static void
 scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
 {
 	uint64_t asize_sum = 0;
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
 		asize_sum += DVA_GET_ASIZE(&bp->blk_dva[d]);
 
 	q->q_zios_this_txg++;
 	q->q_total_zio_size_this_txg += asize_sum;
 }
 
 /*
  * Account one processed segment [start, end) in the queue's per-txg
  * statistics.
  */
 static void
 scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
     uint64_t end)
 {
 	uint64_t seg_len = end - start;
 
 	q->q_segs_this_txg++;
 	q->q_total_seg_size_this_txg += seg_len;
 }
 
 /*
  * Decide whether queue issuing should stop for this txg.  Returns B_TRUE
  * when the pool is shutting down, or when the scan has already run longer
  * than its minimum time slice and either enough dirty data has built up,
  * a txg sync is waiting, or the sync has been running for at least
  * zfs_txg_timeout seconds.
  */
 static boolean_t
 scan_io_queue_check_suspend(dsl_scan_t *scn)
 {
 	/* See comment in dsl_scan_check_suspend() */
 	dsl_pool_t *dp = scn->scn_dp;
 	uint64_t now_ns = gethrtime();
 	uint64_t scan_elapsed_ns = now_ns - scn->scn_sync_start_time;
 	uint64_t sync_elapsed_ns = now_ns - dp->dp_spa->spa_sync_starttime;
 	uint64_t dirty_threshold = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint_t min_ms;
 
 	if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER)
 		min_ms = zfs_resilver_min_time_ms;
 	else
 		min_ms = zfs_scrub_min_time_ms;
 
 	if (NSEC2MSEC(scan_elapsed_ns) > min_ms) {
 		if (dp->dp_dirty_total >= dirty_threshold ||
 		    txg_sync_waiting(dp) ||
 		    NSEC2SEC(sync_elapsed_ns) >= zfs_txg_timeout)
 			return (B_TRUE);
 	}
 
 	if (spa_shutting_down(dp->dp_spa))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Issue the scan_io_t's on io_list to disk, head first.  Every sio that is
  * issued is removed from the list and freed.  Used when emptying queues,
  * either because we hit the memory limit or because scanning has finished.
  *
  * Returns B_TRUE if we had to stop before the list was exhausted; the sios
  * that were not issued are left on io_list for the caller.
  */
 static boolean_t
 scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio;
 
 	for (sio = list_head(io_list); sio != NULL;
 	    sio = list_head(io_list)) {
 		blkptr_t bp;
 
 		/* Out of time: leave the rest of the list untouched. */
 		if (scan_io_queue_check_suspend(scn))
 			return (B_TRUE);
 
 		sio2bp(sio, &bp);
 		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
 		    &sio->sio_zb, queue);
 		(void) list_remove_head(io_list);
 		scan_io_queues_update_zio_stats(queue, &bp);
 		sio_free(sio);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * This function removes sios from an IO queue which reside within a given
  * range_seg_t and inserts them (in offset order) into a list. Note that
  * we only ever return a maximum of 32 sios at once. If there are more sios
  * to process within this segment that did not make it onto the list we
  * return B_TRUE and otherwise B_FALSE.
  *
  * When B_TRUE is returned the segment is shrunk to start at the first sio
  * that was left behind and q_last_ext_addr records that offset; otherwise
  * the segment is removed from the range tree and q_last_ext_addr is reset
  * to -1.  The caller must hold the vdev's scan I/O queue lock.
  */
 static boolean_t
 scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
 {
 	/* Upper bound on the number of sios handed back per call. */
 	const uint_t max_sios = 32;
 	scan_io_t *srch_sio, *sio, *next_sio;
 	avl_index_t idx;
 	uint_t num_sios = 0;
 	int64_t bytes_issued = 0;
 
 	ASSERT(rs != NULL);
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* Temporary single-DVA sio used only as the AVL search key. */
 	srch_sio = sio_alloc(1);
 	srch_sio->sio_nr_dvas = 1;
 	SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
 
 	/*
 	 * The exact start of the extent might not contain any matching zios,
 	 * so if that's the case, examine the next one in the tree.
 	 */
 	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
 	sio_free(srch_sio);
 
 	if (sio == NULL)
 		sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
 
 	/*
 	 * Stop once max_sios have been gathered; the previous "<=" bound
 	 * let one extra sio through, contradicting the documented limit.
 	 */
 	while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
 	    queue->q_exts_by_addr) && num_sios < max_sios) {
 		ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
 		    queue->q_exts_by_addr));
 		ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
 		    queue->q_exts_by_addr));
 
 		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
 		avl_remove(&queue->q_sios_by_addr, sio);
 		/* Removing the last sio drops the scan's pending-queue count. */
 		if (avl_is_empty(&queue->q_sios_by_addr))
 			atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		bytes_issued += SIO_GET_ASIZE(sio);
 		num_sios++;
 		list_insert_tail(list, sio);
 		sio = next_sio;
 	}
 
 	/*
 	 * We limit the number of sios we process at once to 32 to avoid
 	 * biting off more than we can chew. If we didn't take everything
 	 * in the segment we update it to reflect the work we were able to
 	 * complete. Otherwise, we remove it from the range tree entirely.
 	 */
 	if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
 	    queue->q_exts_by_addr)) {
 		range_tree_adjust_fill(queue->q_exts_by_addr, rs,
 		    -bytes_issued);
 		range_tree_resize_segment(queue->q_exts_by_addr, rs,
 		    SIO_GET_OFFSET(sio), rs_get_end(rs,
 		    queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
 		queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
 		return (B_TRUE);
 	} else {
 		uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
 		uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
 		range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
 		queue->q_last_ext_addr = -1;
 		return (B_FALSE);
 	}
 }
 
 /*
  * Select the next extent from which the queue-emptying thread should issue
  * I/Os, or return NULL if nothing should be issued right now.
  *
  *  - If the scan is neither checkpointing nor clearing, no extent is
  *    selected.
  *  - If zfs_scan_issue_strategy is 1, or it is below 1 and the scan is
  *    checkpointing, the first extent in address (LBA) order is returned.
  *  - Otherwise we resume the extent recorded in q_last_ext_addr if it is
  *    still present, and failing that take the first entry of the by-size
  *    tree and remember its start address.
  *
  * The caller must hold the vdev's scan I/O queue lock.
  */
 static range_seg_t *
 scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	range_tree_t *rt = queue->q_exts_by_addr;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	ASSERT(scn->scn_is_sorted);
 
 	if (!(scn->scn_checkpointing || scn->scn_clearing))
 		return (NULL);
 
 	/*
 	 * During normal clearing, we want to issue our largest segments
 	 * first, keeping IO as sequential as possible, and leaving the
 	 * smaller extents for later with the hope that they might eventually
 	 * grow to larger sequential segments. However, when the scan is
 	 * checkpointing, no new extents will be added to the sorting queue,
 	 * so the way we are sorted now is as good as it will ever get.
 	 * In this case, we instead switch to issuing extents in LBA order.
 	 */
 	if (zfs_scan_issue_strategy == 1 ||
 	    (zfs_scan_issue_strategy < 1 && scn->scn_checkpointing))
 		return (range_tree_first(rt));
 
 	/*
 	 * Try to continue previous extent if it is not completed yet.  After
 	 * shrink in scan_io_queue_gather() it may no longer be the best, but
 	 * otherwise we leave shorter remnant every txg.
 	 */
 	const uint64_t probe_len = 1ULL << rt->rt_shift;
 	if (queue->q_last_ext_addr != -1) {
 		range_seg_t *resumed = range_tree_find(rt,
 		    queue->q_last_ext_addr, probe_len);
 		if (resumed != NULL)
 			return (resumed);
 	}
 
 	/*
 	 * Nothing to continue, so find new best extent.
 	 */
 	uint64_t *best = zfs_btree_first(&queue->q_exts_by_size, NULL);
 	if (best == NULL)
 		return (NULL);
 
 	uint64_t start = *best << rt->rt_shift;
 	queue->q_last_ext_addr = start;
 
 	/*
 	 * We need to get the original entry in the by_addr tree so we can
 	 * modify it.
 	 */
 	range_seg_t *addr_rs = range_tree_find(rt, start, probe_len);
 	ASSERT3P(addr_rs, !=, NULL);
 	ASSERT3U(rs_get_start(addr_rs, rt), ==, start);
 	ASSERT3U(rs_get_end(addr_rs, rt), >, start);
 	return (addr_rs);
 }
 
 /*
  * Taskq callback dispatched by scan_io_queues_run() with a vdev's
  * dsl_scan_io_queue_t as 'arg'.  Repeatedly fetches an extent, gathers its
  * sios in batches and issues them, until no extent is selected or issuing
  * is suspended.  All I/O issued here is parented to a null zio created
  * under the scan's root zio; sios left unissued on suspension are put back
  * on the queue.
  */
 static void
 scan_io_queues_run_one(void *arg)
 {
 	dsl_scan_io_queue_t *queue = arg;
 	kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 	boolean_t suspended = B_FALSE;
 	range_seg_t *rs;
 	scan_io_t *sio;
 	zio_t *zio;
 	list_t sio_list;
 
 	ASSERT(queue->q_scn->scn_is_sorted);
 
 	list_create(&sio_list, sizeof (scan_io_t),
 	    offsetof(scan_io_t, sio_nodes.sio_list_node));
 	/* Per-run parent zio, published through queue->q_zio below. */
 	zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa,
 	    NULL, NULL, NULL, ZIO_FLAG_CANFAIL);
 	mutex_enter(q_lock);
 	queue->q_zio = zio;
 
 	/* Calculate maximum in-flight bytes for this vdev. */
 	queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
 	    (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));
 
 	/* reset per-queue scan statistics for this txg */
 	queue->q_total_seg_size_this_txg = 0;
 	queue->q_segs_this_txg = 0;
 	queue->q_total_zio_size_this_txg = 0;
 	queue->q_zios_this_txg = 0;
 
 	/* loop until we run out of time or sios */
 	while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
 		/* seg_start == 0 doubles as "not yet recorded" below. */
 		uint64_t seg_start = 0, seg_end = 0;
 		boolean_t more_left;
 
 		ASSERT(list_is_empty(&sio_list));
 
 		/* loop while we still have sios left to process in this rs */
 		do {
 			scan_io_t *first_sio, *last_sio;
 
 			/*
 			 * We have selected which extent needs to be
 			 * processed next. Gather up the corresponding sios.
 			 */
 			more_left = scan_io_queue_gather(queue, rs, &sio_list);
 			ASSERT(!list_is_empty(&sio_list));
 			first_sio = list_head(&sio_list);
 			last_sio = list_tail(&sio_list);
 
 			/*
 			 * Track the span covered so far: the end advances
 			 * with every batch, the start is taken from the
 			 * first batch only.
 			 */
 			seg_end = SIO_GET_END_OFFSET(last_sio);
 			if (seg_start == 0)
 				seg_start = SIO_GET_OFFSET(first_sio);
 
 			/*
 			 * Issuing sios can take a long time so drop the
 			 * queue lock. The sio queue won't be updated by
 			 * other threads since we're in syncing context so
 			 * we can be sure that our trees will remain exactly
 			 * as we left them.
 			 */
 			mutex_exit(q_lock);
 			suspended = scan_io_queue_issue(queue, &sio_list);
 			mutex_enter(q_lock);
 
 			if (suspended)
 				break;
 		} while (more_left);
 
 		/* update statistics for debugging purposes */
 		scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
 
 		if (suspended)
 			break;
 	}
 
 	/*
 	 * If we were suspended in the middle of processing,
 	 * requeue any unfinished sios and exit.
 	 */
 	while ((sio = list_remove_head(&sio_list)) != NULL)
 		scan_io_queue_insert_impl(queue, sio);
 
 	/* Unpublish the parent zio before dropping the lock and firing it. */
 	queue->q_zio = NULL;
 	mutex_exit(q_lock);
 	zio_nowait(zio);
 	list_destroy(&sio_list);
 }
 
 /*
  * Run one emptying pass over every scan queue in the pool by dispatching
  * scan_io_queues_run_one() to a taskq once per top-level vdev that has a
  * queue.  The passes can run in parallel because each queue's I/Os only
  * affect its own top-level vdev.
  *
  * Blocks until all dispatched passes have returned, and must be called
  * from dsl_scan_sync (or in general, syncing context).
  */
 static void
 scan_io_queues_run(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(scn->scn_is_sorted);
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	/* No queue holds any sios, so there is nothing to issue. */
 	if (scn->scn_queues_pending == 0)
 		return;
 
 	if (scn->scn_taskq == NULL) {
 		int nthreads = rvd->vdev_children;
 
 		/*
 		 * We need to make this taskq *always* execute as many
 		 * threads in parallel as we have top-level vdevs and no
 		 * less, otherwise strange serialization of the calls to
 		 * scan_io_queues_run_one can occur during spa_sync runs
 		 * and that significantly impacts performance.
 		 */
 		scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
 		    minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
 	}
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		mutex_enter(&tvd->vdev_scan_io_queue_lock);
 		if (tvd->vdev_scan_io_queue != NULL) {
 			VERIFY(taskq_dispatch(scn->scn_taskq,
 			    scan_io_queues_run_one, tvd->vdev_scan_io_queue,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * Wait for the queues to finish issuing their IOs for this run
 	 * before we return. There may still be IOs in flight at this
 	 * point.
 	 */
 	taskq_wait(scn->scn_taskq);
 }
 
 /*
  * Decide whether asynchronous block processing should stop for this txg.
  * Never pauses when zfs_recover is set.  Otherwise pauses when the
  * per-txg block or dedup-free limits (if non-zero) have been reached, when
  * more than zfs_txg_timeout seconds have elapsed since the sync started,
  * when the minimum time slice is used up and a txg sync is waiting, or when
  * the pool is shutting down.
  */
 static boolean_t
 dsl_scan_async_block_should_pause(dsl_scan_t *scn)
 {
 	uint64_t elapsed_ns;
 
 	if (zfs_recover)
 		return (B_FALSE);
 
 	if (zfs_async_block_max_blocks != 0 &&
 	    scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
 		return (B_TRUE);
 
 	if (zfs_max_async_dedup_frees != 0 &&
 	    scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees)
 		return (B_TRUE);
 
 	elapsed_ns = gethrtime() - scn->scn_sync_start_time;
 
 	if (elapsed_ns / NANOSEC > zfs_txg_timeout)
 		return (B_TRUE);
 
 	if (NSEC2MSEC(elapsed_ns) > scn->scn_async_block_min_time_ms &&
 	    txg_sync_waiting(scn->scn_dp))
 		return (B_TRUE);
 
 	if (spa_shutting_down(scn->scn_dp->dp_spa))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Per-block callback that frees 'bp' in the current txg and credits the
  * space back to the pool's free dir.  'arg' is the dsl_scan_t.
  *
  * Returns ERESTART, without freeing, when the pause check says to stop;
  * that check is only made when not iterating a bptree, or for level-0
  * blocks that are not of type DMU_OT_OBJSET.  Returns 0 otherwise.
  */
 static int
 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = arg;
 	spa_t *spa = scn->scn_dp->dp_spa;
 	boolean_t pausable;
 
 	pausable = !scn->scn_is_bptree ||
 	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET);
 	if (pausable && dsl_scan_async_block_should_pause(scn))
 		return (SET_ERROR(ERESTART));
 
 	zio_nowait(zio_free_sync(scn->scn_zio_root, spa,
 	    dmu_tx_get_txg(tx), bp, 0));
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 
 	/* Feed the counters consulted by the pause check. */
 	scn->scn_visited_this_txg++;
 	if (BP_GET_DEDUP(bp))
 		scn->scn_dedup_frees_this_txg++;
 
 	return (0);
 }
 
 /*
  * Roll the per-txg statistics of every top-level vdev's scan queue up into
  * the scan-wide counters.  If no segments or no zios were recorded, all
  * four scan-wide values are reset to zero.
  */
 static void
 dsl_scan_update_stats(dsl_scan_t *scn)
 {
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 	uint64_t seg_bytes = 0, seg_cnt = 0;
 	uint64_t zio_bytes = 0, zio_cnt = 0;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		dsl_scan_io_queue_t *q = rvd->vdev_child[c]->vdev_scan_io_queue;
 
 		if (q != NULL) {
 			seg_bytes += q->q_total_seg_size_this_txg;
 			zio_bytes += q->q_total_zio_size_this_txg;
 			seg_cnt += q->q_segs_this_txg;
 			zio_cnt += q->q_zios_this_txg;
 		}
 	}
 
 	if (seg_cnt != 0 && zio_cnt != 0) {
 		scn->scn_avg_seg_size_this_txg = seg_bytes / seg_cnt;
 		scn->scn_avg_zio_size_this_txg = zio_bytes / zio_cnt;
 		scn->scn_segs_this_txg = seg_cnt;
 		scn->scn_zios_this_txg = zio_cnt;
 	} else {
 		/* Avoid dividing by zero; report an idle txg instead. */
 		scn->scn_avg_seg_size_this_txg = 0;
 		scn->scn_avg_zio_size_this_txg = 0;
 		scn->scn_segs_this_txg = 0;
 		scn->scn_zios_this_txg = 0;
 	}
 }
 
 /*
  * Adapter that lets dsl_scan_free_block_cb() be used as a bpobj iteration
  * callback, which carries an extra 'bp_freed' argument.  Entries reaching
  * this callback are asserted not to be marked freed.
  */
 static int
 bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	int error;
 
 	ASSERT(!bp_freed);
 	error = dsl_scan_free_block_cb(arg, bp, tx);
 	return (error);
 }
 
 /*
  * Per-block callback that marks the region described by the first DVA of
  * 'bp' as obsolete on its vdev.  'arg' is the dsl_scan_t.
  *
  * Returns ERESTART, without marking anything, when the pause check says to
  * stop; returns 0 otherwise.
  */
 static int
 dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 	dsl_scan_t *scn = arg;
 	const dva_t *first_dva = &bp->blk_dva[0];
 
 	if (dsl_scan_async_block_should_pause(scn)) {
 		return (SET_ERROR(ERESTART));
 	}
 
 	spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
 	    DVA_GET_VDEV(first_dva), DVA_GET_OFFSET(first_dva),
 	    DVA_GET_ASIZE(first_dva), tx);
 	scn->scn_visited_this_txg++;
 
 	return (0);
 }
 
 /*
  * Report whether the scan code has work to do.  Always B_FALSE while the
  * pool is loading or shutting down.  Otherwise B_TRUE when a scan is
  * running and not a paused scrub, when an async destroy is in progress and
  * not stalled, when the free bpobj still holds space, or when the livelist
  * delete check reports work left.
  */
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 	uint64_t used = 0, comp, uncomp;
 	boolean_t clones_left;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 
 	if (dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn))
 		return (B_TRUE);
 	if (scn->scn_async_destroying && !scn->scn_async_stalled)
 		return (B_TRUE);
 
 	/* 'used' stays 0 on pools that predate deadlists. */
 	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
 		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
 		    &used, &comp, &uncomp);
 	}
 	clones_left = spa_livelist_delete_check(spa);
 
 	if (used != 0)
 		return (B_TRUE);
 	return (clones_left ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether an error scrub is in progress.  Always B_FALSE while the
  * pool is loading or shutting down.
  */
 boolean_t
 dsl_errorscrub_active(dsl_scan_t *scn)
 {
 	spa_t *spa = scn->scn_dp->dp_spa;
 
 	if (spa->spa_load_state != SPA_LOAD_NONE || spa_shutting_down(spa))
 		return (B_FALSE);
 
 	return (dsl_errorscrubbing(scn->scn_dp) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Recursively check the vdev tree rooted at 'vd' and return B_TRUE if it
  * contains at least one concrete, non-aux leaf vdev whose resilver is not
  * deferred.
  */
 static boolean_t
 dsl_scan_check_deferred(vdev_t *vd)
 {
 	boolean_t found = B_FALSE;
 
 	/* Visit every child; one hit anywhere below is enough. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (dsl_scan_check_deferred(vd->vdev_child[c]))
 			found = B_TRUE;
 	}
 
 	/* Only concrete, non-aux leaves contribute a verdict themselves. */
 	if (vdev_is_concrete(vd) && !vd->vdev_aux &&
 	    vd->vdev_ops->vdev_op_leaf) {
 		if (!vd->vdev_resilver_deferred)
 			found = B_TRUE;
 	}
 
 	return (found);
 }
 
 /*
  * Decide whether a resilver I/O must be issued for the block region
  * described by 'dva', 'psize' and 'phys_birth'.  DVAs on indirect vdevs
  * and gang DVAs are always resilvered; otherwise the top-level vdev's DTL
  * check and the deferred-resilver check must both say so.
  */
 static boolean_t
 dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_t *tvd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	/*
 	 * The indirect vdev can point to multiple
 	 * vdevs.  For simplicity, always create
 	 * the resilver zio_t. zio_vdev_io_start()
 	 * will bypass the child resilver i/o's if
 	 * they are on vdevs that don't have DTL's.
 	 */
 	if (tvd->vdev_ops == &vdev_indirect_ops)
 		return (B_TRUE);
 
 	/*
 	 * Gang members may be spread across multiple
 	 * vdevs, so the best estimate we have is the
 	 * scrub range, which has already been checked.
 	 * XXX -- it would be better to change our
 	 * allocation policy to ensure that all
 	 * gang members reside on the same vdev.
 	 */
 	if (DVA_GET_GANG(dva))
 		return (B_TRUE);
 
 	/*
 	 * Check if the top-level vdev must resilver this offset.
 	 * When the offset does not intersect with a dirty leaf DTL
 	 * then it may be possible to skip the resilver IO.  The psize
 	 * is provided instead of asize to simplify the check for RAIDZ.
 	 */
 	if (!vdev_dtl_need_resilver(tvd, dva, psize, phys_birth))
 		return (B_FALSE);
 
 	/*
 	 * Check that this top-level vdev has a device under it which
 	 * is resilvering and is not deferred.
 	 */
 	return (dsl_scan_check_deferred(tvd));
 }
 
 /*
  * Free blocks that were queued for background (async) destruction and
  * then drain the obsolete bpobj.
  *
  * The work is done in this order:
  *  1. blocks in dp_free_bpobj, when zfs_free_bpobj_enabled is set and the
  *     pool version is at least SPA_VERSION_DEADLISTS;
  *  2. blocks in the bptree at dp_bptree_obj, when step 1 left err == 0 and
  *     the async_destroy feature is active;
  *  3. with zfs_free_leak_on_eio set, transfer of whatever accounting is
  *     left in dp_free_dir to dp_leak_dir;
  *  4. entries of dp_obsolete_bpobj, if that bpobj is open.
  *
  * Returns 0 right away when spa_suspend_async_destroy() is true.  If steps
  * 1-2 leave a non-zero error (ERESTART included) that error is returned
  * and steps 3-4 are not run.  Otherwise 0 is returned; an error from step
  * 4 is not propagated to the caller.
  */
 static int
 dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	int err = 0;
 
 	if (spa_suspend_async_destroy(spa))
 		return (0);
 
 	/* Step 1: free the blocks listed in the pool-wide free bpobj. */
 	if (zfs_free_bpobj_enabled &&
 	    spa_version(spa) >= SPA_VERSION_DEADLISTS) {
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bpobj_iterate(&dp->dp_free_bpobj,
 		    bpobj_dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;
 
 		/* ERESTART is tolerated; anything else is reported. */
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 	}
 
 	/* Step 2: free the blocks reachable from the async-destroy bptree. */
 	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		ASSERT(scn->scn_async_destroying);
 		scn->scn_is_bptree = B_TRUE;
 		scn->scn_zio_root = zio_root(spa, NULL,
 		    NULL, ZIO_FLAG_MUSTSUCCEED);
 		err = bptree_iterate(dp->dp_meta_objset,
 		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
 		VERIFY0(zio_wait(scn->scn_zio_root));
 		scn->scn_zio_root = NULL;
 
 		/*
 		 * EIO and ECKSUM are swallowed here; ERESTART is kept and
 		 * returned further down; any other error is reported.
 		 */
 		if (err == EIO || err == ECKSUM) {
 			err = 0;
 		} else if (err != 0 && err != ERESTART) {
 			zfs_panic_recover("error %u from "
 			    "traverse_dataset_destroyed()", err);
 		}
 
 		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
 			/* finished; deactivate async destroy feature */
 			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
 			ASSERT(!spa_feature_is_active(spa,
 			    SPA_FEATURE_ASYNC_DESTROY));
 			VERIFY0(zap_remove(dp->dp_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT,
 			    DMU_POOL_BPTREE_OBJ, tx));
 			VERIFY0(bptree_free(dp->dp_meta_objset,
 			    dp->dp_bptree_obj, tx));
 			dp->dp_bptree_obj = 0;
 			scn->scn_async_destroying = B_FALSE;
 			scn->scn_async_stalled = B_FALSE;
 		} else {
 			/*
 			 * If we didn't make progress, mark the async
 			 * destroy as stalled, so that we will not initiate
 			 * a spa_sync() on its behalf.  Note that we only
 			 * check this if we are not finished, because if the
 			 * bptree had no blocks for us to visit, we can
 			 * finish without "making progress".
 			 */
 			scn->scn_async_stalled =
 			    (scn->scn_visited_this_txg == 0);
 		}
 	}
 	/* Log and reset the per-txg counters if anything was freed above. */
 	if (scn->scn_visited_this_txg) {
 		zfs_dbgmsg("freed %llu blocks in %llums from "
 		    "free_bpobj/bptree on %s in txg %llu; err=%u",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    (longlong_t)
 		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
 		    spa->spa_name, (longlong_t)tx->tx_txg, err);
 		scn->scn_visited_this_txg = 0;
 		scn->scn_dedup_frees_this_txg = 0;
 
 		/*
 		 * Write out changes to the DDT and the BRT that may be required
 		 * as a result of the blocks freed.  This ensures that the DDT
 		 * and the BRT are clean when a scrub/resilver runs.
 		 */
 		ddt_sync(spa, tx->tx_txg);
 		brt_sync(spa, tx->tx_txg);
 	}
 	/* From here on err is known to be 0. */
 	if (err != 0)
 		return (err);
 	/* Step 3: move leaked space accounting from free_dir to leak_dir. */
 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
 	    zfs_free_leak_on_eio &&
 	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
 	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
 	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
 		/*
 		 * We have finished background destroying, but there is still
 		 * some space left in the dp_free_dir. Transfer this leaked
 		 * space to the dp_leak_dir.
 		 */
 		if (dp->dp_leak_dir == NULL) {
 			/* Create the leak dir on first use. */
 			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
 			    LEAK_DIR_NAME, tx);
 			VERIFY0(dsl_pool_open_special_dir(dp,
 			    LEAK_DIR_NAME, &dp->dp_leak_dir));
 			rrw_exit(&dp->dp_config_rwlock, FTAG);
 		}
 		/* Credit the leak dir, then debit the same from the free dir. */
 		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
 		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
 		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
 	}
 
 	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
 	    !spa_livelist_delete_check(spa)) {
 		/* finished; verify that space accounting went to zero */
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
 		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
 	}
 
 	spa_notify_waiters(spa);
 
 	/*
 	 * Step 4: the obsolete bpobj is expected to be open exactly when
 	 * its entry exists in the pool directory.
 	 */
 	EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
 	    0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_OBSOLETE_BPOBJ));
 	if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
 		ASSERT(spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_OBSOLETE_COUNTS));
 
 		scn->scn_is_bptree = B_FALSE;
 		scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
 		err = bpobj_iterate(&dp->dp_obsolete_bpobj,
 		    dsl_scan_obsolete_block_cb, scn, tx);
 		if (err != 0 && err != ERESTART)
 			zfs_panic_recover("error %u from bpobj_iterate()", err);
 
 		if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
 			dsl_pool_destroy_obsolete_bpobj(dp, tx);
 	}
 	/* An error from the obsolete bpobj pass is deliberately not returned. */
 	return (0);
 }
 
 /*
  * Parse a string of four ':'-separated numbers, as understood by
  * zfs_strtonum(), into the objset, object, level and blkid fields of zb.
  * The expected separators and terminator are only checked by ASSERT().
  */
 static void
 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	char *cur = buf;
 
 	zb->zb_objset = zfs_strtonum(cur, &cur);
 	ASSERT(*cur == ':');
 	cur++;
 
 	zb->zb_object = zfs_strtonum(cur, &cur);
 	ASSERT(*cur == ':');
 	cur++;
 
 	zb->zb_level = (int)zfs_strtonum(cur, &cur);
 	ASSERT(*cur == ':');
 	cur++;
 
 	zb->zb_blkid = zfs_strtonum(cur, &cur);
 	ASSERT(*cur == '\0');
 }
 
 /*
  * Parse buf with zfs_strtonum() and store the result in *obj.  The whole
  * string is expected to be consumed; this is only checked by ASSERT().
  */
 static void
 name_to_object(char *buf, uint64_t *obj)
 {
 	char *end = buf;
 
 	*obj = zfs_strtonum(buf, &end);
 	ASSERT(*end == '\0');
 }
 
 /*
  * Look up the block pointer identified by bookmark zb and hand it to
  * scan_exec_io() as a raw scrub read.  Nothing is issued, and the function
  * returns silently, when the dataset, objset or dnode cannot be held, when
  * the dataset's keys are unavailable, when the block pointer cannot be
  * found, or when it is a hole.
  */
 static void
 read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
 {
 	dsl_pool_t *dp = scn->scn_dp;
 	dsl_dataset_t *ds;
 	objset_t *os;
 	dnode_t *dn;
 	blkptr_t bp;
 	int zio_flags;
 
 	if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
 		return;
 
 	if (dmu_objset_from_ds(ds, &os) != 0)
 		goto out_ds;
 
 	/*
 	 * If the key is not loaded dbuf_dnode_findbp() will error out with
 	 * EACCES. However in that case dnode_hold() will eventually call
 	 * dbuf_read()->zio_wait() which may call spa_log_error(). This will
 	 * lead to a deadlock due to us holding the mutex spa_errlist_lock.
 	 * Avoid this by checking here if the keys are loaded, if not return.
 	 * If the keys are not loaded the head_errlog feature is meaningless
 	 * as we cannot figure out the birth txg of the block pointer.
 	 */
 	if (dsl_dataset_get_keystatus(ds->ds_dir) ==
 	    ZFS_KEYSTATUS_UNAVAILABLE)
 		goto out_ds;
 
 	if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0)
 		goto out_ds;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/* No block pointer, or a hole: there is nothing to read. */
 	if (dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
 	    NULL) != 0 || BP_IS_HOLE(&bp))
 		goto out_dn;
 
 	zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb.zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	ASSERT(!BP_IS_EMBEDDED(&bp));
 	scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
 
 out_dn:
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 out_ds:
 	dsl_dataset_rele(ds, FTAG);
 }
 
 /*
  * We keep track of the scrubbed error blocks in "count". This will be used
  * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This
  * function is modelled after check_filesystem().
  */
 static int
 scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
     int *count)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t latest_txg;
 	uint64_t txg_to_consider = spa->spa_syncing_txg;
 	boolean_t check_snapshot = B_TRUE;
 
 	error = find_birth_txg(ds, zep, &latest_txg);
 
 	/*
 	 * If find_birth_txg() errors out, then err on the side of caution and
 	 * proceed. In worst case scenario scrub all objects. If zep->zb_birth
 	 * is 0 (e.g. in case of encryption with unloaded keys) also proceed to
 	 * scrub all objects.
 	 */
 	if (error == 0 && zep->zb_birth == latest_txg) {
 		/* Block neither free nor re written. */
 		zbookmark_phys_t zb;
 		zep_to_zb(fs, zep, &zb);
 		scn->scn_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 		/* We have already acquired the config lock for spa */
 		read_by_block_level(scn, zb);
 
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		scn->errorscrub_phys.dep_examined++;
 		scn->errorscrub_phys.dep_to_examine--;
 		(*count)++;
 		if ((*count) == zfs_scrub_error_blocks_per_txg ||
 		    dsl_error_scrub_check_suspend(scn, &zb)) {
 			dsl_dataset_rele(ds, FTAG);
 			return (SET_ERROR(EFAULT));
 		}
 
 		check_snapshot = B_FALSE;
 	} else if (error == 0) {
 		txg_to_consider = latest_txg;
 	}
 
 	/*
 	 * Retrieve the number of snapshots if the dataset is not a snapshot.
 	 */
 	uint64_t snap_count = 0;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 
 		error = zap_count(spa->spa_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 
 		if (error != 0) {
 			dsl_dataset_rele(ds, FTAG);
 			return (error);
 		}
 	}
 
 	if (snap_count == 0) {
 		/* Filesystem without snapshots. */
 		dsl_dataset_rele(ds, FTAG);
 		return (0);
 	}
 
 	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 	dsl_dataset_rele(ds, FTAG);
 
 	/* Check only snapshots created from this file system. */
 	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
 	    snap_obj_txg <= txg_to_consider) {
 
 		error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
 		if (error != 0)
 			return (error);
 
 		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
 			snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 			snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			dsl_dataset_rele(ds, FTAG);
 			continue;
 		}
 
 		boolean_t affected = B_TRUE;
 		if (check_snapshot) {
 			uint64_t blk_txg;
 			error = find_birth_txg(ds, zep, &blk_txg);
 
 			/*
 			 * Scrub the snapshot also when zb_birth == 0 or when
 			 * find_birth_txg() returns an error.
 			 */
 			affected = (error == 0 && zep->zb_birth == blk_txg) ||
 			    (error != 0) || (zep->zb_birth == 0);
 		}
 
 		/* Scrub snapshots. */
 		if (affected) {
 			zbookmark_phys_t zb;
 			zep_to_zb(snap_obj, zep, &zb);
 			scn->scn_zio_root = zio_root(spa, NULL, NULL,
 			    ZIO_FLAG_CANFAIL);
 			/* We have already acquired the config lock for spa */
 			read_by_block_level(scn, zb);
 
 			(void) zio_wait(scn->scn_zio_root);
 			scn->scn_zio_root = NULL;
 
 			scn->errorscrub_phys.dep_examined++;
 			scn->errorscrub_phys.dep_to_examine--;
 			(*count)++;
 			if ((*count) == zfs_scrub_error_blocks_per_txg ||
 			    dsl_error_scrub_check_suspend(scn, &zb)) {
 				dsl_dataset_rele(ds, FTAG);
 				return (EFAULT);
 			}
 		}
 		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		dsl_dataset_rele(ds, FTAG);
 	}
 	return (0);
 }
 
 /*
  * Syncing-context entry point for the error scrub: re-read the blocks
  * recorded in the pool's error log, at most
  * zfs_scrub_error_blocks_per_txg of them per call.
  *
  * Without the head_errlog feature each entry under errorscrub_cursor names
  * a bookmark that is read directly.  With the feature enabled each entry
  * names a head dataset plus a ZAP object of error blocks, and every error
  * block is handed to scrub_filesystem().
  *
  * When the pass ends without hitting the limit the error scrub is marked
  * done; in both cases the scrub state is synced before returning.
  */
 void
 dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * If the spa is shutting down, then stop scanning. This will
 	 * ensure that the scan does not dirty any new data during the
 	 * shutdown phase.
 	 */
 	if (spa_shutting_down(spa))
 		return;
 
 	if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
 		return;
 	}
 
 	if (dsl_scan_resilvering(scn->scn_dp)) {
 		/* cancel the error scrub if resilver started */
 		dsl_scan_cancel(scn->scn_dp);
 		return;
 	}
 
 	spa->spa_scrub_active = B_TRUE;
 	scn->scn_sync_start_time = gethrtime();
 
 	/*
 	 * zfs_scan_suspend_progress can be set to disable scrub progress.
 	 * See more detailed comment in dsl_scan_sync().
 	 */
 	if (zfs_scan_suspend_progress) {
 		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		int mintime = zfs_scrub_min_time_ms;
 
 		while (zfs_scan_suspend_progress &&
 		    !txg_sync_waiting(scn->scn_dp) &&
 		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
 		    NSEC2MSEC(scan_time_ns) < mintime) {
 			delay(hz);
 			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		}
 		return;
 	}
 
 	/* Number of error blocks scrubbed so far in this txg. */
 	int i = 0;
 	zap_attribute_t *za;
 	zbookmark_phys_t *zb;
 	boolean_t limit_exceeded = B_FALSE;
 
 	za = zap_attribute_alloc();
 	zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
 
 	/* Legacy error log: every entry name is itself a block bookmark. */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
 		    zap_cursor_advance(&scn->errorscrub_cursor)) {
 			name_to_bookmark(za->za_name, zb);
 
 			scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 			    NULL, ZIO_FLAG_CANFAIL);
 			dsl_pool_config_enter(dp, FTAG);
 			read_by_block_level(scn, *zb);
 			dsl_pool_config_exit(dp, FTAG);
 
 			(void) zio_wait(scn->scn_zio_root);
 			scn->scn_zio_root = NULL;
 
 			scn->errorscrub_phys.dep_examined += 1;
 			scn->errorscrub_phys.dep_to_examine -= 1;
 			i++;
 			if (i == zfs_scrub_error_blocks_per_txg ||
 			    dsl_error_scrub_check_suspend(scn, zb)) {
 				limit_exceeded = B_TRUE;
 				break;
 			}
 		}
 
 		if (!limit_exceeded)
 			dsl_errorscrub_done(scn, B_TRUE, tx);
 
 		dsl_errorscrub_sync_state(scn, tx);
 		zap_attribute_free(za);
 		kmem_free(zb, sizeof (*zb));
 		return;
 	}
 
 	/*
 	 * head_errlog: the outer cursor walks head datasets, the inner
 	 * cursor walks the error blocks recorded for one head dataset.
 	 */
 	int error = 0;
 	for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
 	    zap_cursor_advance(&scn->errorscrub_cursor)) {
 
 		zap_cursor_t *head_ds_cursor;
 		zap_attribute_t *head_ds_attr;
 		zbookmark_err_phys_t head_ds_block;
 
 		head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 		head_ds_attr = zap_attribute_alloc();
 
 		uint64_t head_ds_err_obj = za->za_first_integer;
 		uint64_t head_ds;
 		name_to_object(za->za_name, &head_ds);
 		boolean_t config_held = B_FALSE;
 		uint64_t top_affected_fs;
 
 		for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
 		    head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
 		    head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
 
 			name_to_errphys(head_ds_attr->za_name, &head_ds_block);
 
 			/*
 			 * In case we are called from spa_sync the pool
 			 * config is already held.
 			 */
 			if (!dsl_pool_config_held(dp)) {
 				dsl_pool_config_enter(dp, FTAG);
 				config_held = B_TRUE;
 			}
 
 			error = find_top_affected_fs(spa,
 			    head_ds, &head_ds_block, &top_affected_fs);
 			if (error)
 				break;
 
 			error = scrub_filesystem(spa, top_affected_fs,
 			    &head_ds_block, &i);
 
 			/*
 			 * scrub_filesystem() reports "limit reached or
 			 * suspend requested" as EFAULT.  Compare against
 			 * the plain errno: SET_ERROR() fires the set-error
 			 * probe and belongs only where an error is raised.
 			 *
 			 * NOTE(review): this break (and the one above) only
 			 * leaves the inner loop; the outer loop still
 			 * advances to the next head dataset.  Confirm that
 			 * this is intended.
 			 */
 			if (error == EFAULT) {
 				limit_exceeded = B_TRUE;
 				break;
 			}
 		}
 
 		zap_cursor_fini(head_ds_cursor);
 		kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
 		zap_attribute_free(head_ds_attr);
 
 		if (config_held)
 			dsl_pool_config_exit(dp, FTAG);
 	}
 
 	zap_attribute_free(za);
 	kmem_free(zb, sizeof (*zb));
 	if (!limit_exceeded)
 		dsl_errorscrub_done(scn, B_TRUE, tx);
 
 	dsl_errorscrub_sync_state(scn, tx);
 }
 
 /*
  * This is the primary entry point for scans that is called from syncing
  * context. Scans must happen entirely during syncing context so that we
  * can guarantee that blocks we are currently scanning will not change out
  * from under us. While a scan is active, this function controls how quickly
  * transaction groups proceed, instead of the normal handling provided by
  * txg_sync_thread().
  *
  * In order, one call:
  *  - decides whether a deferred resilver should restart the scan early;
  *  - restarts the scan if requested (as a resilver when
  *    vdev_resilver_needed() says so, otherwise as a scrub);
  *  - processes async destroys, returning early if that reports an error;
  *  - then performs exactly one of: a metadata traversal
  *    (dsl_scan_visit()), issuing queued i/o (scan_io_queues_run()), or
  *    completing the scan (dsl_scan_done());
  *  - finally syncs the scan state via dsl_scan_sync_state().
  */
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	int err = 0;
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	state_sync_type_t sync_type = SYNC_OPTIONAL;
 	int restart_early = 0;
 
 	if (spa->spa_resilver_deferred) {
 		uint64_t to_issue, issued;
 
 		if (!spa_feature_is_active(dp->dp_spa,
 		    SPA_FEATURE_RESILVER_DEFER))
 			spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 
 		/*
 		 * See print_scan_scrub_resilver_status() issued/total_i
 		 * @ cmd/zpool/zpool_main.c
 		 */
 		to_issue =
 		    scn->scn_phys.scn_to_examine - scn->scn_phys.scn_skipped;
 		issued =
 		    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 		/*
 		 * Restart early when deferral is disabled, or when the
 		 * bytes issued so far are below zfs_resilver_defer_percent
 		 * percent of what is to be issued.
 		 */
 		restart_early =
 		    zfs_resilver_disable_defer ||
 		    (issued < (to_issue * zfs_resilver_defer_percent / 100));
 	}
 
 	/*
 	 * Only process scans in sync pass 1.
 	 */
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
 	 * that we can restart an old-style scan while the pool is being
 	 * imported (see dsl_scan_init). We also restart scans if there
 	 * is a deferred resilver and the user has manually disabled
 	 * deferred resilvers via zfs_resilver_disable_defer, or if the
 	 * current scan progress is below zfs_resilver_defer_percent.
 	 */
 	if (dsl_scan_restarting(scn, tx) || restart_early) {
 		setup_sync_arg_t setup_sync_arg = {
 			.func = POOL_SCAN_SCRUB,
 			.txgstart = 0,
 			.txgend = 0,
 		};
 		/* End the current scan as not completed, then set up anew. */
 		dsl_scan_done(scn, B_FALSE, tx);
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 			setup_sync_arg.func = POOL_SCAN_RESILVER;
 		zfs_dbgmsg("restarting scan func=%u on %s txg=%llu early=%d",
 		    setup_sync_arg.func, dp->dp_spa->spa_name,
 		    (longlong_t)tx->tx_txg, restart_early);
 		dsl_scan_setup_sync(&setup_sync_arg, tx);
 	}
 
 	/*
 	 * If the spa is shutting down, then stop scanning. This will
 	 * ensure that the scan does not dirty any new data during the
 	 * shutdown phase.
 	 */
 	if (spa_shutting_down(spa))
 		return;
 
 	/*
 	 * If the scan is inactive due to a stalled async destroy, try again.
 	 */
 	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
 		return;
 
 	/* reset scan statistics */
 	scn->scn_visited_this_txg = 0;
 	scn->scn_dedup_frees_this_txg = 0;
 	scn->scn_holes_this_txg = 0;
 	scn->scn_lt_min_this_txg = 0;
 	scn->scn_gt_max_this_txg = 0;
 	scn->scn_ddt_contained_this_txg = 0;
 	scn->scn_objsets_visited_this_txg = 0;
 	scn->scn_avg_seg_size_this_txg = 0;
 	scn->scn_segs_this_txg = 0;
 	scn->scn_avg_zio_size_this_txg = 0;
 	scn->scn_zios_this_txg = 0;
 	scn->scn_suspending = B_FALSE;
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
 
 	/*
 	 * First process the async destroys.  If we suspend, don't do
 	 * any scrubbing or resilvering.  This ensures that there are no
 	 * async destroys while we are scanning, so the scan code doesn't
 	 * have to worry about traversing it.  It is also faster to free the
 	 * blocks than to scrub them.
 	 */
 	err = dsl_process_async_destroys(dp, tx);
 	if (err != 0)
 		return;
 
 	/* Nothing more to do unless a scan is running and not paused. */
 	if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
 		return;
 
 	/*
 	 * Wait a few txgs after importing to begin scanning so that
 	 * we can get the pool imported quickly.
 	 */
 	if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
 		return;
 
 	/*
 	 * zfs_scan_suspend_progress can be set to disable scan progress.
 	 * We don't want to spin the txg_sync thread, so we add a delay
 	 * here to simulate the time spent doing a scan. This is mostly
 	 * useful for testing and debugging.
 	 */
 	if (zfs_scan_suspend_progress) {
 		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		uint_t mintime = (scn->scn_phys.scn_func ==
 		    POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms :
 		    zfs_scrub_min_time_ms;
 
 		while (zfs_scan_suspend_progress &&
 		    !txg_sync_waiting(scn->scn_dp) &&
 		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
 		    NSEC2MSEC(scan_time_ns) < mintime) {
 			delay(hz);
 			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
 		}
 		return;
 	}
 
 	/*
 	 * Disabled by default, set zfs_scan_report_txgs to report
 	 * average performance over the last zfs_scan_report_txgs TXGs.
 	 */
 	if (zfs_scan_report_txgs != 0 &&
 	    tx->tx_txg % zfs_scan_report_txgs == 0) {
 		scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
 		spa_scan_stat_init(spa);
 	}
 
 	/*
 	 * It is possible to switch from unsorted to sorted at any time,
 	 * but afterwards the scan will remain sorted unless reloaded from
 	 * a checkpoint after a reboot.
 	 */
 	if (!zfs_scan_legacy) {
 		scn->scn_is_sorted = B_TRUE;
 		if (scn->scn_last_checkpoint == 0)
 			scn->scn_last_checkpoint = ddi_get_lbolt();
 	}
 
 	/*
 	 * For sorted scans, determine what kind of work we will be doing
 	 * this txg based on our memory limitations and whether or not we
 	 * need to perform a checkpoint.
 	 */
 	if (scn->scn_is_sorted) {
 		/*
 		 * If we are over our checkpoint interval, set scn_clearing
 		 * so that we can begin checkpointing immediately. The
 		 * checkpoint allows us to save a consistent bookmark
 		 * representing how much data we have scrubbed so far.
 		 * Otherwise, use the memory limit to determine if we should
 		 * scan for metadata or start issue scrub IOs. We accumulate
 		 * metadata until we hit our hard memory limit at which point
 		 * we issue scrub IOs until we are at our soft memory limit.
 		 */
 		if (scn->scn_checkpointing ||
 		    ddi_get_lbolt() - scn->scn_last_checkpoint >
 		    SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
 			if (!scn->scn_checkpointing)
 				zfs_dbgmsg("begin scan checkpoint for %s",
 				    spa->spa_name);
 
 			scn->scn_checkpointing = B_TRUE;
 			scn->scn_clearing = B_TRUE;
 		} else {
 			/* Only log when the clearing state actually flips. */
 			boolean_t should_clear = dsl_scan_should_clear(scn);
 			if (should_clear && !scn->scn_clearing) {
 				zfs_dbgmsg("begin scan clearing for %s",
 				    spa->spa_name);
 				scn->scn_clearing = B_TRUE;
 			} else if (!should_clear && scn->scn_clearing) {
 				zfs_dbgmsg("finish scan clearing for %s",
 				    spa->spa_name);
 				scn->scn_clearing = B_FALSE;
 			}
 		}
 	} else {
 		ASSERT0(scn->scn_checkpointing);
 		ASSERT0(scn->scn_clearing);
 	}
 
 	if (!scn->scn_clearing && scn->scn_done_txg == 0) {
 		/* Need to scan metadata for more blocks to scrub */
 		dsl_scan_phys_t *scnp = &scn->scn_phys;
 		taskqid_t prefetch_tqid;
 
 		/*
 		 * Calculate the max number of in-flight bytes for pool-wide
 		 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max).
 		 * Limits for the issuing phase are done per top-level vdev and
 		 * are handled separately.
 		 */
 		scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20,
 		    zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa)));
 
 		/* Log whichever bookmark (DDT or regular) is being used. */
 		if (scnp->scn_ddt_bookmark.ddb_class <=
 		    scnp->scn_ddt_class_max) {
 			ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
 			zfs_dbgmsg("doing scan sync for %s txg %llu; "
 			    "ddt bm=%llu/%llu/%llu/%llx",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
 			    (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
 		} else {
 			zfs_dbgmsg("doing scan sync for %s txg %llu; "
 			    "bm=%llu/%llu/%llu/%llu",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg,
 			    (longlong_t)scnp->scn_bookmark.zb_objset,
 			    (longlong_t)scnp->scn_bookmark.zb_object,
 			    (longlong_t)scnp->scn_bookmark.zb_level,
 			    (longlong_t)scnp->scn_bookmark.zb_blkid);
 		}
 
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);
 
 		/* Run the prefetch task alongside the traversal below. */
 		scn->scn_prefetch_stop = B_FALSE;
 		prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
 		    dsl_scan_prefetch_thread, scn, TQ_SLEEP);
 		ASSERT(prefetch_tqid != TASKQID_INVALID);
 
 		dsl_pool_config_enter(dp, FTAG);
 		dsl_scan_visit(scn, tx);
 		dsl_pool_config_exit(dp, FTAG);
 
 		/* Tell the prefetch task to stop, then wait for it. */
 		mutex_enter(&dp->dp_spa->spa_scrub_lock);
 		scn->scn_prefetch_stop = B_TRUE;
 		cv_broadcast(&spa->spa_scrub_io_cv);
 		mutex_exit(&dp->dp_spa->spa_scrub_lock);
 
 		taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		zfs_dbgmsg("scan visited %llu blocks of %s in %llums "
 		    "(%llu os's, %llu holes, %llu < mintxg, "
 		    "%llu in ddt, %llu > maxtxg)",
 		    (longlong_t)scn->scn_visited_this_txg,
 		    spa->spa_name,
 		    (longlong_t)NSEC2MSEC(gethrtime() -
 		    scn->scn_sync_start_time),
 		    (longlong_t)scn->scn_objsets_visited_this_txg,
 		    (longlong_t)scn->scn_holes_this_txg,
 		    (longlong_t)scn->scn_lt_min_this_txg,
 		    (longlong_t)scn->scn_ddt_contained_this_txg,
 		    (longlong_t)scn->scn_gt_max_this_txg);
 
 		if (!scn->scn_suspending) {
 			ASSERT0(avl_numnodes(&scn->scn_queue));
 			/*
 			 * The traversal was not suspended.  Recording
 			 * scn_done_txg lets the last branch below run once
 			 * tx_txg reaches it.
 			 */
 			scn->scn_done_txg = tx->tx_txg + 1;
 			if (scn->scn_is_sorted) {
 				scn->scn_checkpointing = B_TRUE;
 				scn->scn_clearing = B_TRUE;
 				scn->scn_issued_before_pass +=
 				    spa->spa_scan_pass_issued;
 				spa_scan_stat_init(spa);
 			}
 			zfs_dbgmsg("scan complete for %s txg %llu",
 			    spa->spa_name,
 			    (longlong_t)tx->tx_txg);
 		}
 	} else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
 		ASSERT(scn->scn_clearing);
 
 		/* need to issue scrubbing IOs from per-vdev queues */
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);
 		scan_io_queues_run(scn);
 		(void) zio_wait(scn->scn_zio_root);
 		scn->scn_zio_root = NULL;
 
 		/* calculate and dprintf the current memory usage */
 		(void) dsl_scan_should_clear(scn);
 		dsl_scan_update_stats(scn);
 
 		zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) "
 		    "in %llums (avg_block_size = %llu, avg_seg_size = %llu)",
 		    (longlong_t)scn->scn_zios_this_txg,
 		    spa->spa_name,
 		    (longlong_t)scn->scn_segs_this_txg,
 		    (longlong_t)NSEC2MSEC(gethrtime() -
 		    scn->scn_sync_start_time),
 		    (longlong_t)scn->scn_avg_zio_size_this_txg,
 		    (longlong_t)scn->scn_avg_seg_size_this_txg);
 	} else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
 		/* Finished with everything. Mark the scrub as complete */
 		zfs_dbgmsg("scan issuing complete txg %llu for %s",
 		    (longlong_t)tx->tx_txg,
 		    spa->spa_name);
 		ASSERT3U(scn->scn_done_txg, !=, 0);
 		ASSERT0(spa->spa_scrub_inflight);
 		ASSERT0(scn->scn_queues_pending);
 		dsl_scan_done(scn, B_TRUE, tx);
 		/* Completion is the only case that requests SYNC_MANDATORY. */
 		sync_type = SYNC_MANDATORY;
 	}
 
 	dsl_scan_sync_state(scn, tx, sync_type);
 }
 
 /*
  * Add the allocated size of bp to the pool's count of bytes issued in this
  * scan pass.  With "all" set every DVA is counted, otherwise only the
  * first one.
  */
 static void
 count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
 {
 	uint64_t asize;
 
 	/*
 	 * Embedded bp's are not counted: the work of scanning them was
 	 * already done when the containing block was scanned.
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * Sequential scrubs create a zio for each DVA of the bp. Each of
 	 * these includes all DVAs for repair purposes, but the zio code
 	 * only tries the first one unless there is an issue, so for those
 	 * IOs only the first DVA should be counted.
 	 */
 	if (all)
 		asize = BP_GET_ASIZE(bp);
 	else
 		asize = DVA_GET_ASIZE(&bp->blk_dva[0]);
 
 	atomic_add_64(&spa->spa_scan_pass_issued, asize);
 }
 
 /*
  * Add the allocated size of bp to the scan's count of skipped bytes.
  * Embedded bp's are ignored.  With "all" set every DVA is counted,
  * otherwise only the first one.
  */
 static void
 count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
 {
 	uint64_t asize;
 
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	if (all)
 		asize = BP_GET_ASIZE(bp);
 	else
 		asize = DVA_GET_ASIZE(&bp->blk_dva[0]);
 
 	atomic_add_64(&scn->scn_phys.scn_skipped, asize);
 }
 
 /*
  * Fold bp into the block statistics table zab.  Each bp is recorded in
  * four slots: its own level or the DN_MAX_LEVELS row, crossed with its own
  * type or the DMU_OT_TOTAL column.
  */
 static void
 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
 	/*
 	 * zab is NULL when we resume after a reboot; incomplete stats are
 	 * not recorded in that case.
 	 */
 	if (zab == NULL)
 		return;
 
 	for (int slot = 0; slot < 4; slot++) {
 		int level = (slot < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
 		int type = (slot & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
 
 		/* New-style object types share the DMU_OT_OTHER column. */
 		if (type & DMU_OT_NEWTYPE)
 			type = DMU_OT_OTHER;
 
 		zfs_blkstat_t *stat = &zab->zab_type[level][type];
 
 		stat->zb_count++;
 		stat->zb_asize += BP_GET_ASIZE(bp);
 		stat->zb_lsize += BP_GET_LSIZE(bp);
 		stat->zb_psize += BP_GET_PSIZE(bp);
 		stat->zb_gangs += BP_COUNT_GANG(bp);
 
 		/* Note ditto copies that ended up on the same vdev. */
 		int ndvas = BP_GET_NDVAS(bp);
 
 		if (ndvas == 2) {
 			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1]))
 				stat->zb_ditto_2_of_2_samevdev++;
 		} else if (ndvas == 3) {
 			/* Number of DVA pairs sharing a vdev: 0, 1 or 3. */
 			int same_pairs = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[1])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2])) +
 			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
 			    DVA_GET_VDEV(&bp->blk_dva[2]));
 
 			if (same_pairs == 1)
 				stat->zb_ditto_2_of_3_samevdev++;
 			else if (same_pairs == 3)
 				stat->zb_ditto_3_of_3_samevdev++;
 		}
 	}
 }
 
 /*
  * Insert sio into the queue's by-address tree and extent range tree.  If
  * an equal sio is already queued the new one is freed instead.  The
  * caller must hold the vdev's vdev_scan_io_queue_lock.
  */
 static void
 scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	avl_index_t where;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* A queue going from empty to non-empty becomes pending. */
 	if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
 		atomic_add_64(&scn->scn_queues_pending, 1);
 
 	if (avl_find(&queue->q_sios_by_addr, sio, &where) == NULL) {
 		avl_insert(&queue->q_sios_by_addr, sio, where);
 		queue->q_sio_memused += SIO_GET_MUSED(sio);
 		range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio),
 		    SIO_GET_ASIZE(sio));
 	} else {
 		/* block is already scheduled for reading */
 		sio_free(sio);
 	}
 }
 
 /*
  * Build a scan_io_t from the information gathered by the metadata scan
  * (block pointer, DVA index, zio flags and bookmark) and put it into the
  * scan sorting queue.  The I/O must already be suitable for us to
  * process; dsl_scan_enqueue() is responsible for that.
  */
 static void
 scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
     int zio_flags, const zbookmark_phys_t *zb)
 {
 	scan_io_t *sio;
 
 	sio = sio_alloc(BP_GET_NDVAS(bp));
 
 	ASSERT0(BP_IS_GANG(bp));
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* Fill in the sio from the bp, then attach flags and bookmark. */
 	bp2sio(bp, sio, dva_i);
 	sio->sio_zb = *zb;
 	sio->sio_flags = zio_flags;
 
 	queue->q_last_ext_addr = -1;
 	scan_io_queue_insert_impl(queue, sio);
 }
 
 /*
  * Take one I/O found by the metadata traversal and either execute it right
  * away (unsorted scans and gang blocks) or queue it, once per DVA, on the
  * scan queue of the top-level vdev that DVA lives on.
  */
 static void
 dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb)
 {
 	spa_t *spa = dp->dp_spa;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	/*
 	 * Gang blocks are hard to issue sequentially, so they are issued
 	 * immediately rather than queued; unsorted scans never queue.
 	 */
 	if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
 		scan_exec_io(dp, bp, zio_flags, zb, NULL);
 		return;
 	}
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		const dva_t *dva = &bp->blk_dva[d];
 		vdev_t *top = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 		ASSERT(top != NULL);
 
 		mutex_enter(&top->vdev_scan_io_queue_lock);
 		/* The per-vdev queue is created on first use. */
 		if (top->vdev_scan_io_queue == NULL)
 			top->vdev_scan_io_queue = scan_io_queue_create(top);
 		ASSERT(dp->dp_scan != NULL);
 		scan_io_queue_insert(top->vdev_scan_io_queue, bp,
 		    d, zio_flags, zb);
 		mutex_exit(&top->vdev_scan_io_queue_lock);
 	}
 }
 
 /*
  * Per-block callback for scrub and resilver scans. Counts the block,
  * skips it when its birth txg lies outside the scan's txg window, adds
  * each DVA's allocated size to the progress counters, and finally either
  * enqueues the block for I/O or counts it as skipped. Always returns 0.
  */
 static int
 dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
 	uint64_t birth = BP_GET_BIRTH(bp);
 	size_t psize = BP_GET_PSIZE(bp);
 	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
 	boolean_t is_scrub, needs_io;
 
 	count_block(dp->dp_blkstats, bp);
 
 	/* Only blocks born strictly inside (min_txg, max_txg) are scanned. */
 	if (birth <= scn->scn_phys.scn_min_txg ||
 	    birth >= scn->scn_phys.scn_max_txg) {
 		count_block_skipped(scn, bp, B_TRUE);
 		return (0);
 	}
 
 	/* Embedded BP's have phys_birth==0, so we reject them above. */
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
 	is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
 	if (is_scrub) {
 		zio_flags |= ZIO_FLAG_SCRUB;
 	} else {
 		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
 		zio_flags |= ZIO_FLAG_RESILVER;
 	}
 	/* A scrub always needs I/O; a resilver decides per DVA below. */
 	needs_io = is_scrub;
 
 	/* If it's an intent log block, failure is expected. */
 	if (zb->zb_level == ZB_ZIL_LEVEL)
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
 		const dva_t *dva = &bp->blk_dva[d];
 		uint64_t asize = DVA_GET_ASIZE(dva);
 
 		/*
 		 * Keep track of how much data we've examined so that
 		 * zpool(8) status can make useful progress reports.
 		 */
 		scn->scn_phys.scn_examined += asize;
 		spa->spa_scan_pass_exam += asize;
 
 		/* Once any DVA needs I/O there is nothing left to decide. */
 		if (needs_io)
 			continue;
 
 		/* if it's a resilver, this may not be in the target range */
 		needs_io = dsl_scan_need_resilver(spa, dva, psize, birth);
 	}
 
 	if (!needs_io || zfs_no_scrub_io)
 		count_block_skipped(scn, bp, B_TRUE);
 	else
 		dsl_scan_enqueue(dp, bp, zio_flags, zb);
 
 	/* do not relocate this block */
 	return (0);
 }
 
 /*
  * Completion callback for the reads issued by scan_exec_io(). Frees the
  * data buffer, gives the block's psize back to the in-flight counter it
  * was charged against (the pool-wide one when io_private is NULL, the
  * queue's otherwise) and wakes any waiters. A failed read then bumps
  * either the error-scrub or the regular scan error counter.
  */
 static void
 dsl_scan_scrub_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	blkptr_t *bp = zio->io_bp;
 	dsl_scan_io_queue_t *queue = zio->io_private;
 
 	abd_free(zio->io_abd);
 
 	if (queue != NULL) {
 		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
 		mutex_enter(q_lock);
 		ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
 		queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
 		cv_broadcast(&queue->q_zio_cv);
 		mutex_exit(q_lock);
 	} else {
 		mutex_enter(&spa->spa_scrub_lock);
 		ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
 		spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
 		cv_broadcast(&spa->spa_scrub_io_cv);
 		mutex_exit(&spa->spa_scrub_lock);
 	}
 
 	if (zio->io_error == 0)
 		return;
 
 	/* A checksum error on a speculative read is not counted. */
 	if (zio->io_error == ECKSUM && (zio->io_flags & ZIO_FLAG_SPECULATIVE))
 		return;
 
 	if (dsl_errorscrubbing(dp) && !dsl_errorscrub_is_paused(dp->dp_scan))
 		atomic_inc_64(&dp->dp_scan->errorscrub_phys.dep_errors);
 	else
 		atomic_inc_64(&dp->dp_scan->scn_phys.scn_errors);
 }
 
 /*
  * Issue the read for a scanned block, whatever kind of block it is.
  * When `queue' is NULL the read is charged against the pool-wide
  * spa_scrub_inflight counter (legacy global limiting); otherwise it is
  * charged against the queue's q_inflight_bytes (per top-level vdev
  * limiting). In both cases the caller blocks until the relevant counter
  * is below its maximum. dsl_scan_scrub_done() runs on completion.
  */
 static void
 scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
     const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
 {
 	spa_t *spa = dp->dp_spa;
 	dsl_scan_t *scn = dp->dp_scan;
 	size_t psize = BP_GET_PSIZE(bp);
 	abd_t *buf = abd_alloc_for_io(psize, B_FALSE);
 	zio_t *parent;
 
 	if (queue != NULL) {
 		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
 		ASSERT3U(queue->q_maxinflight_bytes, >, 0);
 		mutex_enter(q_lock);
 		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
 			cv_wait(&queue->q_zio_cv, q_lock);
 		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
 		/* The parent zio is sampled while the queue lock is held. */
 		parent = queue->q_zio;
 		mutex_exit(q_lock);
 	} else {
 		ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
 		mutex_enter(&spa->spa_scrub_lock);
 		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 		spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
 		mutex_exit(&spa->spa_scrub_lock);
 		parent = scn->scn_zio_root;
 	}
 
 	ASSERT(parent != NULL);
 	count_block_issued(spa, bp, queue == NULL);
 	zio_nowait(zio_read(parent, spa, bp, buf, psize, dsl_scan_scrub_done,
 	    queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
 }
 
 /*
  * This is the primary extent sorting algorithm. We balance two parameters:
  * 1) how many bytes of I/O are in an extent
  * 2) how well the extent is filled with I/O (as a fraction of its total size)
  * Since we allow extents to have gaps between their constituent I/Os, it's
  * possible to have a fairly large extent that contains the same amount of
  * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
  * The algorithm sorts based on a score calculated from the extent's size,
  * the relative fill volume (in %) and a "fill weight" parameter that controls
  * the split between whether we prefer larger extents or more well populated
  * extents:
  *
  * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
  *
  * Example:
  * 1) assume extsz = 64 MiB
  * 2) assume fill = 32 MiB (extent is half full)
  * 3) assume fill_weight = 3
  * 4)	SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
  *	SCORE = 32M + (50 * 3 * 32M) / 100
  *	SCORE = 32M + (4800M / 100)
  *	SCORE = 32M + 48M
  *	         ^     ^
  *	         |     +--- final total relative fill-based score
  *	         +--------- final total fill-based score
  *	SCORE = 80M
  *
  * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
  * extents that are more completely filled (in a 3:2 ratio) vs just larger.
  * Note that as an optimization, we replace multiplication and division by
  * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
  *
  * Since we do not care if one extent is only a few percent better than
  * another, we compress the score into 6 bits via the binary logarithm
  * (AKA highbit64()) and store it in the high bits of the offset, which are
  * otherwise unused due to ashift.  This reduces q_exts_by_size B-tree
  * elements to only 64 bits, so they can be compared with a single operation.
  * It also makes scrubs more sequential and reduces the chance that a minor
  * change to an extent moves it within the B-tree.
  */
 /*
  * Three-way comparison of two 64-bit keys, as stored in the
  * q_exts_by_size B-tree (see ext_size_value() for the key layout).
  */
 __attribute__((always_inline)) inline
 static int
 ext_size_compare(const void *x, const void *y)
 {
 	const uint64_t *lhs = x;
 	const uint64_t *rhs = y;
 
 	return (TREE_CMP(*lhs, *rhs));
 }
 
 /*
  * Generate ext_size_find_in_buf(), the uint64_t search routine built on
  * ext_size_compare() that ext_size_create() passes to zfs_btree_create().
  */
 ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
     ext_size_compare)
 
 /*
  * rtop_create callback: initialize the B-tree passed in `arg' to hold
  * uint64_t keys ordered by ext_size_compare(). `rt' is unused.
  */
 static void
 ext_size_create(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 
 	zfs_btree_create((zfs_btree_t *)arg, ext_size_compare,
 	    ext_size_find_in_buf, sizeof (uint64_t));
 }
 
 /*
  * rtop_destroy callback: tear down the B-tree passed in `arg', which
  * is asserted to be empty already. `rt' is unused.
  */
 static void
 ext_size_destroy(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	zfs_btree_t *by_size = arg;
 
 	ASSERT0(zfs_btree_numnodes(by_size));
 	zfs_btree_destroy(by_size);
 }
 
 /*
  * Compute the 64-bit q_exts_by_size key for a gap segment. The score is
  * fill + ((fill * 128 / size) * fill_weight * fill) / 128; the key holds
  * (64 - highbit64(score)) in the bits above bit 56 and rs_start below,
  * so a larger score produces a smaller key. The assertion on rt_shift
  * documents that the top byte of rs_start is expected to be free.
  */
 static uint64_t
 ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg)
 {
 	(void) rt;
 	const uint64_t fill = rsg->rs_fill;
 	const uint64_t span = rsg->rs_end - rsg->rs_start;
 	/* Fill ratio scaled to 0..128 rather than 0..100. */
 	const uint64_t fill_ratio = (fill << 7) / span;
 	const uint64_t score = fill + ((fill_ratio * fill_weight * fill) >> 7);
 
 	ASSERT3U(rt->rt_shift, >=, 8);
 	return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
 }
 
 /*
  * rtop_add callback: insert the key computed by ext_size_value() for
  * segment `rs' into the B-tree passed in `arg'.
  */
 static void
 ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	zfs_btree_t *by_size = arg;
 	uint64_t key;
 
 	ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
 	key = ext_size_value(rt, (range_seg_gap_t *)rs);
 	zfs_btree_add(by_size, &key);
 }
 
 /*
  * rtop_remove callback: remove the key computed by ext_size_value() for
  * segment `rs' from the B-tree passed in `arg'.
  */
 static void
 ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	zfs_btree_t *by_size = arg;
 	uint64_t key;
 
 	ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
 	key = ext_size_value(rt, (range_seg_gap_t *)rs);
 	zfs_btree_remove(by_size, &key);
 }
 
 /*
  * rtop_vacate callback: empty the B-tree passed in `arg', destroy it,
  * and then re-create it via ext_size_create().
  */
 static void
 ext_size_vacate(range_tree_t *rt, void *arg)
 {
 	zfs_btree_t *by_size = arg;
 
 	zfs_btree_clear(by_size);
 	zfs_btree_destroy(by_size);
 
 	ext_size_create(rt, by_size);
 }
 
 /*
  * Range tree callbacks that maintain the size-sorted B-tree.
  * scan_io_queue_create() installs them on q_exts_by_addr with
  * &q_exts_by_size as the callback argument.
  */
 static const range_tree_ops_t ext_size_ops = {
 	.rtop_create = ext_size_create,
 	.rtop_destroy = ext_size_destroy,
 	.rtop_add = ext_size_add,
 	.rtop_remove = ext_size_remove,
 	.rtop_vacate = ext_size_vacate
 };
 
 /*
  * AVL comparator for q_sios_by_addr: orders scan_io_t's by the offset
  * returned by SIO_GET_OFFSET(), lowest first.
  */
 static int
 sio_addr_compare(const void *x, const void *y)
 {
 	const scan_io_t *lhs = x;
 	const scan_io_t *rhs = y;
 
 	return (TREE_CMP(SIO_GET_OFFSET(lhs), SIO_GET_OFFSET(rhs)));
 }
 
 /*
  * Allocate and initialize the scan I/O queue for top-level vdev `vd'.
  * Queues are created on demand, the first time one is needed.
  */
 static dsl_scan_io_queue_t *
 scan_io_queue_create(vdev_t *vd)
 {
 	dsl_scan_io_queue_t *queue = kmem_zalloc(sizeof (*queue), KM_SLEEP);
 
 	queue->q_vd = vd;
 	queue->q_scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 	queue->q_sio_memused = 0;
 	queue->q_last_ext_addr = -1;
 	cv_init(&queue->q_zio_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Extents by address; ext_size_ops mirrors them in q_exts_by_size. */
 	queue->q_exts_by_addr = range_tree_create_gap(&ext_size_ops,
 	    RANGE_SEG_GAP, &queue->q_exts_by_size, 0, vd->vdev_ashift,
 	    zfs_scan_max_ext_gap);
 
 	/* Individual scan_io_t's, ordered by sio_addr_compare(). */
 	avl_create(&queue->q_sios_by_addr, sio_addr_compare,
 	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
 
 	return (queue);
 }
 
 /*
  * Free a scan queue together with every scan_io_t and extent it still
  * holds. Nothing that is pending gets issued; it is simply discarded.
  * The caller must hold the owning vdev's vdev_scan_io_queue_lock.
  */
 void
 dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	void *cookie = NULL;
 
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
 	/* A non-empty queue was counted in scn_queues_pending; undo that. */
 	if (!avl_is_empty(&queue->q_sios_by_addr))
 		atomic_add_64(&scn->scn_queues_pending, -1);
 
 	for (;;) {
 		scan_io_t *sio =
 		    avl_destroy_nodes(&queue->q_sios_by_addr, &cookie);
 
 		if (sio == NULL)
 			break;
 
 		ASSERT(range_tree_contains(queue->q_exts_by_addr,
 		    SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 		sio_free(sio);
 	}
 	ASSERT0(queue->q_sio_memused);
 
 	range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
 	range_tree_destroy(queue->q_exts_by_addr);
 	avl_destroy(&queue->q_sios_by_addr);
 	cv_destroy(&queue->q_zio_cv);
 
 	kmem_free(queue, sizeof (*queue));
 }
 
 /*
  * Move the dsl_scan_io_queue_t (if any) from `svd' to `tvd', which must
  * not already have one, and repoint the queue at its new vdev. This is
  * called on behalf of vdev_top_transfer when creating or destroying
  * a mirror vdev due to zpool attach/detach. Both queue locks are held
  * for the duration, source first.
  */
 void
 dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
 {
 	dsl_scan_io_queue_t *moved;
 
 	mutex_enter(&svd->vdev_scan_io_queue_lock);
 	mutex_enter(&tvd->vdev_scan_io_queue_lock);
 
 	VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
 
 	moved = svd->vdev_scan_io_queue;
 	tvd->vdev_scan_io_queue = moved;
 	svd->vdev_scan_io_queue = NULL;
 	if (moved != NULL)
 		moved->q_vd = tvd;
 
 	mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	mutex_exit(&svd->vdev_scan_io_queue_lock);
 }
 
 /*
  * Destroy the scan queue of every child of the root vdev, taking each
  * child's vdev_scan_io_queue_lock around the teardown.
  */
 static void
 scan_io_queues_destroy(dsl_scan_t *scn)
 {
 	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		kmutex_t *q_lock = &tvd->vdev_scan_io_queue_lock;
 
 		mutex_enter(q_lock);
 		if (tvd->vdev_scan_io_queue != NULL) {
 			dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
 			tvd->vdev_scan_io_queue = NULL;
 		}
 		mutex_exit(q_lock);
 	}
 }
 
 /*
  * Drop the queued scan I/O, if any, that corresponds to DVA `dva_i' of
  * the block `bp' being freed. If the top-level vdev has no scan queue,
  * or the queue holds no scan_io_t at that address, nothing is done.
  */
 static void
 dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dsl_scan_t *scn = dp->dp_scan;
 	vdev_t *vdev;
 	kmutex_t *q_lock;
 	dsl_scan_io_queue_t *queue;
 	scan_io_t *srch_sio, *sio;
 	avl_index_t idx;
 	uint64_t start, size;
 
 	vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
 	ASSERT(vdev != NULL);
 	q_lock = &vdev->vdev_scan_io_queue_lock;
 
 	/*
 	 * Sample vdev_scan_io_queue only once the queue lock is held, the
 	 * same way dsl_scan_enqueue() and scan_io_queues_destroy() do, so
 	 * that we never act on a pointer read before a concurrent create,
 	 * transfer or destroy of the queue.
 	 */
 	mutex_enter(q_lock);
 	queue = vdev->vdev_scan_io_queue;
 	if (queue == NULL) {
 		mutex_exit(q_lock);
 		return;
 	}
 
 	/* Build a throwaway sio that serves only as the AVL search key. */
 	srch_sio = sio_alloc(BP_GET_NDVAS(bp));
 	bp2sio(bp, srch_sio, dva_i);
 	start = SIO_GET_OFFSET(srch_sio);
 	size = SIO_GET_ASIZE(srch_sio);
 
 	/*
 	 * We can find the zio in two states:
 	 * 1) Cold, just sitting in the queue of zio's to be issued at
 	 *	some point in the future. In this case, all we do is
 	 *	remove the zio from the q_sios_by_addr tree, decrement
 	 *	its data volume from the containing range_seg_t and
 	 *	resort the q_exts_by_size tree to reflect that the
 	 *	range_seg_t has lost some of its 'fill'. We don't shorten
 	 *	the range_seg_t - this is usually rare enough not to be
 	 *	worth the extra hassle of trying keep track of precise
 	 *	extent boundaries.
 	 * 2) Not in the queue (e.g. already gathered for issue, or never
 	 *	queued). Nothing is done for this case here.
 	 *	NOTE(review): earlier wording said the caller is blocked
 	 *	until in-flight zio's finish, but no code in this function
 	 *	waits - confirm whether that is handled elsewhere.
 	 */
 	sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
 	sio_free(srch_sio);
 
 	if (sio != NULL) {
 		blkptr_t tmpbp;
 
 		/* Got it while it was cold in the queue */
 		ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
 		ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
 		avl_remove(&queue->q_sios_by_addr, sio);
 		/* The queue just drained, so it no longer counts as pending. */
 		if (avl_is_empty(&queue->q_sios_by_addr))
 			atomic_add_64(&scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 
 		ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
 		range_tree_remove_fill(queue->q_exts_by_addr, start, size);
 
 		/* count the block as though we skipped it */
 		sio2bp(sio, &tmpbp);
 		count_block_skipped(scn, &tmpbp, B_FALSE);
 
 		sio_free(sio);
 	}
 	mutex_exit(q_lock);
 }
 
 /*
  * Callback invoked when a zio_free() zio is executing. While a scan is
  * running, every DVA of the freed block is checked against the scan
  * queues (see dsl_scan_freed_dva()), so that space is not deallocated,
  * reallocated and rewritten while we still have it queued for processing.
  */
 void
 dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
 {
 	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT(scn != NULL);
 
 	if (dsl_scan_is_running(scn)) {
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++)
 			dsl_scan_freed_dva(spa, bp, d);
 	}
 }
 
 /*
  * Decide whether `vd' requires a resilver to be started or restarted.
  *
  * Nothing happens unless vdev_resilver_needed() reports that it does.
  * If no resilver is running, one is requested. If one is running, it
  * only needs restarting when the DTL's max txg exceeds the current
  * scan's max txg; otherwise the vdev has missed no new data since the
  * resilver began. A needed restart is deferred when the resilver_defer
  * feature is enabled, and requested immediately otherwise.
  */
 void
 dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
 {
 	spa_t *spa = dp->dp_spa;
 	uint64_t dtl_min, dtl_max;
 
 	if (!vdev_resilver_needed(vd, &dtl_min, &dtl_max))
 		return;
 
 	if (dsl_scan_resilvering(dp)) {
 		/* The running resilver already covers this DTL range. */
 		if (dtl_max <= dp->dp_scan->scn_phys.scn_max_txg)
 			return;
 
 		/* restart is needed, check if it can be deferred */
 		if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
 			vdev_defer_resilver(vd);
 			return;
 		}
 	}
 
 	spa_async_request(spa, SPA_ASYNC_RESILVER);
 }
 
 /*
  * Module parameter registrations for the scan/scrub/resilver tunables.
  * Each entry gives the variable name (with its zfs_ prefix split off),
  * its type, ZMOD_RW permissions and a description string.
  * NOTE(review): how these expand is platform-specific - see the
  * ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW,
 	"Max bytes in flight per leaf vdev for scrubs and resilvers");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to scrub per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to obsolete per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to free per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW,
 	"Min millisecs to resilver per txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
 	"Set to prevent scans from progressing");
 
 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
 	"Set to disable scrub I/O");
 
 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
 	"Set to disable scrub prefetching");
 
 ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW,
 	"Max number of blocks freed in one txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW,
 	"Max number of dedup blocks freed in one txg");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
 	"Enable processing of the free_bpobj");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW,
 	"Enable block statistics calculation during scrub");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW,
 	"Fraction of RAM for scan hard limit");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW,
 	"IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
 	"Scrub using legacy non-sequential method");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW,
 	"Scan progress on-disk checkpointing interval");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW,
 	"Max gap in bytes between sequential scrub / resilver I/Os");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW,
 	"Fraction of hard limit used as soft limit");
 
 /*
  * NOTE(review): this description is worded the same as a lock-contention
  * tunable and does not mention a memory limit - confirm it is intended.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
 	"Tunable to attempt to reduce lock contention");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW,
 	"Tunable to adjust bias towards more filled segments during scans");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
 	"Tunable to report resilver performance over the last N txgs");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 	"Process all resilvers immediately");
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_defer_percent, UINT, ZMOD_RW,
 	"Issued IO percent complete after which resilvers are deferred");
 
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW,
 	"Error blocks to be scrubbed in one txg");
-/* END CSTYLED */
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 3bd6e93e93a4..7affbfac9dc7 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1,6287 +1,6283 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/zap.h>
 #include <sys/btree.h>
 
 /*
  * Non-zero when `flags' has METASLAB_GANG_CHILD or METASLAB_GANG_HEADER
  * set, i.e. the allocation is for part of a gang block.
  */
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
 /*
  * Metaslab granularity, in bytes. This is roughly similar to what would be
  * referred to as the "stripe size" in traditional RAID arrays. In normal
  * operation, we will try to write this amount of data to each disk before
  * moving on to the next top-level vdev.
  */
 static uint64_t metaslab_aliquot = 1024 * 1024;
 
 /*
  * For testing, make some blocks above a certain size be gang blocks.
  */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
 /*
  * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
  */
 uint_t metaslab_force_ganging_pct = 3;
 
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
  * transaction group. Thus, we benefit from having a small space map
  * block size since it allows us to issue more I/O operations scattered
  * around the disk. So a sane default for the space map block size
  * is 8~16K.
  */
 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
 
 /*
  * When the log space map feature is enabled, we accumulate a lot of
  * changes per metaslab that are flushed once in a while so we benefit
  * from a bigger block size like 128K for the metaslab space maps.
  */
 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 uint_t zfs_condense_pct = 200;
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  * same number of blocks after condensing. Since the goal of condensing is to
  * reduce the number of IOPs required to read the space map, we only want to
  * condense when we can be sure we will reduce the number of blocks used by the
  * space map. Unfortunately, we cannot precisely compute whether or not this is
  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  * we apply the following heuristic: do not condense a spacemap unless the
  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  * blocks.
  */
 static const int zfs_metaslab_condense_block_threshold = 4;
 
 /*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * free space. Metaslab groups that have more free space than
  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  * a metaslab group's free space is less than or equal to the
  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  * groups are allowed to accept allocations. Gang blocks are always
  * eligible to allocate on any metaslab group. The default value of 0 means
  * no metaslab group will be excluded based on this criterion.
  */
 static uint_t zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab groups are considered eligible for allocations if their
  * fragmentation metric (measured as a percentage) is less than or
  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
  * exceeds this threshold then it will be skipped unless all metaslab
  * groups within the metaslab class have also crossed this threshold.
  *
  * This tunable was introduced to avoid edge cases where we continue
  * allocating from very fragmented disks in our pool while other, less
  * fragmented disks, exist. On the other hand, if all disks in the
  * pool are uniformly approaching the threshold, the threshold can
  * be a speed bump in performance, where we keep switching the disks
  * that we allocate from (e.g. we allocate some segments from disk A,
  * pushing it past the threshold, while freeing segments from disk B,
  * bringing its fragmentation below the threshold).
  *
  * Empirically, we've seen that our vdev selection for allocations is
  * good enough that fragmentation increases uniformly across all vdevs
  * the majority of the time. Thus we set the threshold percentage high
  * enough to avoid hitting the speed bump on pools that are being pushed
  * to the edge.
  */
 static uint_t zfs_mg_fragmentation_threshold = 95;
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation
  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  * active metaslab that exceeds this threshold will no longer keep its active
  * status allowing better metaslabs to be selected.
  */
 static uint_t zfs_metaslab_fragmentation_threshold = 70;
 
 /*
  * When set will load all metaslabs when pool is first opened.
  */
 int metaslab_debug_load = B_FALSE;
 
 /*
  * When set will prevent metaslabs from being unloaded.
  */
 static int metaslab_debug_unload = B_FALSE;
 
 /*
  * Minimum size which forces the dynamic allocator to change
  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using a more
  * aggressive strategy (i.e. search by size rather than offset).
  */
 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 uint_t metaslab_df_free_pct = 4;
 
 /*
  * Maximum distance to search forward from the last offset. Without this
  * limit, fragmented pools can see >100,000 iterations and
  * metaslab_block_picker() becomes the performance limiting factor on
  * high-performance storage.
  *
  * With the default setting of 16MB, we typically see less than 500
  * iterations, even with very fragmented, ashift=9 pools. The maximum number
  * of iterations possible is:
  *     metaslab_df_max_search / (2 * (1<<ashift))
  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
  * 2048 (with ashift=12).
  */
 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
 
 /*
  * Forces the metaslab_block_picker function to search for at least this many
  * segments forwards until giving up on finding a segment that the allocation
  * will fit into.
  */
 static const uint32_t metaslab_min_search_count = 100;
 
 /*
  * If we are not searching forward (due to metaslab_df_max_search,
  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
  * controls what segment is used.  If it is set, we will use the largest free
  * segment.  If it is not set, we will use a segment of exactly the requested
  * size (or larger).
  */
 static int metaslab_df_use_largest_segment = B_FALSE;
 
 /*
  * These tunables control how long a metaslab will remain loaded after the
  * last allocation from it.  A metaslab can't be unloaded until at least
  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
  * unloaded sooner.  These settings are intended to be generous -- to keep
  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
  */
 static uint_t metaslab_unload_delay = 32;
 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
 
 /*
  * Max number of metaslabs per group to preload.
  */
 uint_t metaslab_preload_limit = 10;
 
 /*
  * Enable/disable preloading of metaslab.
  */
 static int metaslab_preload_enabled = B_TRUE;
 
 /*
  * Enable/disable fragmentation weighting on metaslabs.
  */
 static int metaslab_fragmentation_factor_enabled = B_TRUE;
 
 /*
  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  */
 static int metaslab_lba_weighting_enabled = B_TRUE;
 
 /*
  * Enable/disable metaslab group biasing.
  */
 static int metaslab_bias_enabled = B_TRUE;
 
 /*
  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  */
 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 
 /*
  * Enable/disable segment-based metaslab selection.
  */
 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
 
 /*
  * When using segment-based metaslab selection, we will continue
  * allocating from the active metaslab until we have exhausted
  * zfs_metaslab_switch_threshold of its buckets.
  */
 static int zfs_metaslab_switch_threshold = 2;
 
 /*
  * Internal switch to enable/disable the metaslab allocation tracing
  * facility.
  */
 static const boolean_t metaslab_trace_enabled = B_FALSE;
 
 /*
  * Maximum entries that the metaslab allocation tracing facility will keep
  * in a given list when running in non-debug mode. We limit the number
  * of entries in non-debug mode to prevent us from using up too much memory.
  * The limit should be sufficiently large that we don't expect any allocation
  * to ever exceed this value. In debug mode, the system will panic if this
  * limit is ever reached allowing for further investigation.
  */
 static const uint64_t metaslab_trace_max_entries = 5000;
 
 /*
  * Maximum number of metaslabs per group that can be disabled
  * simultaneously.
  */
 static const int max_disabled_ms = 3;
 
 /*
  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
  * To avoid 64-bit overflow, don't set above UINT32_MAX.
  */
 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
 
 /*
  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
  * a metaslab would take it over this percentage, the oldest selected metaslab
  * is automatically unloaded.
  */
 static uint_t zfs_metaslab_mem_limit = 25;
 
 /*
  * Force the per-metaslab range trees to use 64-bit integers to store
  * segments. Used for debugging purposes.
  */
 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 
 /*
  * By default we only store segments over a certain size in the size-sorted
  * metaslab trees (ms_allocatable_by_size and
  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
  * improves load and unload times at the cost of causing us to use slightly
  * larger segments than we would otherwise in some cases.
  */
 static const uint32_t metaslab_by_size_min_shift = 14;
 
 /*
  * If not set, we will first try normal allocation.  If that fails then
  * we will do a gang allocation.  If that fails then we will do a "try hard"
  * gang allocation.  If that fails then we will have a multi-layer gang
  * block.
  *
  * If set, we will first try normal allocation.  If that fails then
  * we will do a "try hard" allocation.  If that fails we will do a gang
  * allocation.  If that fails we will do a "try hard" gang allocation.  If
  * that fails then we will have a multi-layer gang block.
  */
 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
 
 /*
  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
  * metaslabs.  This improves performance, especially when there are many
  * metaslabs per vdev and the allocation can't actually be satisfied (so we
  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
  * worse weight but it can actually satisfy the allocation, we won't find it
  * until trying hard.  This may happen if the worse metaslab is not loaded
  * (and the true weight is better than we have calculated), or due to weight
  * bucketization.  E.g. we are looking for a 60K segment, and the best
  * metaslabs all have free segments in the 32-63K bucket, but the best
  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
  * bucket, and therefore a lower weight).
  */
 static uint_t zfs_metaslab_find_max_tries = 100;
 
 /*
  * Forward declarations of file-local helpers.  Several of them (e.g.
  * metaslab_passivate(), metaslab_weight_from_range_tree(), metaslab_evict()
  * and metaslab_rt_add()) are called by code that appears before their
  * definitions.
  */
 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
 static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
 /*
  * kmem cache for metaslab_alloc_trace_t objects.  It is created by
  * metaslab_stat_init() and destroyed (and reset to NULL) by
  * metaslab_stat_fini().
  */
 kmem_cache_t *metaslab_alloc_trace_cache;
 
 /*
  * Named kstat counters for the metaslab allocator.  The whole structure is
  * handed to the kstat framework as ks_data by metaslab_stat_init(), so the
  * number of exported entries is derived from its size.
  */
 typedef struct metaslab_stats {
 	kstat_named_t metaslabstat_trace_over_limit;
 	kstat_named_t metaslabstat_reload_tree;
 	kstat_named_t metaslabstat_too_many_tries;
 	kstat_named_t metaslabstat_try_hard;
 } metaslab_stats_t;
 
 /*
  * Counter names and data types.  The initializers are positional and must
  * stay in the same order as the members of metaslab_stats_t.
  */
 static metaslab_stats_t metaslab_stats = {
 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
 	{ "reload_tree",		KSTAT_DATA_UINT64 },
 	{ "too_many_tries",		KSTAT_DATA_UINT64 },
 	{ "try_hard",			KSTAT_DATA_UINT64 },
 };
 
 /*
  * Atomically increment one of the metaslab_stats counters; "stat" is the
  * member name, e.g. METASLABSTAT_BUMP(metaslabstat_reload_tree).
  *
  * NOTE: the expansion already ends with a semicolon, so an invocation
  * written with its own trailing ';' yields an extra empty statement.  Do
  * not use this macro as the unbraced body of an if that has an else.
  */
 #define	METASLABSTAT_BUMP(stat) \
 	atomic_inc_64(&metaslab_stats.stat.value.ui64);
 
 
 /* Handle of the installed kstat, or NULL if none is currently installed. */
 static kstat_t *metaslab_ksp;
 
 void
 metaslab_stat_init(void)
 {
 	ASSERT(metaslab_alloc_trace_cache == NULL);
 	metaslab_alloc_trace_cache = kmem_cache_create(
 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 	    0, NULL, NULL, NULL, NULL, NULL, 0);
 	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
 	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (metaslab_ksp != NULL) {
 		metaslab_ksp->ks_data = &metaslab_stats;
 		kstat_install(metaslab_ksp);
 	}
 }
 
 void
 metaslab_stat_fini(void)
 {
 	if (metaslab_ksp != NULL) {
 		kstat_delete(metaslab_ksp);
 		metaslab_ksp = NULL;
 	}
 
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
 
 /*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 /*
  * Allocate and initialize a metaslab class for "spa" using the allocator
  * operations "ops".  The class is sized to hold one mc_allocator[] entry
  * per spa allocator (spa_alloc_count); each entry starts with a NULL rotor
  * and a freshly created tracked refcount.  Returns the new class.
  */
 metaslab_class_t *
 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops)
 {
 	const size_t mc_size = offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]);
 	metaslab_class_t *mc = kmem_zalloc(mc_size, KM_SLEEP);
 
 	mc->mc_spa = spa;
 	mc->mc_ops = ops;
 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 	multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		mc->mc_allocator[a].mca_rotor = NULL;
 		zfs_refcount_create_tracked(
 		    &mc->mc_allocator[a].mca_alloc_slots);
 	}
 
 	return (mc);
 }
 
 /*
  * Tear down a metaslab class created by metaslab_class_create().  The class
  * is expected to be empty: all space accounting must be zero and every
  * allocator's rotor must already be NULL.
  */
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 
 	ASSERT(mc->mc_alloc == 0);
 	ASSERT(mc->mc_deferred == 0);
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		ASSERT(mc->mc_allocator[a].mca_rotor == NULL);
 		zfs_refcount_destroy(&mc->mc_allocator[a].mca_alloc_slots);
 	}
 
 	mutex_destroy(&mc->mc_lock);
 	multilist_destroy(&mc->mc_metaslab_txg_list);
 
 	/* Free with the same variable-length size used at allocation. */
 	const size_t mc_size = offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]);
 	kmem_free(mc, mc_size);
 }
 
 /*
  * Sanity-check the ring of metaslab groups reachable from allocator 0's
  * rotor: every group must belong to this class and sit on a top-level,
  * non-hole vdev that has a metaslab group.  All checks are assertions;
  * the function always returns 0.
  */
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
 	metaslab_group_t *mg;
 	vdev_t *vd;
 
 	/* The caller must hold at least one of the spa_config locks. */
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
 	mg = mc->mc_allocator[0].mca_rotor;
 	if (mg == NULL)
 		return (0);
 
 	/* Walk the circular list once, stopping when we reach the rotor. */
 	do {
 		vd = mg->mg_vd;
 		ASSERT(vd->vdev_mg != NULL);
 		ASSERT3P(vd->vdev_top, ==, vd);
 		ASSERT3P(mg->mg_class, ==, mc);
 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 		mg = mg->mg_next;
 	} while (mg != mc->mc_allocator[0].mca_rotor);
 
 	return (0);
 }
 
 /*
  * Apply signed deltas to the class-wide space counters.  Each counter is
  * adjusted with its own atomic add; the four updates are not performed as
  * a single atomic unit.
  */
 static void
 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
 	/* Allocated and deferred-free space. */
 	atomic_add_64(&mc->mc_alloc, alloc_delta);
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 
 	/* Raw and deflated total space. */
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
 /* Return the class's current mc_alloc counter (read without locking). */
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
 	uint64_t allocated = mc->mc_alloc;
 
 	return (allocated);
 }
 
 /* Return the class's current mc_deferred counter (read without locking). */
 uint64_t
 metaslab_class_get_deferred(metaslab_class_t *mc)
 {
 	uint64_t deferred = mc->mc_deferred;
 
 	return (deferred);
 }
 
 /* Return the class's current mc_space counter (read without locking). */
 uint64_t
 metaslab_class_get_space(metaslab_class_t *mc)
 {
 	uint64_t space = mc->mc_space;
 
 	return (space);
 }
 
 /*
  * Return the class's deflated space (mc_dspace) when spa_deflate() is true
  * for the pool, otherwise its raw space (mc_space).
  */
 uint64_t
 metaslab_class_get_dspace(metaslab_class_t *mc)
 {
 	if (spa_deflate(mc->mc_spa))
 		return (mc->mc_dspace);
 
 	return (mc->mc_space);
 }
 
 /*
  * Debug check (active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
  * zfs_flags): recompute the class histogram by summing the histograms of
  * every metaslab group in the class and VERIFY that each bucket matches
  * mc_histogram.  Runs under mc_lock.
  */
 void
 metaslab_class_histogram_verify(metaslab_class_t *mc)
 {
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	const size_t hist_bytes =
 	    sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE;
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t *expected = kmem_zalloc(hist_bytes, KM_SLEEP);
 
 	mutex_enter(&mc->mc_lock);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 
 		/*
 		 * Ignore holes, top-levels that are not initialized yet,
 		 * and vdevs that belong to a different metaslab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		for (int b = 0; b < RANGE_TREE_HISTOGRAM_SIZE; b++)
 			expected[b] += mg->mg_histogram[b];
 	}
 
 	for (int b = 0; b < RANGE_TREE_HISTOGRAM_SIZE; b++) {
 		VERIFY3U(expected[b], ==, mc->mc_histogram[b]);
 	}
 	mutex_exit(&mc->mc_lock);
 
 	kmem_free(expected, hist_bytes);
 }
 
 /*
  * Calculate the metaslab class's fragmentation metric. The metric
  * is weighted based on the space contribution of each metaslab group.
  * The return value will be a number between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
  * zfs_frag_table for more information about the metric.
  *
  * ZFS_FRAG_INVALID is returned when any participating metaslab group has
  * no valid fragmentation value, and also when the class currently reports
  * no space at all (there is nothing to weight the metric by, and dividing
  * by the class space would otherwise be a division by zero).
  *
  * Takes and releases SCL_VDEV as reader for the duration of the walk.
  */
 uint64_t
 metaslab_class_fragmentation(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t fragmentation = 0;
 	uint64_t space;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Skip any holes, uninitialized top-levels,
 		 * or vdevs that are not in this metaslab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * If a metaslab group does not contain a fragmentation
 		 * metric then just bail out.
 		 */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 			return (ZFS_FRAG_INVALID);
 		}
 
 		/*
 		 * Determine how much this metaslab_group is contributing
 		 * to the overall pool fragmentation metric.
 		 */
 		fragmentation += mg->mg_fragmentation *
 		    metaslab_group_get_space(mg);
 	}
 
 	/*
 	 * Read the class space once and refuse to divide by zero: a class
 	 * without any space has no meaningful fragmentation metric.
 	 */
 	space = metaslab_class_get_space(mc);
 	if (space == 0) {
 		spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 		return (ZFS_FRAG_INVALID);
 	}
 	fragmentation /= space;
 
 	ASSERT3U(fragmentation, <=, 100);
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (fragmentation);
 }
 
 /*
  * Return the total expandable space of this metaslab class: for every
  * concrete, initialized top-level vdev in the class, the gap between its
  * maximum and current allocatable size (vdev_max_asize - vdev_asize),
  * rounded down to a whole number of metaslabs.  SCL_VDEV is held as reader
  * while the vdev tree is walked.
  */
 uint64_t
 metaslab_class_expandable_space(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t total = 0;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/* Only concrete, initialized vdevs of this class count. */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * Expansion happens one metaslab at a time, so only whole
 		 * metaslabs' worth of additional space is reported.
 		 */
 		total += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize,
 		    1ULL << tvd->vdev_ms_shift, uint64_t);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (total);
 }
 
 /*
  * Walk every sublist of the class's mc_metaslab_txg_list and evict
  * metaslabs that have not been selected recently.
  *
  * A metaslab is evicted when all of the following hold:
  *  - txg is more than metaslab_unload_delay txgs past ms_selected_txg;
  *  - more than metaslab_unload_delay_ms milliseconds have passed since
  *    ms_selected_time;
  *  - it is not assigned to an allocator (ms_allocator == -1), or metaslab
  *    preloading is disabled.
  *
  * Processing of a sublist stops at the first metaslab that does not meet
  * these conditions.  The sublist lock is only held while fetching the head
  * or the next element; each metaslab is examined under its own ms_lock.
  */
 void
 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	/* Sample the time once so every metaslab is judged consistently. */
 	hrtime_t now = gethrtime();
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		/* Drop the sublist lock before taking any ms_lock. */
 		multilist_sublist_unlock(mls);
 		while (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 
 			/*
 			 * If the metaslab has been removed from the list
 			 * (which could happen if we were at the memory limit
 			 * and it was evicted during this loop), then we can't
 			 * proceed and we should restart the sublist.
 			 */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				mutex_exit(&msp->ms_lock);
 				/*
 				 * The for loop's i++ undoes this decrement,
 				 * so the same sublist is processed again.
 				 */
 				i--;
 				break;
 			}
 			/*
 			 * Look up the successor now, while msp is known to
 			 * still be linked on the sublist.
 			 */
 			mls = multilist_sublist_lock_idx(ml, i);
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			if (txg >
 			    msp->ms_selected_txg + metaslab_unload_delay &&
 			    now > msp->ms_selected_time +
 			    MSEC2NSEC(metaslab_unload_delay_ms) &&
 			    (msp->ms_allocator == -1 ||
 			    !metaslab_preload_enabled)) {
 				metaslab_evict(msp, txg);
 			} else {
 				/*
 				 * Once we've hit a metaslab selected too
 				 * recently to evict, we're done evicting for
 				 * now.
 				 */
 				mutex_exit(&msp->ms_lock);
 				break;
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 		}
 	}
 }
 
 /*
  * AVL comparator for a metaslab group's mg_metaslab_tree.
  *
  * Ordering, most significant key first:
  *  1. activation state: inactive (ms_allocator == -1), then active
  *     primaries, then active secondaries;
  *  2. weight, heaviest first;
  *  3. starting offset, lowest first.
  */
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
 	const metaslab_t *lhs = x1;
 	const metaslab_t *rhs = x2;
 
 	/* 0 = inactive, 1 = active primary, 2 = active secondary. */
 	int lhs_rank = (lhs->ms_allocator == -1) ? 0 :
 	    (lhs->ms_primary ? 1 : 2);
 	int rhs_rank = (rhs->ms_allocator == -1) ? 0 :
 	    (rhs->ms_primary ? 1 : 2);
 
 	/*
 	 * Inactive metaslabs sort first, then primaries, then secondaries.
 	 * An allocator tries its own primary and then secondary active
 	 * metaslab before anything else.  Failing that it looks for an
 	 * inactive metaslab to activate, and only as a last resort steals
 	 * a primary or secondary metaslab from another allocator.
 	 */
 	if (lhs_rank != rhs_rank)
 		return (lhs_rank < rhs_rank ? -1 : 1);
 
 	/* Operands are swapped so that larger weights sort earlier. */
 	int by_weight = TREE_CMP(rhs->ms_weight, lhs->ms_weight);
 	if (likely(by_weight))
 		return (by_weight);
 
 	/* Two distinct metaslabs must never share a starting offset. */
 	int by_start = TREE_CMP(lhs->ms_start, rhs->ms_start);
 	IMPLY(by_start == 0, lhs == rhs);
 
 	return (by_start);
 }
 
 /*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
  */
 /*
  * Update the allocatable flag and the metaslab group's capacity.
  * The allocatable flag is set to true only if the group is activated,
  * its free capacity is above zfs_mg_noalloc_threshold, and its
  * fragmentation value is either invalid (ZFS_FRAG_INVALID) or no greater
  * than zfs_mg_fragmentation_threshold. If a metaslab group
  * transitions from allocatable to non-allocatable or vice versa then the
  * metaslab group's class is updated to reflect the transition.
  *
  * The group must belong to a top-level vdev and the caller must hold
  * SCL_ALLOC as reader (both are asserted).  mg_lock is taken first and
  * mc_lock is nested inside it.
  */
 static void
 metaslab_group_alloc_update(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	boolean_t was_allocatable;
 	boolean_t was_initialized;
 
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 	    SCL_ALLOC);
 
 	mutex_enter(&mg->mg_lock);
 	/* Remember the previous state so transitions can be detected. */
 	was_allocatable = mg->mg_allocatable;
 	was_initialized = mg->mg_initialized;
 
 	/*
 	 * Free capacity as a percentage of the vdev's space.  The "+ 1" in
 	 * the divisor avoids a division by zero when vs_space is 0.
 	 */
 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 	    (vs->vs_space + 1);
 
 	mutex_enter(&mc->mc_lock);
 
 	/*
 	 * If the metaslab group was just added then it won't
 	 * have any space until we finish syncing out this txg.
 	 * At that point we will consider it initialized and available
 	 * for allocations.  We also don't consider non-activated
 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
 	 * to be initialized, because they can't be used for allocation.
 	 */
 	mg->mg_initialized = metaslab_group_initialized(mg);
 	if (!was_initialized && mg->mg_initialized) {
 		mc->mc_groups++;
 	} else if (was_initialized && !mg->mg_initialized) {
 		ASSERT3U(mc->mc_groups, >, 0);
 		mc->mc_groups--;
 	}
 	if (mg->mg_initialized)
 		mg->mg_no_free_space = B_FALSE;
 
 	/*
 	 * A metaslab group is considered allocatable if it has plenty
 	 * of free space and is not heavily fragmented. We only take
 	 * fragmentation into account if the metaslab group has a valid
 	 * fragmentation metric (i.e. a value between 0 and 100).
 	 */
 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 
 	/*
 	 * The mc_alloc_groups maintains a count of the number of
 	 * groups in this metaslab class that are still above the
 	 * zfs_mg_noalloc_threshold. This is used by the allocating
 	 * threads to determine if they should avoid allocations to
 	 * a given group. The allocator will avoid allocations to a group
 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
 	 * and there are still other groups that are above the threshold.
 	 * When a group transitions from allocatable to non-allocatable or
 	 * vice versa we update the metaslab class to reflect that change.
 	 * When the mc_alloc_groups value drops to 0 that means that all
 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
 	 * eligible for allocations. This effectively means that all devices
 	 * are balanced again.
 	 */
 	if (was_allocatable && !mg->mg_allocatable)
 		mc->mc_alloc_groups--;
 	else if (!was_allocatable && mg->mg_allocatable)
 		mc->mc_alloc_groups++;
 	mutex_exit(&mc->mc_lock);
 
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Comparator that orders metaslabs by ms_unflushed_txg, breaking ties
  * first by the id of the owning vdev and then by the metaslab id.
  */
 int
 metaslab_sort_by_flushed(const void *va, const void *vb)
 {
 	const metaslab_t *lhs = va;
 	const metaslab_t *rhs = vb;
 
 	int order = TREE_CMP(lhs->ms_unflushed_txg, rhs->ms_unflushed_txg);
 	if (likely(order))
 		return (order);
 
 	/* Same unflushed txg: fall back to the vdev id. */
 	uint64_t lhs_vdev = lhs->ms_group->mg_vd->vdev_id;
 	uint64_t rhs_vdev = rhs->ms_group->mg_vd->vdev_id;
 	order = TREE_CMP(lhs_vdev, rhs_vdev);
 	if (order != 0)
 		return (order);
 
 	/* Same vdev as well: the metaslab id decides. */
 	return (TREE_CMP(lhs->ms_id, rhs->ms_id));
 }
 
 /*
  * Allocate a metaslab group for vdev "vd" in class "mc" with room for
  * "allocators" per-allocator entries.  The new group is not activated
  * (activation count 0), not initialized, and flagged as having no free
  * space.  Returns the new group.
  */
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 {
 	const size_t mg_size = offsetof(metaslab_group_t,
 	    mg_allocator[allocators]);
 	metaslab_group_t *mg = kmem_zalloc(mg_size, KM_SLEEP);
 
 	/* Synchronization primitives and the sorted metaslab tree. */
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 
 	/* Identity and initial state. */
 	mg->mg_vd = vd;
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
 	mg->mg_initialized = B_FALSE;
 	mg->mg_no_free_space = B_TRUE;
 	mg->mg_allocators = allocators;
 
 	for (int a = 0; a < allocators; a++) {
 		zfs_refcount_create_tracked(
 		    &mg->mg_allocator[a].mga_alloc_queue_depth);
 	}
 
 	return (mg);
 }
 
 /*
  * Free a metaslab group created by metaslab_group_create().  The group
  * must already be unlinked from the rotor ring (mg_prev/mg_next NULL) and
  * must not be active.
  */
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 
 	/*
 	 * The activation count may legitimately be negative here: the
 	 * group may never have been activated, or we are finished with it
 	 * and possibly removing the vdev.
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 	mutex_destroy(&mg->mg_ms_disabled_lock);
 	cv_destroy(&mg->mg_ms_disabled_cv);
 
 	for (int a = 0; a < mg->mg_allocators; a++)
 		zfs_refcount_destroy(
 		    &mg->mg_allocator[a].mga_alloc_queue_depth);
 
 	/* Free with the same variable-length size used at allocation. */
 	const size_t mg_size = offsetof(metaslab_group_t,
 	    mg_allocator[mg->mg_allocators]);
 	kmem_free(mg, mg_size);
 }
 
 /*
  * Activate a metaslab group and insert it into its class's allocation
  * rotor.  The caller must hold SCL_ALLOC as writer (asserted) and the
  * group must not currently be linked into the rotor ring.
  *
  * The activation count is incremented first; if it is still not positive
  * afterwards the function returns without activating anything.
  */
 void
 metaslab_group_activate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	metaslab_group_t *mgprev, *mgnext;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
 
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	ASSERT(mg->mg_activation_count <= 0);
 
 	if (++mg->mg_activation_count <= 0)
 		return;
 
 	/*
 	 * Scale the aliquot by the vdev's disk count minus its parity count
 	 * (at least 1), as reported by vdev_get_ndisks()/vdev_get_nparity().
 	 */
 	mg->mg_aliquot = metaslab_aliquot * MAX(1,
 	    vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd));
 	metaslab_group_alloc_update(mg);
 
 	/*
 	 * Link mg into the circular, doubly-linked list of groups: either it
 	 * becomes a ring of one, or it is spliced in right after the group
 	 * that allocator 0's rotor currently points at.
 	 */
 	if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
 		mg->mg_prev = mg;
 		mg->mg_next = mg;
 	} else {
 		mgnext = mgprev->mg_next;
 		mg->mg_prev = mgprev;
 		mg->mg_next = mgnext;
 		mgprev->mg_next = mg;
 		mgnext->mg_prev = mg;
 	}
 	/*
 	 * Re-seed every allocator's rotor: allocator 0 starts at the new
 	 * group and each following allocator at the next group in the ring.
 	 * Note that "mg" is reused as the ring cursor from here on.
 	 */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mc->mc_allocator[i].mca_rotor = mg;
 		mg = mg->mg_next;
 	}
 }
 
 /*
  * Passivate a metaslab group and remove it from the allocation rotor.
  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
  * a metaslab group. This function will momentarily drop spa_config_locks
  * that are lower than the SCL_ALLOC lock (see comment below).
  *
  * The activation count is decremented first.  Only when it reaches exactly
  * zero is the group actually passivated; otherwise the group is expected to
  * be unlinked already and the function returns after asserting that.
  */
 void
 metaslab_group_passivate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	metaslab_group_t *mgprev, *mgnext;
 	/* All config locks currently held as writer, as a bitmask. */
 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 	    (SCL_ALLOC | SCL_ZIO));
 
 	if (--mg->mg_activation_count != 0) {
 		for (int i = 0; i < spa->spa_alloc_count; i++)
 			ASSERT(mc->mc_allocator[i].mca_rotor != mg);
 		ASSERT(mg->mg_prev == NULL);
 		ASSERT(mg->mg_next == NULL);
 		ASSERT(mg->mg_activation_count < 0);
 		return;
 	}
 
 	/*
 	 * The spa_config_lock is an array of rwlocks, ordered as
 	 * follows (from highest to lowest):
 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
 	 * (For more information about the spa_config_lock see spa_misc.c)
 	 * The higher the lock, the broader its coverage. When we passivate
 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 	 * config locks. However, the metaslab group's taskq might be trying
 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
 	 * lower locks to allow the I/O to complete. At a minimum,
 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
 	 * allocations from taking place and any changes to the vdev tree.
 	 */
 	/*
 	 * ~(SCL_ZIO - 1) clears every bit below SCL_ZIO, so the mask selects
 	 * SCL_ZIO plus the held locks that rank lower than it.
 	 */
 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 	taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 	metaslab_group_alloc_update(mg);
 	/*
 	 * Passivate each allocator's primary and secondary metaslab, if any,
 	 * giving it the weight computed from its range tree.
 	 */
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 		metaslab_t *msp = mga->mga_primary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
 	/*
 	 * Unlink mg from the ring.  If it was the only member, the rotors
 	 * that pointed at it are reset to NULL below.
 	 */
 	if (mg == mgnext) {
 		mgnext = NULL;
 	} else {
 		mgprev->mg_next = mgnext;
 		mgnext->mg_prev = mgprev;
 	}
 	/* Advance any rotor that still points at the departing group. */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		if (mc->mc_allocator[i].mca_rotor == mg)
 			mc->mc_allocator[i].mca_rotor = mgnext;
 	}
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
 }
 
 /*
  * A metaslab group counts as initialized when its vdev reports a non-zero
  * amount of space and the group has a positive activation count.
  */
 boolean_t
 metaslab_group_initialized(metaslab_group_t *mg)
 {
 	vdev_stat_t *vs = &mg->mg_vd->vdev_stat;
 
 	if (vs->vs_space == 0)
 		return (B_FALSE);
 
 	return (mg->mg_activation_count > 0);
 }
 
 /*
  * Return the space covered by this group: the number of metaslabs in its
  * tree multiplied by the vdev's metaslab size (1 << vdev_ms_shift).
  */
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
 	uint64_t nmetaslabs;
 
 	/*
 	 * Count the tree's nodes rather than using vdev_ms_count: the tree
 	 * may hold one metaslab fewer because of the embedded log metaslab.
 	 */
 	mutex_enter(&mg->mg_lock);
 	nmetaslabs = avl_numnodes(&mg->mg_metaslab_tree);
 	mutex_exit(&mg->mg_lock);
 
 	return ((1ULL << mg->mg_vd->vdev_ms_shift) * nmetaslabs);
 }
 
 /*
  * Debug check (active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
  * zfs_flags): rebuild the group histogram from the space map histograms of
  * every metaslab in the group's tree, with each bucket shifted up by the
  * vdev's ashift, and VERIFY that it matches mg_histogram.  Runs under
  * mg_lock.
  */
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	const size_t hist_bytes =
 	    sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE;
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 	uint64_t *expected = kmem_zalloc(hist_bytes, KM_SLEEP);
 
 	/* The shifted space map buckets must fit in the group histogram. */
 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
 
 	mutex_enter(&mg->mg_lock);
 	metaslab_t *msp = avl_first(tree);
 	while (msp != NULL) {
 		VERIFY3P(msp->ms_group, ==, mg);
 
 		/* Metaslabs without a space map contribute nothing. */
 		if (msp->ms_sm != NULL) {
 			for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 				expected[b + ashift] +=
 				    msp->ms_sm->sm_phys->smp_histogram[b];
 			}
 		}
 		msp = AVL_NEXT(tree, msp);
 	}
 
 	for (int b = 0; b < RANGE_TREE_HISTOGRAM_SIZE; b++)
 		VERIFY3U(expected[b], ==, mg->mg_histogram[b]);
 	mutex_exit(&mg->mg_lock);
 
 	kmem_free(expected, hist_bytes);
 }
 
 /*
  * Add a metaslab's space map histogram to the histograms of its group and
  * of the group's class, shifting each bucket up by the vdev's ashift.
  * Does nothing for a metaslab without a space map.  The caller must hold
  * ms_lock; mg_lock and then mc_lock are taken here.
  */
 static void
 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (msp->ms_sm == NULL)
 		return;
 
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[b];
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[b + ashift] += count;
 		mc->mc_histogram[b + ashift] += count;
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Subtract a metaslab's space map histogram from the histograms of its
  * group and of the group's class (the inverse of
  * metaslab_group_histogram_add()).  Does nothing for a metaslab without a
  * space map.  The caller must hold ms_lock; mg_lock and then mc_lock are
  * taken here.
  */
 void
 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (msp->ms_sm == NULL)
 		return;
 
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[b];
 
 		/* Neither aggregate histogram may underflow. */
 		ASSERT3U(mg->mg_histogram[b + ashift], >=, count);
 		ASSERT3U(mc->mc_histogram[b + ashift], >=, count);
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		mg->mg_histogram[b + ashift] -= count;
 		mc->mc_histogram[b + ashift] -= count;
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Attach a metaslab that has no group yet to "mg": insert it into the
  * group's tree with a weight of zero, then fold its space map histogram
  * into the group and class histograms.
  */
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(msp->ms_group == NULL);
 
 	/* The weight is part of the tree's sort key; set it before adding. */
 	mutex_enter(&mg->mg_lock);
 	msp->ms_weight = 0;
 	msp->ms_group = mg;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 
 	/* The histogram update requires ms_lock to be held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_add(mg, msp);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Detach a metaslab from its group: subtract its histogram from the group
  * and class histograms, remove it from the group's tree, unlink it from
  * the class's txg multilist if it is on it, and clear ms_group.
  */
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	/* The histogram update requires ms_lock to be held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_remove(mg, msp);
 	mutex_exit(&msp->ms_lock);
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 
 	/* Take the sublist lock that covers msp before testing its link. */
 	multilist_t *txg_list =
 	    &msp->ms_group->mg_class->mc_metaslab_txg_list;
 	multilist_sublist_t *sublist =
 	    multilist_sublist_lock_obj(txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(sublist, msp);
 	multilist_sublist_unlock(sublist);
 
 	msp->ms_group = NULL;
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Give "msp" a new weight and move it to the matching position in the
  * group's tree.  The caller must hold both ms_lock and mg_lock.
  */
 static void
 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(MUTEX_HELD(&mg->mg_lock));
 	ASSERT(msp->ms_group == mg);
 
 	/*
 	 * The weight is part of the tree's sort key, so the node has to be
 	 * taken out of the tree while the weight changes.
 	 */
 	avl_remove(tree, msp);
 	msp->ms_weight = weight;
 	avl_add(tree, msp);
 }
 
 /*
  * Locking wrapper around metaslab_group_sort_impl(): re-sort "msp" within
  * its group under mg_lock.  The caller must already hold ms_lock.
  */
 static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * In principle any weight is possible, but in practice values in
 	 * the range [1, 511] are never used.
 	 */
 	ASSERT(weight == 0 || weight >= SPA_MINBLOCKSIZE);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Calculate the fragmentation of a metaslab group as the simple average of
  * the fragmentation of its metaslabs (a plain average suffices because all
  * metaslabs within the group have the same size).
  *
  * Only metaslabs of the vdev that belong to this group and have a valid
  * fragmentation value are averaged.  The result is between 0 and 100
  * (inclusive).  ZFS_FRAG_INVALID is returned unless more than half of the
  * vdev's metaslabs (vdev_ms_count / 2, integer division) contributed.
  */
 uint64_t
 metaslab_group_fragmentation(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t frag_sum = 0;
 	uint64_t counted = 0;
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		if (msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 		    msp->ms_group == mg) {
 			frag_sum += msp->ms_fragmentation;
 			counted++;
 		}
 	}
 
 	/* Too few samples to be representative. */
 	if (counted <= mg->mg_vd->vdev_ms_count / 2)
 		return (ZFS_FRAG_INVALID);
 
 	uint64_t average = frag_sum / counted;
 	ASSERT3U(average, <=, 100);
 	return (average);
 }
 
 /*
  * Determine if a given metaslab group should skip allocations. A metaslab
  * group should avoid allocations if its free capacity is less than the
  * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
  * zfs_mg_fragmentation_threshold and there is at least one metaslab group
  * that can still handle allocations. If the allocation throttle is enabled
  * then we skip allocations to devices that have reached their maximum
  * allocation queue depth unless the selected metaslab group is the only
  * eligible group remaining.
  *
  * mg is the candidate group and rotor is the group at which the scan of
  * the remaining groups stops.  allocator selects the per-allocator queue
  * state.  d scales the queue depth limit by (4 + d) / 4; it is presumably
  * the index of the ditto copy being allocated -- verify against callers.
  *
  * Returns B_TRUE if the group may be used for this allocation and B_FALSE
  * if it should be skipped.
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
     int flags, uint64_t psize, int allocator, int d)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_class_t *mc = mg->mg_class;
 
 	/*
 	 * We can only consider skipping this metaslab group if it's
 	 * in the normal, special or dedup metaslab class and there are
 	 * other metaslab groups to select from. Otherwise, we always
 	 * consider it eligible for allocations.
 	 */
 	if ((mc != spa_normal_class(spa) &&
 	    mc != spa_special_class(spa) &&
 	    mc != spa_dedup_class(spa)) ||
 	    mc->mc_groups <= 1)
 		return (B_TRUE);
 
 	/*
 	 * If the metaslab group's mg_allocatable flag is set (see comments
 	 * in metaslab_group_alloc_update() for more information) and
 	 * the allocation throttle is disabled then allow allocations to this
 	 * device. However, if the allocation throttle is enabled then
 	 * check if we have reached our allocation limit (mga_alloc_queue_depth)
 	 * to determine if we should allow allocations to this metaslab group.
 	 * If all metaslab groups are no longer considered allocatable
 	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
 	 * gang block size then we allow allocations on this metaslab group
 	 * regardless of the mg_allocatable or throttle settings.
 	 */
 	if (mg->mg_allocatable) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 		int64_t qdepth;
 		uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
 
 		if (!mc->mc_alloc_throttle_enabled)
 			return (B_TRUE);
 
 		/*
 		 * If this metaslab group does not have any free space, then
 		 * there is no point in looking further.
 		 */
 		if (mg->mg_no_free_space)
 			return (B_FALSE);
 
 		/*
 		 * Some allocations (e.g., those coming from device removal
 		 * where the allocations are not even counted in the
 		 * metaslab allocation queues) are allowed to bypass
 		 * the throttle.
 		 */
 		if (flags & METASLAB_DONT_THROTTLE)
 			return (B_TRUE);
 
 		/*
 		 * Relax allocation throttling for ditto blocks.  Due to
 		 * random imbalances in allocation it tends to push copies
 		 * to one vdev, that looks a bit better at the moment.
 		 */
 		qmax = qmax * (4 + d) / 4;
 
 		qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
 
 		/*
 		 * If this metaslab group is below its qmax or it's
 		 * the only allocatable metaslab group, then attempt
 		 * to allocate from it.
 		 */
 		if (qdepth < qmax || mc->mc_alloc_groups == 1)
 			return (B_TRUE);
 		ASSERT3U(mc->mc_alloc_groups, >, 1);
 
 		/*
 		 * Since this metaslab group is at or over its qmax, we
 		 * need to determine if there are metaslab groups after this
 		 * one that might be able to handle this allocation. This is
 		 * racy since we can't hold the locks for all metaslab
 		 * groups at the same time when we make this check.
 		 */
 		for (metaslab_group_t *mgp = mg->mg_next;
 		    mgp != rotor; mgp = mgp->mg_next) {
 			metaslab_group_allocator_t *mgap =
 			    &mgp->mg_allocator[allocator];
 			/* Apply the same ditto scaling as for mg above. */
 			qmax = mgap->mga_cur_max_alloc_queue_depth;
 			qmax = qmax * (4 + d) / 4;
 			qdepth =
 			    zfs_refcount_count(&mgap->mga_alloc_queue_depth);
 
 			/*
 			 * If there is another metaslab group that
 			 * might be able to handle the allocation, then
 			 * we return false so that we skip this group.
 			 */
 			if (qdepth < qmax && !mgp->mg_no_free_space)
 				return (B_FALSE);
 		}
 
 		/*
 		 * We didn't find another group to handle the allocation
 		 * so we can't skip this metaslab group even though
 		 * we are at or over our qmax.
 		 */
 		return (B_TRUE);
 
 	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * ==========================================================================
  * Range tree callbacks
  * ==========================================================================
  */
 
 /*
  * Comparison function for the private size-ordered tree using 32-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  * Segments of equal size are ordered by their starting offset.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize32_compare(const void *x1, const void *x2)
 {
 	const range_seg32_t *r1 = x1;
 	const range_seg32_t *r2 = x2;
 
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
 
 	/*
 	 * Branch-free tie-break: when cmp is non-zero, !cmp is 0 and the
 	 * result is cmp; when cmp is zero, the result is the comparison of
 	 * the starting offsets.
 	 */
 	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
  * Comparison function for the private size-ordered tree using 64-bit
  * ranges. Tree is sorted by size, larger sizes at the end of the tree.
  * Segments of equal size are ordered by their starting offset.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
 	const range_seg64_t *r1 = x1;
 	const range_seg64_t *r2 = x2;
 
 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
 
 	/*
 	 * Branch-free tie-break: when cmp is non-zero, !cmp is 0 and the
 	 * result is cmp; when cmp is zero, the result is the comparison of
 	 * the starting offsets.
 	 */
 	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
  * Argument passed to the metaslab range tree callbacks (metaslab_rt_*).
  */
 typedef struct metaslab_rt_arg {
 	/* Size-sorted btree that mirrors the range tree's segments. */
 	zfs_btree_t *mra_bt;
 	/*
 	 * Segments smaller than (1 << mra_floor_shift) are not stored in
 	 * mra_bt (see metaslab_rt_add()).
 	 */
 	uint32_t mra_floor_shift;
 } metaslab_rt_arg_t;
 
 /*
  * Context handed through range_tree_walk() to metaslab_size_sorted_add().
  */
 struct mssa_arg {
 	/* Range tree being walked. */
 	range_tree_t *rt;
 	/* Callback argument of that range tree. */
 	metaslab_rt_arg_t *mra;
 };
 
 /*
  * range_tree_walk() callback: build a temporary segment describing
  * [start, start + size) and hand it to metaslab_rt_add() so it can be
  * inserted into the size-sorted tree.  "arg" is a struct mssa_arg.
  */
 static void
 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
 {
 	struct mssa_arg *ctx = arg;
 	range_seg_max_t seg = {0};
 
 	rs_set_start(&seg, ctx->rt, start);
 	rs_set_end(&seg, ctx->rt, start + size);
 
 	metaslab_rt_add(ctx->rt, &seg, ctx->mra);
 }
 
 /*
  * Populate the (empty) size-sorted tree with every segment of the range
  * tree.  The size floor is dropped to zero first so that no segment is
  * filtered out.  Bumps the reload_tree statistic.
  */
 static void
 metaslab_size_tree_full_load(range_tree_t *rt)
 {
 	metaslab_rt_arg_t *mrap = rt->rt_arg;
 
 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 
 	/* With a floor shift of 0 the minimum stored size is 1. */
 	mrap->mra_floor_shift = 0;
 
 	struct mssa_arg walk_ctx = { .rt = rt, .mra = mrap };
 	range_tree_walk(rt, metaslab_size_sorted_add, &walk_ctx);
 }
 
 
 /*
  * Instantiate the per-segment-type find-in-buffer helpers. Each one is
  * paired with the comparison function of the same width, and
  * metaslab_rt_create() passes the pair to zfs_btree_create().
  */
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
     range_seg32_t, metaslab_rangesize32_compare)
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
     range_seg64_t, metaslab_rangesize64_compare)
 
 /*
  * Range tree "create" callback: set up the size-sorted B-tree referenced by
  * the metaslab_rt_arg_t in arg. The element size, comparison function and
  * find-in-buffer helper are chosen to match the segment type of rt, and the
  * size floor is reset to metaslab_by_size_min_shift.
  */
 static void
 metaslab_rt_create(range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	size_t elem_size;
 	int (*cmp_fn) (const void *, const void *);
 	bt_find_in_buf_f find_fn;
 
 	if (rt->rt_type == RANGE_SEG32) {
 		elem_size = sizeof (range_seg32_t);
 		cmp_fn = metaslab_rangesize32_compare;
 		find_fn = metaslab_rt_find_rangesize32_in_buf;
 	} else if (rt->rt_type == RANGE_SEG64) {
 		elem_size = sizeof (range_seg64_t);
 		cmp_fn = metaslab_rangesize64_compare;
 		find_fn = metaslab_rt_find_rangesize64_in_buf;
 	} else {
 		panic("Invalid range seg type %d", rt->rt_type);
 	}
 
 	zfs_btree_create(mrap->mra_bt, cmp_fn, find_fn, elem_size);
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 }
 
 /*
  * Range tree "destroy" callback: tear down the size-sorted B-tree and free
  * the metaslab_rt_arg_t itself. rt is unused.
  */
 static void
 metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 
 	(void) rt;
 	zfs_btree_destroy(mrap->mra_bt);
 	kmem_free(mrap, sizeof (metaslab_rt_arg_t));
 }
 
 /*
  * Range tree "add" callback: insert rs into the size-sorted B-tree unless
  * the segment is shorter than (1 << mra_floor_shift).
  */
 static void
 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	uint64_t seg_size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 	uint64_t floor_size = 1ULL << mrap->mra_floor_shift;
 
 	if (seg_size >= floor_size)
 		zfs_btree_add(mrap->mra_bt, rs);
 }
 
 /*
  * Range tree "remove" callback: delete rs from the size-sorted B-tree.
  * Segments shorter than (1 << mra_floor_shift) are skipped, mirroring the
  * check in metaslab_rt_add().
  */
 static void
 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	uint64_t seg_size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
 	uint64_t floor_size = 1ULL << mrap->mra_floor_shift;
 
 	if (seg_size >= floor_size)
 		zfs_btree_remove(mrap->mra_bt, rs);
 }
 
 /*
  * Range tree "vacate" callback: empty and destroy the size-sorted B-tree,
  * then create it again through metaslab_rt_create().
  */
 static void
 metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 
 	zfs_btree_clear(mrap->mra_bt);
 	zfs_btree_destroy(mrap->mra_bt);
 
 	metaslab_rt_create(rt, arg);
 }
 
 /*
  * Callback table that keeps a size-sorted B-tree in step with a range tree;
  * metaslab_load_impl() installs it as rt_ops on ms_allocatable.
  */
 static const range_tree_ops_t metaslab_rt_ops = {
 	.rtop_create = metaslab_rt_create,
 	.rtop_destroy = metaslab_rt_destroy,
 	.rtop_add = metaslab_rt_add,
 	.rtop_remove = metaslab_rt_remove,
 	.rtop_vacate = metaslab_rt_vacate
 };
 
 /*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 
 /*
  * Return the maximum contiguous segment within the metaslab.
  *
  * The answer is the length of the last entry of ms_allocatable_by_size.
  * If that tree is empty it is first reloaded from ms_allocatable; 0 is
  * returned when it is still empty afterwards.
  */
 uint64_t
 metaslab_largest_allocatable(metaslab_t *msp)
 {
 	/* Address of a field embedded in *msp, so it is never NULL. */
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	range_seg_t *rs;
 
 	if (zfs_btree_numnodes(t) == 0)
 		metaslab_size_tree_full_load(msp->ms_allocatable);
 
 	rs = zfs_btree_last(t, NULL);
 	if (rs == NULL)
 		return (0);
 
 	return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
 	    msp->ms_allocatable));
 }
 
 /*
  * Estimate the largest contiguous segment among this metaslab's unflushed
  * frees that is not still held back by the defer or freed trees. Returns 0
  * when there are no unflushed frees, or when the largest segment begins
  * inside a deferred range.
  */
 static uint64_t
 metaslab_largest_unflushed_free(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	range_tree_t *frees = msp->ms_unflushed_frees;
 	if (frees == NULL)
 		return (0);
 
 	zfs_btree_t *by_size = &msp->ms_unflushed_frees_by_size;
 	if (zfs_btree_numnodes(by_size) == 0)
 		metaslab_size_tree_full_load(frees);
 
 	range_seg_t *largest = zfs_btree_last(by_size, NULL);
 	if (largest == NULL)
 		return (0);
 
 	/*
 	 * A range freed from the metaslab lands in both the unflushed frees
 	 * and the deferred frees. It becomes usable eventually, but if the
 	 * metaslab were loaded now the range would not enter ms_allocatable
 	 * until TXG_DEFER_SIZE txgs had passed. An upper bound on the largest
 	 * currently-usable free segment therefore must not count anything
 	 * that is still in a defer tree. We approximate by taking only the
 	 * first chunk of the largest unflushed-free segment that precedes any
 	 * overlap. The estimate can be poor, but only briefly: it corrects
 	 * itself once the frees stop being deferred. The same reasoning
 	 * applies to the ms_freed tree. See metaslab_load() for more details.
 	 *
 	 * Two inaccuracies are accepted for performance reasons. First, only
 	 * the largest segment is checked for overlaps, although a smaller
 	 * segment might overlap more favorably and yield a larger usable
 	 * chunk. Second, only the first chunk of the largest segment is
 	 * considered; later usable chunks in that segment are ignored.
 	 */
 	uint64_t rstart = rs_get_start(largest, frees);
 	uint64_t rsize = rs_get_end(largest, frees) - rstart;
 	uint64_t ov_start, ov_size;
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		ov_start = 0;
 		ov_size = 0;
 		if (!range_tree_find_in(msp->ms_defer[t], rstart, rsize,
 		    &ov_start, &ov_size))
 			continue;
 
 		/* The segment starts inside a deferred range. */
 		if (ov_start == rstart)
 			return (0);
 
 		/* Keep only the part in front of the overlap. */
 		rsize = ov_start - rstart;
 	}
 
 	ov_start = 0;
 	ov_size = 0;
 	if (range_tree_find_in(msp->ms_freed, rstart, rsize, &ov_start,
 	    &ov_size))
 		rsize = ov_start - rstart;
 
 	return (rsize);
 }
 
 /*
  * Look up the segment [start, start + size) in B-tree t, using rt to encode
  * the search key. Returns the matching entry, or, when there is no match,
  * the entry that follows the position reported by zfs_btree_find() (which
  * may be NULL). *where is updated to the position of the returned entry.
  */
 static range_seg_t *
 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
     uint64_t size, zfs_btree_index_t *where)
 {
 	range_seg_max_t key;
 
 	rs_set_start(&key, rt, start);
 	rs_set_end(&key, rt, start + size);
 
 	range_seg_t *match = zfs_btree_find(t, &key, where);
 	if (match != NULL)
 		return (match);
 
 	return (zfs_btree_next(t, where, where));
 }
 
 /*
  * Allocator helper: starting at *cursor (or at rt->rt_start when *cursor is
  * 0), scan rt's B-tree (rt_root) for the first segment that can hold size
  * bytes. The scan stops once it has moved more than max_search bytes past
  * the first segment examined and has also looked at
  * metaslab_min_search_count segments.
  *
  * On success *cursor is moved just past the chosen block and the block's
  * offset is returned. On failure *cursor is reset to 0 and -1ULL is
  * returned.
  */
 static uint64_t
 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
     uint64_t max_search)
 {
 	zfs_btree_t *bt = &rt->rt_root;
 	zfs_btree_index_t where;
 
 	if (*cursor == 0)
 		*cursor = rt->rt_start;
 
 	range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
 	uint64_t first_found = (rs != NULL) ? rs_get_start(rs, rt) : 0;
 
 	for (int searched = 0; rs != NULL; searched++) {
 		uint64_t offset = rs_get_start(rs, rt);
 
 		/* Give up only when both search limits are exhausted. */
 		if (offset - first_found > max_search &&
 		    searched >= metaslab_min_search_count)
 			break;
 
 		if (offset + size <= rs_get_end(rs, rt)) {
 			*cursor = offset + size;
 			return (offset);
 		}
 		rs = zfs_btree_next(bt, &where, &where);
 	}
 
 	*cursor = 0;
 	return (-1ULL);
 }
 
 /* Allocation routines referenced by the table below. */
 static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
 static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
 static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
 metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 /*
  * Table of the available block allocators, pairing each name with its
  * allocation routine. spa_find_allocator_byname() returns indexes into
  * this table and metaslab_allocator() returns a pointer to one entry.
  */
 static metaslab_ops_t metaslab_allocators[] = {
 	{ "dynamic", metaslab_df_alloc },
 	{ "cursor", metaslab_cf_alloc },
 	{ "new-dynamic", metaslab_ndf_alloc },
 };
 
 /*
  * Map an allocator name to its index in metaslab_allocators[]. Returns -1
  * when the name is unknown, and also for "new-dynamic", which is rejected
  * explicitly.
  */
 static int
 spa_find_allocator_byname(const char *val)
 {
 	int idx = ARRAY_SIZE(metaslab_allocators);
 
 	if (strcmp("new-dynamic", val) == 0)
 		return (-1); /* remove when ndf is working */
 
 	/* Scan the table from its last entry down to the first. */
 	while (idx-- > 0) {
 		if (strcmp(val, metaslab_allocators[idx].msop_name) == 0)
 			return (idx);
 	}
 	return (-1);
 }
 
 /*
  * Select the active allocator of spa by name and log the choice. Names that
  * spa_find_allocator_byname() rejects fall back to table entry 0.
  */
 void
 spa_set_allocator(spa_t *spa, const char *allocator)
 {
 	int idx = spa_find_allocator_byname(allocator);
 
 	if (idx < 0)
 		idx = 0;
 
 	spa->spa_active_allocator = idx;
 	zfs_dbgmsg("spa allocator: %s", metaslab_allocators[idx].msop_name);
 }
 
 /*
  * Return the index stored by spa_set_allocator().
  */
 int
 spa_get_allocator(spa_t *spa)
 {
 	int active = spa->spa_active_allocator;
 
 	return (active);
 }
 
 #if defined(_KERNEL)
 /*
  * Validate an allocator name and, if it is accepted, point
  * zfs_active_allocator at the matching table entry's name.
  *
  * Returns 0 on success, or EINVAL when val is NULL or is not accepted by
  * spa_find_allocator_byname().
  */
 int
 param_set_active_allocator_common(const char *val)
 {
 	if (val == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Cut the string at its first newline, if there is one. Note that
 	 * this writes into the caller's buffer even though val is declared
 	 * const.
 	 */
 	char *newline = strchr(val, '\n');
 	if (newline != NULL)
 		*newline = '\0';
 
 	int idx = spa_find_allocator_byname(val);
 	if (idx < 0)
 		return (SET_ERROR(EINVAL));
 
 	zfs_active_allocator = metaslab_allocators[idx].msop_name;
 	return (0);
 }
 #endif
 
 /*
  * Return the metaslab_allocators[] entry selected for spa.
  */
 metaslab_ops_t *
 metaslab_allocator(spa_t *spa)
 {
 	return (&metaslab_allocators[spa_get_allocator(spa)]);
 }
 
 /*
  * ==========================================================================
  * Dynamic Fit (df) block allocator
  *
  * Search for a free chunk of at least this size, starting from the last
  * offset (for this alignment of block) looking for up to
  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
  * found within 16MB, then return a free chunk of exactly the requested size (or
  * larger).
  *
  * If it seems like searching from the last offset will be unproductive, skip
  * that and just return a free chunk of exactly the requested size (or larger).
  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
  * mechanism is probably not very useful and may be removed in the future.
  *
  * The behavior when not searching can be changed to return the largest free
  * chunk, instead of a free chunk of exactly the requested size, by setting
  * metaslab_df_use_largest_segment.
  * ==========================================================================
  */
 static uint64_t
 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
 
 	/*
 	 * Find the largest power of 2 that evenly divides the requested
 	 * size, and use the cursor bucket that belongs to it. This tries to
 	 * place blocks with similar alignment in the same area of the
 	 * metaslab, but it does not guarantee that allocations of other
 	 * sizes will not exist in the same region.
 	 */
 	uint64_t align = size & -size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 	uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * When space is running low, skip the offset-based search and go
 	 * straight to picking a segment by size.
 	 */
 	boolean_t low_space =
 	    metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct;
 
 	if (!low_space) {
 		uint64_t found = metaslab_block_picker(rt, cursor, size,
 		    metaslab_df_max_search);
 		if (found != -1ULL)
 			return (found);
 	}
 
 	/* Size-based fallback; make sure the size-sorted tree is loaded. */
 	if (zfs_btree_numnodes(by_size) == 0)
 		metaslab_size_tree_full_load(rt);
 
 	range_seg_t *rs;
 	if (metaslab_df_use_largest_segment) {
 		/* use largest free segment */
 		rs = zfs_btree_last(by_size, NULL);
 	} else {
 		zfs_btree_index_t where;
 		/* use segment of this size, or next largest */
 		rs = metaslab_block_find(by_size, rt, msp->ms_start, size,
 		    &where);
 	}
 
 	if (rs == NULL || rs_get_start(rs, rt) + size > rs_get_end(rs, rt))
 		return (-1ULL);
 
 	uint64_t offset = rs_get_start(rs, rt);
 	*cursor = offset + size;
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * Cursor fit block allocator -
  * Select the largest region in the metaslab, set the cursor to the beginning
  * of the range and the cursor_end to the end of the range. As allocations
  * are made advance the cursor. Continue allocating from the cursor until
  * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
 	uint64_t *cursor = &msp->ms_lbas[0];
 	uint64_t *cursor_end = &msp->ms_lbas[1];
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(*cursor_end, >=, *cursor);
 
 	/*
 	 * The current window cannot hold the request: switch to the largest
 	 * free segment, or fail if even that one is too small.
 	 */
 	if (*cursor + size > *cursor_end) {
 		if (zfs_btree_numnodes(by_size) == 0)
 			metaslab_size_tree_full_load(rt);
 
 		range_seg_t *largest = zfs_btree_last(by_size, NULL);
 		if (largest == NULL)
 			return (-1ULL);
 
 		uint64_t seg_start = rs_get_start(largest, rt);
 		uint64_t seg_end = rs_get_end(largest, rt);
 		if (seg_end - seg_start < size)
 			return (-1ULL);
 
 		*cursor = seg_start;
 		*cursor_end = seg_end;
 	}
 
 	/* Hand out the block at the cursor and advance past it. */
 	uint64_t offset = *cursor;
 	*cursor = offset + size;
 
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * New dynamic fit allocator -
  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
  * contiguous blocks. If no region is found then just use the largest segment
  * that remains.
  * ==========================================================================
  */
 
 /*
  * Determines the desired number of contiguous blocks
  * (2^metaslab_ndf_clump_shift) to request from the allocator.
  */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 /*
  * Allocate size bytes from msp using the "new-dynamic" policy. Returns the
  * offset of the chosen block, or -1ULL when no segment is large enough.
  *
  * NOTE(review): spa_find_allocator_byname() currently rejects the
  * "new-dynamic" name ("remove when ndf is working"), so this routine does
  * not appear to be selectable by name.
  */
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
 	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
 	range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_index_t where;
 	range_seg_t *rs;
 	range_seg_max_t rsearch;
 	/* The cursor bucket is chosen by the highest set bit of size. */
 	uint64_t hbit = highbit64(size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 	uint64_t max_size = metaslab_largest_allocatable(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* Not even the largest segment can hold the request. */
 	if (max_size < size)
 		return (-1ULL);
 
 	/* First try the segment found at the cursor in the range tree. */
 	rs_set_start(&rsearch, rt, *cursor);
 	rs_set_end(&rsearch, rt, *cursor + size);
 
 	rs = zfs_btree_find(t, &rsearch, &where);
 	if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
 		/*
 		 * Nothing usable at the cursor: search the size-sorted tree
 		 * with a key of start 0 and a length of
 		 * 2^(hbit + metaslab_ndf_clump_shift), capped at max_size,
 		 * taking the next entry when there is no exact match.
 		 */
 		t = &msp->ms_allocatable_by_size;
 
 		rs_set_start(&rsearch, rt, 0);
 		rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
 		    metaslab_ndf_clump_shift)));
 
 		rs = zfs_btree_find(t, &rsearch, &where);
 		if (rs == NULL)
 			rs = zfs_btree_next(t, &where, &where);
 		ASSERT(rs != NULL);
 	}
 
 	/* Allocate from the start of the segment and advance the cursor. */
 	if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
 		*cursor = rs_get_start(rs, rt) + size;
 		return (rs_get_start(rs, rt));
 	}
 	return (-1ULL);
 }
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
 
 /*
  * Block on ms_load_cv until no load of this metaslab is in progress. The
  * caller must hold ms_lock.
  */
 static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	for (;;) {
 		if (!msp->ms_loading)
 			return;
 
 		ASSERT(!msp->ms_loaded);
 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 	}
 }
 
 /*
  * Block on ms_flush_cv until no flush of this metaslab is in progress. The
  * caller must hold ms_lock.
  */
 static void
 metaslab_flush_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	for (;;) {
 		if (!msp->ms_flushing)
 			return;
 
 		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 	}
 }
 
 /*
  * Multilist index function: pick the sublist for a metaslab from its ms_id.
  */
 static unsigned int
 metaslab_idx_func(multilist_t *ml, void *arg)
 {
 	metaslab_t *msp = arg;
 
 	/*
 	 * ms_id values are handed out sequentially, so a full 64-bit
 	 * division would be wasted effort; truncate the id to 32 bits first.
 	 */
 	unsigned int id32 = (unsigned int)msp->ms_id;
 
 	return (id32 % multilist_get_num_sublists(ml));
 }
 
 /*
  * Return the metaslab's ms_allocated_space counter.
  */
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
 	uint64_t allocated = msp->ms_allocated_space;
 
 	return (allocated);
 }
 
 /*
  * Verify that the space accounting on disk matches the in-core range_trees.
  * The check only runs when ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags,
  * txg is the syncing txg, and the metaslab is loaded and has a space map.
  */
 static void
 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!msp->ms_condensing);
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * The metaslab space can only be verified from syncing context, on a
 	 * loaded metaslab that has an allocated space map. Outside of syncing
 	 * context the view is not consistent, because allocations are being
 	 * performed for future txgs.
 	 */
 	if (txg != spa_syncing_txg(spa))
 		return;
 	if (msp->ms_sm == NULL || !msp->ms_loaded)
 		return;
 
 	/*
 	 * The smp_alloc field is allowed to go negative in general, but for
 	 * a metaslab's space map that should never happen.
 	 */
 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 
 	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
 	    range_tree_space(msp->ms_unflushed_frees));
 
 	ASSERT3U(metaslab_allocated_space(msp), ==,
 	    space_map_allocated(msp->ms_sm) +
 	    range_tree_space(msp->ms_unflushed_allocs) -
 	    range_tree_space(msp->ms_unflushed_frees));
 
 	uint64_t sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
 
 	/*
 	 * Space set aside for future allocations has already been deducted
 	 * from ms_allocatable, so add it back in.
 	 */
 	uint64_t allocating = 0;
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_t *pending = msp->ms_allocating[(txg + t) & TXG_MASK];
 
 		allocating += range_tree_space(pending);
 	}
 	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
 	    msp->ms_allocating_total);
 
 	ASSERT3U(msp->ms_deferspace, ==,
 	    range_tree_space(msp->ms_defer[0]) +
 	    range_tree_space(msp->ms_defer[1]));
 
 	uint64_t msp_free_space = range_tree_space(msp->ms_allocatable) +
 	    allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed);
 
 	VERIFY3U(sm_free_space, ==, msp_free_space);
 }
 
 /*
  * Zero the auxiliary histograms (ms_synchist and every ms_deferhist[]).
  */
 static void
 metaslab_aux_histograms_clear(metaslab_t *msp)
 {
 	/*
 	 * The auxiliary histograms are cleared only as part of resetting
 	 * them, and that is only possible while the metaslab is loaded.
 	 */
 	ASSERT(msp->ms_loaded);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		memset(msp->ms_deferhist[t], 0,
 		    sizeof (msp->ms_deferhist[t]));
 	}
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 }
 
 /*
  * Add the histogram of rt to the space-map-style histogram in histogram.
  *
  * Range tree bucket i (for i >= shift) is added to slot idx = i - shift
  * while idx is below the last slot. Once idx has reached
  * SPACE_MAP_HISTOGRAM_SIZE - 1 it stops advancing, and every remaining
  * bucket is folded into that last slot, scaled by 2^(i - idx - shift).
  */
 static void
 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     range_tree_t *rt)
 {
 	/*
 	 * This is modeled after space_map_histogram_add(), so refer to that
 	 * function for implementation details. We want this to work like
 	 * the space map histogram, and not the range tree histogram, as we
 	 * are essentially constructing a delta that will be later subtracted
 	 * from the space map histogram.
 	 */
 	int idx = 0;
 	for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(i, >=, idx + shift);
 		/* The shift count is 0 until idx stops advancing. */
 		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
 
 		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 			ASSERT3U(idx + shift, ==, i);
 			idx++;
 			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 		}
 	}
 }
 
 /*
  * Called at every sync pass in which the metaslab gets synced.
  *
  * The auxiliary histograms must be refreshed wherever the metaslab's space
  * map histogram is refreshed. That keeps us consistent about which parts of
  * the space map's histogram are currently unavailable for allocation (e.g.
  * because they are in the defer, freed, and freeing trees).
  */
 static void
 metaslab_aux_histograms_update(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 
 	ASSERT(sm != NULL);
 
 	/*
 	 * This mirrors the space map histogram updates performed in
 	 * metaslab_sync(), except that only segments which have not yet made
 	 * it into the ms_allocatable tree are of interest here.
 	 */
 	if (msp->ms_loaded) {
 		int t;
 
 		metaslab_aux_histograms_clear(msp);
 
 		metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift,
 		    msp->ms_freed);
 
 		for (t = 0; t < TXG_DEFER_SIZE; t++) {
 			metaslab_aux_histogram_add(msp->ms_deferhist[t],
 			    sm->sm_shift, msp->ms_defer[t]);
 		}
 	}
 
 	/* The freeing tree is added whether or not the metaslab is loaded. */
 	metaslab_aux_histogram_add(msp->ms_synchist, sm->sm_shift,
 	    msp->ms_freeing);
 }
 
 /*
  * Called every time we are done syncing (writing to) the metaslab,
  * i.e. at the end of each sync pass.
  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
  *
  * The ms_deferhist slot for the syncing txg receives a copy of ms_synchist
  * when defer_allowed is set and is zeroed otherwise; ms_synchist is then
  * cleared.
  */
 static void
 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/*
 	 * Without a space map we were called from metaslab_init() while
 	 * creating/opening a pool, for a metaslab that has not had any
 	 * allocations yet.
 	 */
 	if (msp->ms_sm == NULL)
 		return;
 
 	/*
 	 * This parallels what metaslab_sync_done() does for the ms_freed
 	 * and ms_defer trees.
 	 */
 	uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
 
 	if (!defer_allowed) {
 		memset(msp->ms_deferhist[hist_index], 0,
 		    sizeof (msp->ms_deferhist[hist_index]));
 	} else {
 		memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist,
 		    sizeof (msp->ms_synchist));
 	}
 
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 }
 
 /*
  * Ensure that the metaslab's weight and fragmentation are consistent
  * with the contents of the histogram (either the range tree's histogram
  * or the space map's depending whether the metaslab is loaded).
  *
  * The check recomputes the weight with metaslab_weight() and compares the
  * result against the values that were stored before the call. It only runs
  * when ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags.
  */
 static void
 metaslab_verify_weight_and_frag(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * We can end up here from vdev_remove_complete(), in which case we
 	 * cannot do these assertions because we hold spa config locks and
 	 * thus we are not allowed to read from the DMU.
 	 *
 	 * We check if the metaslab group has been removed and if that's
 	 * the case we return immediately as that would mean that we are
 	 * here from the aforementioned code path.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	/*
 	 * Devices being removed always return a weight of 0 and leave
 	 * fragmentation and ms_max_size as is - there is nothing for
 	 * us to verify here.
 	 */
 	vdev_t *vd = msp->ms_group->mg_vd;
 	if (vd->vdev_removing)
 		return;
 
 	/*
 	 * If the metaslab is dirty it probably means that we've done
 	 * some allocations or frees that have changed our histograms
 	 * and thus the weight.
 	 */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
 			return;
 	}
 
 	/*
 	 * This verification checks that our in-memory state is consistent
 	 * with what's on disk. If the pool is read-only then there aren't
 	 * any changes and we just have the initially-loaded state.
 	 */
 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
 		return;
 
 	/* some extra verification for in-core tree if you can */
 	if (msp->ms_loaded) {
 		range_tree_stat_verify(msp->ms_allocatable);
 		VERIFY(space_map_histogram_verify(msp->ms_sm,
 		    msp->ms_allocatable));
 	}
 
 	/*
 	 * Remember the current values so they can be compared with (or
 	 * restored over) the recomputed ones below.
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
 	uint64_t frag = msp->ms_fragmentation;
 	uint64_t max_segsize = msp->ms_max_size;
 
 	msp->ms_weight = 0;
 	msp->ms_fragmentation = 0;
 
 	/*
 	 * This function is used for verification purposes and thus should
 	 * not introduce any side-effects/mutations on the system's state.
 	 *
 	 * Regardless of whether metaslab_weight() thinks this metaslab
 	 * should be active or not, we want to ensure that the actual weight
 	 * (and therefore the value of ms_weight) would be the same if it
 	 * was to be recalculated at this point.
 	 *
 	 * In addition we set the nodirty flag so metaslab_weight() does
 	 * not dirty the metaslab for future TXGs (e.g. when trying to
 	 * force condensing to upgrade the metaslab spacemaps).
 	 */
 	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
 
 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
 
 	/*
 	 * If the weight type changed then there is no point in doing
 	 * verification. Revert fields to their original values.
 	 */
 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
 		msp->ms_fragmentation = frag;
 		msp->ms_weight = weight;
 		return;
 	}
 
 	/* Same weight type: the recomputed values must match the saved ones. */
 	VERIFY3U(msp->ms_fragmentation, ==, frag);
 	VERIFY3U(msp->ms_weight, ==, weight);
 }
 
 /*
  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
  * this class that was used longest ago, and attempt to unload it.  We don't
  * want to spend too much time in this loop to prevent performance
  * degradation, and we expect that most of the time this operation will
  * succeed. Between that and the normal unloading processing during txg sync,
  * we expect this to keep the metaslab memory usage under control.
  *
  * Memory use is measured as the number of in-use zfs_btree_leaf_cache
  * entries times the cache's entry size, and is compared against
  * zfs_metaslab_mem_limit percent of arc_all_memory(). The body is compiled
  * only for _KERNEL builds.
  */
 static void
 metaslab_potentially_evict(metaslab_class_t *mc)
 {
 #ifdef _KERNEL
 	uint64_t allmem = arc_all_memory();
 	uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 	uint64_t size =	spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
 	uint_t tries = 0;
 	/*
 	 * Visit randomly chosen sublists while we are over the limit, making
 	 * at most twice as many attempts as there are sublists.
 	 */
 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
 	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
 	    tries++) {
 		unsigned int idx = multilist_get_random_index(
 		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		/*
 		 * The sublist lock is dropped here and taken again at the top
 		 * of every iteration of the inner loop.
 		 */
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
 			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
 			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
 			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
 
 			/*
 			 * Stop walking this sublist if msp is no longer
 			 * linked on it.
 			 */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				multilist_sublist_unlock(mls);
 				break;
 			}
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			/*
 			 * If the metaslab is currently loading there are two
 			 * cases. If it's the metaslab we're evicting, we
 			 * can't continue on or we'll panic when we attempt to
 			 * recursively lock the mutex. If it's another
 			 * metaslab that's loading, it can be safely skipped,
 			 * since we know it's very new and therefore not a
 			 * good eviction candidate. We check later once the
 			 * lock is held that the metaslab is fully loaded
 			 * before actually unloading it.
 			 */
 			if (msp->ms_loading) {
 				msp = next_msp;
 				inuse =
 				    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 				continue;
 			}
 			/*
 			 * We can't unload metaslabs with no spacemap because
 			 * they're not ready to be unloaded yet. We can't
 			 * unload metaslabs with outstanding allocations
 			 * because doing so could cause the metaslab's weight
 			 * to decrease while it's unloaded, which violates an
 			 * invariant that we use to prevent unnecessary
 			 * loading. We also don't unload metaslabs that are
 			 * currently active because they are high-weight
 			 * metaslabs that are likely to be used in the near
 			 * future.
 			 */
 			mutex_enter(&msp->ms_lock);
 			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
 			    msp->ms_allocating_total == 0) {
 				metaslab_unload(msp);
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 			/* Re-sample usage so the loop conditions stay current. */
 			inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 		}
 	}
 #else
 	/* Non-kernel build: nothing to evict; just reference the names. */
 	(void) mc, (void) zfs_metaslab_mem_limit;
 #endif
 }
 
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We temporarily drop the lock to unblock other operations while we
 	 * are reading the space map. Therefore, metaslab_sync() and
 	 * metaslab_sync_done() can run at the same time as we do.
 	 *
 	 * If we are using the log space maps, metaslab_sync() can't write to
 	 * the metaslab's space map while we are loading as we only write to
 	 * it when we are flushing the metaslab, and that can't happen while
 	 * we are loading it.
 	 *
 	 * If we are not using log space maps though, metaslab_sync() can
 	 * append to the space map while we are loading. Therefore we load
 	 * only entries that existed when we started the load. Additionally,
 	 * metaslab_sync_done() has to wait for the load to complete because
 	 * there are potential races like metaslab_load() loading parts of the
 	 * space map that are currently being appended by metaslab_sync(). If
 	 * we didn't, the ms_allocatable would have entries that
 	 * metaslab_sync_done() would try to re-add later.
 	 *
 	 * That's why before dropping the lock we remember the synced length
 	 * of the metaslab and read up to that point of the space map,
 	 * ignoring entries appended by metaslab_sync() that happen after we
 	 * drop the lock.
 	 */
 	uint64_t length = msp->ms_synced_length;
 	mutex_exit(&msp->ms_lock);
 
 	hrtime_t load_start = gethrtime();
 	metaslab_rt_arg_t *mrap;
 	if (msp->ms_allocatable->rt_arg == NULL) {
 		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	} else {
 		mrap = msp->ms_allocatable->rt_arg;
 		msp->ms_allocatable->rt_ops = NULL;
 		msp->ms_allocatable->rt_arg = NULL;
 	}
 	mrap->mra_bt = &msp->ms_allocatable_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 
 	if (msp->ms_sm != NULL) {
 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 		    SM_FREE, length);
 
 		/* Now, populate the size-sorted tree. */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 
 		struct mssa_arg arg = {0};
 		arg.rt = msp->ms_allocatable;
 		arg.mra = mrap;
 		range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
 		    &arg);
 	} else {
 		/*
 		 * Add the size-sorted tree first, since we don't need to load
 		 * the metaslab from the spacemap.
 		 */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 		/*
 		 * The space map has not been allocated yet, so treat
 		 * all the space in the metaslab as free and add it to the
 		 * ms_allocatable tree.
 		 */
 		range_tree_add(msp->ms_allocatable,
 		    msp->ms_start, msp->ms_size);
 
 		if (msp->ms_new) {
 			/*
 			 * If the ms_sm doesn't exist, this means that this
 			 * metaslab hasn't gone through metaslab_sync() and
 			 * thus has never been dirtied. So we shouldn't
 			 * expect any unflushed allocs or frees from previous
 			 * TXGs.
 			 */
 			ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 			ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 		}
 	}
 
 	/*
 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
 	 * changing the ms_sm (or log_sm) and the metaslab's range trees
 	 * while we are about to use them and populate the ms_allocatable.
 	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
 	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
 	 */
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	ASSERT(!msp->ms_condensing);
 	ASSERT(!msp->ms_flushing);
 
 	if (error != 0) {
 		mutex_exit(&msp->ms_sync_lock);
 		return (error);
 	}
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	msp->ms_loaded = B_TRUE;
 
 	/*
 	 * Apply all the unflushed changes to ms_allocatable right
 	 * away so any manipulations we do below have a clear view
 	 * of what is allocated and what is free.
 	 */
 	range_tree_walk(msp->ms_unflushed_allocs,
 	    range_tree_remove, msp->ms_allocatable);
 	range_tree_walk(msp->ms_unflushed_frees,
 	    range_tree_add, msp->ms_allocatable);
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	if (spa_syncing_log_sm(spa) != NULL) {
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LOG_SPACEMAP));
 
 		/*
 		 * If we use a log space map we add all the segments
 		 * that are in ms_unflushed_frees so they are available
 		 * for allocation.
 		 *
 		 * ms_allocatable needs to contain all free segments
 		 * that are ready for allocations (thus not segments
 		 * from ms_freeing, ms_freed, and the ms_defer trees).
 		 * But if we grab the lock in this code path at a sync
 		 * pass later than 1, then it also contains the
 		 * segments of ms_freed (they were added to it earlier
 		 * in this path through ms_unflushed_frees). So we
 		 * need to remove all the segments that exist in
 		 * ms_freed from ms_allocatable as they will be added
 		 * later in metaslab_sync_done().
 		 *
 		 * When there's no log space map, the ms_allocatable
 		 * correctly doesn't contain any segments that exist
 		 * in ms_freed [see ms_synced_length].
 		 */
 		range_tree_walk(msp->ms_freed,
 		    range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * If we are not using the log space map, ms_allocatable
 	 * contains the segments that exist in the ms_defer trees
 	 * [see ms_synced_length]. Thus we need to remove them
 	 * from ms_allocatable as they will be added again in
 	 * metaslab_sync_done().
 	 *
 	 * If we are using the log space map, ms_allocatable still
 	 * contains the segments that exist in the ms_defer trees.
 	 * Not because it read them through the ms_sm though. But
 	 * because these segments are part of ms_unflushed_frees
 	 * whose segments we add to ms_allocatable earlier in this
 	 * code path.
 	 */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * Call metaslab_recalculate_weight_and_sort() now that the
 	 * metaslab is loaded so we get the metaslab's real weight.
 	 *
 	 * Unless this metaslab was created with older software and
 	 * has not yet been converted to use segment-based weight, we
 	 * expect the new weight to be better or equal to the weight
 	 * that the metaslab had while it was not loaded. This is
 	 * because the old weight does not take into account the
 	 * consolidation of adjacent segments between TXGs. [see
 	 * comment for ms_synchist and ms_deferhist[] for more info]
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t max_size = msp->ms_max_size;
 	metaslab_recalculate_weight_and_sort(msp);
 	if (!WEIGHT_IS_SPACEBASED(weight))
 		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	ASSERT3U(max_size, <=, msp->ms_max_size);
 	hrtime_t load_end = gethrtime();
 	msp->ms_load_time = load_end;
 	zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, smp_length %llu, "
 	    "unflushed_allocs %llu, unflushed_frees %llu, "
 	    "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
 	    "loading_time %lld ms, ms_max_size %llu, "
 	    "max size error %lld, "
 	    "old_weight %llx, new_weight %llx",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
 	    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
 	    (u_longlong_t)range_tree_space(msp->ms_freed),
 	    (u_longlong_t)range_tree_space(msp->ms_defer[0]),
 	    (u_longlong_t)range_tree_space(msp->ms_defer[1]),
 	    (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
 	    (longlong_t)((load_end - load_start) / 1000000),
 	    (u_longlong_t)msp->ms_max_size,
 	    (u_longlong_t)msp->ms_max_size - max_size,
 	    (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
 
 	metaslab_verify_space(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_sync_lock);
 	return (0);
 }
 
 /*
  * Make sure the metaslab is loaded, performing the load ourselves unless
  * it is already loaded once any in-progress load has been waited for.
  * The caller must hold ms_lock. Returns 0 when the metaslab was already
  * loaded, otherwise whatever metaslab_load_impl() returns.
  */
 int
 metaslab_load(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Another thread may already be loading this metaslab. If so, wait
 	 * for it to finish; if it succeeded there is nothing left to do.
 	 */
 	metaslab_load_wait(msp);
 	if (msp->ms_loaded)
 		return (0);
 	VERIFY(!msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * Raise the loading flag BEFORE we can possibly drop the lock while
 	 * waiting for an ongoing flush (see ms_flushing below), so that
 	 * other threads can tell a load is already under way.
 	 */
 	msp->ms_loading = B_TRUE;
 
 	/*
 	 * Let any in-progress flush finish first: the ms_lock is dropped
 	 * both here (during space_map_load()) and in metaslab_flush() (when
 	 * the changes are flushed to the ms_sm).
 	 */
 	if (msp->ms_flushing)
 		metaslab_flush_wait(msp);
 
 	/*
 	 * The ms_lock may have been dropped temporarily while waiting for
 	 * the flush; make sure nobody loaded the metaslab in the meantime.
 	 */
 	ASSERT(!msp->ms_loaded);
 
 	/*
 	 * When loading a metaslab of the normal class, consider evicting
 	 * another one so memory usage stays under the limit defined by the
 	 * zfs_metaslab_mem_limit tunable.
 	 */
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	if (spa_normal_class(mc->mc_spa) == mc)
 		metaslab_potentially_evict(mc);
 
 	int err = metaslab_load_impl(msp);
 
 	/* Whatever the outcome, clear the flag and wake any waiters. */
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
 
 	return (err);
 }
 
 /*
  * Discard the in-core allocatable tree of a loaded metaslab and clear its
  * activation state. The caller must hold ms_lock. Does nothing if the
  * metaslab is not loaded.
  */
 void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * A metaslab can be selected for eviction (in
 	 * metaslab_potentially_evict) and then get unloaded during spa_sync
 	 * (via metaslab_class_evict_old), so it may already be unloaded.
 	 */
 	if (!msp->ms_loaded)
 		return;
 
 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
 	msp->ms_unload_time = gethrtime();
 
 	msp->ms_activation_weight = 0;
 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
 	/*
 	 * ms_group == NULL means that we came here from metaslab_fini(), at
 	 * which point there is no txg list to leave and it doesn't make
 	 * sense to recalculate the weight or re-sort the metaslab.
 	 */
 	metaslab_group_t *mg = msp->ms_group;
 	if (mg == NULL)
 		return;
 
 	multilist_sublist_t *mls = multilist_sublist_lock_obj(
 	    &mg->mg_class->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(mls, msp);
 	multilist_sublist_unlock(mls);
 
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, weight %llx, "
 	    "selected txg %llu (%llu ms ago), alloc_txg %llu, "
 	    "loaded %llu ms ago, max_size %llu",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)mg->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)msp->ms_weight,
 	    (u_longlong_t)msp->ms_selected_txg,
 	    (u_longlong_t)(msp->ms_unload_time -
 	    msp->ms_selected_time) / 1000 / 1000,
 	    (u_longlong_t)msp->ms_alloc_txg,
 	    (u_longlong_t)(msp->ms_unload_time -
 	    msp->ms_load_time) / 1000 / 1000,
 	    (u_longlong_t)msp->ms_max_size);
 
 	/*
 	 * Explicitly recalculate the weight from the space map now that the
 	 * metaslab is no longer loaded. Unloaded metaslabs always have their
 	 * weights calculated from the space map histograms, while loaded
 	 * ones have them calculated from their in-core range tree [see
 	 * metaslab_load()]. This way the weight reflects the information
 	 * available in-core, whether the metaslab is loaded or not.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 }
 
 /*
  * Choose the segment representation for a metaslab's range trees.
  *
  * To reduce the memory used by the per-metaslab range trees, segments can
  * be stored in units of sectors, zero-indexed from the start of the
  * metaslab. When vdev_ms_shift - vdev_ashift is less than 32, a range
  * fits in two uint32_ts rather than two uint64_ts.
  *
  * On return *start and *shift hold the offset and shift to hand to
  * range_tree_create(): (ms_start, vdev_ashift) for RANGE_SEG32 and (0, 0)
  * for RANGE_SEG64. Setting zfs_metaslab_force_large_segs always selects
  * RANGE_SEG64.
  */
 range_seg_type_t
 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
     uint64_t *start, uint64_t *shift)
 {
 	if (vdev->vdev_ms_shift - vdev->vdev_ashift >= 32 ||
 	    zfs_metaslab_force_large_segs) {
 		*shift = 0;
 		*start = 0;
 		return (RANGE_SEG64);
 	}
 
 	*shift = vdev->vdev_ashift;
 	*start = msp->ms_start;
 	return (RANGE_SEG32);
 }
 
 /*
  * Record the txg (and the current time) at which this metaslab was
  * selected, and move it to the tail of its sublist in the class's
  * mc_metaslab_txg_list. The caller must hold ms_lock.
  */
 void
 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	multilist_sublist_t *sublist = multilist_sublist_lock_obj(
 	    &msp->ms_group->mg_class->mc_metaslab_txg_list, msp);
 
 	/* Unlink first if already present, so the insert below re-queues. */
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(sublist, msp);
 
 	msp->ms_selected_txg = txg;
 	msp->ms_selected_time = gethrtime();
 
 	multilist_sublist_insert_tail(sublist, msp);
 	multilist_sublist_unlock(sublist);
 }
 
 /*
  * Apply the given allocation, deferred and total space deltas to the vdev
  * and then to the metaslab class. The class additionally receives the
  * deflated form of space_delta as computed by vdev_deflated_space().
  */
 void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
 {
 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
 
 	/* vd is expected to sit directly under the root vdev. */
 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
 	ASSERT(vd->vdev_ms_count != 0);
 
 	int64_t dspace_delta = vdev_deflated_space(vd, space_delta);
 
 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
 	    dspace_delta);
 }
 
 /*
  * Allocate and initialize the in-core metaslab with the given id for
  * metaslab group mg.
  *
  * mg     - group (and thereby vdev) the metaslab belongs to
  * id     - metaslab index within the vdev; determines ms_start
  * object - space map object number, or 0 if there is none to open
  * txg    - 0 when opening an existing pool, TXG_INITIAL when creating a
  *          new one, otherwise the txg in which the space is being added
  * msp    - on success, set to the new metaslab
  *
  * Returns 0 on success, or the error from space_map_open(), in which
  * case nothing is left allocated and *msp is untouched.
  */
 int
 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
     uint64_t txg, metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	metaslab_t *ms;
 	int error;
 
 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&ms->ms_class_txg_node);
 
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 	ms->ms_allocator = -1;
 	ms->ms_new = B_TRUE;
 
 	/* Let the vdev type adjust the metaslab's start and size. */
 	vdev_ops_t *ops = vd->vdev_ops;
 	if (ops->vdev_op_metaslab_init != NULL)
 		ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
 
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it. For
 	 * readonly pools there is no need to open the space map object.
 	 *
 	 * Note:
 	 * When called from vdev_expand(), we can't call into the DMU as
 	 * we are holding the spa_config_lock as a writer and we would
 	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
 	 * that case, the object parameter is zero though, so we won't
 	 * call into the DMU.
 	 */
 	if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
 	    !spa->spa_read_spacemaps)) {
 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 		    ms->ms_size, vd->vdev_ashift);
 
 		if (error != 0) {
 			/*
 			 * Undo the lock and condvar initialization above
 			 * before releasing the memory, mirroring the
 			 * teardown done in metaslab_fini().
 			 */
 			cv_destroy(&ms->ms_flush_cv);
 			cv_destroy(&ms->ms_load_cv);
 			mutex_destroy(&ms->ms_sync_lock);
 			mutex_destroy(&ms->ms_lock);
 			kmem_free(ms, sizeof (metaslab_t));
 			return (error);
 		}
 
 		ASSERT(ms->ms_sm != NULL);
 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 	}
 
 	/* All of the metaslab's range trees share one segment layout. */
 	uint64_t shift, start;
 	range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 
 	ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
 	for (int t = 0; t < TXG_SIZE; t++) {
 		ms->ms_allocating[t] = range_tree_create(NULL, type,
 		    NULL, start, shift);
 	}
 	ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
 		    start, shift);
 	}
 	ms->ms_checkpointing =
 	    range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_unflushed_allocs =
 	    range_tree_create(NULL, type, NULL, start, shift);
 
 	/*
 	 * ms_unflushed_frees is the only tree created with metaslab_rt_ops;
 	 * its argument points at the ms_unflushed_frees_by_size tree.
 	 */
 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 	ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
 	    type, mrap, start, shift);
 
 	ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
 
 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
 	 * The metaslab's weight will also be initialized when we sync
 	 * out this txg. This ensures that we don't attempt to allocate
 	 * from it before we have initialized it completely.
 	 */
 	if (txg <= TXG_INITIAL) {
 		metaslab_sync_done(ms, 0);
 		metaslab_space_update(vd, mg->mg_class,
 		    metaslab_allocated_space(ms), 0, 0);
 	}
 
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
 	*msp = ms;
 
 	return (0);
 }
 
 /*
  * Drop the log space map bookkeeping kept for a metaslab that is going
  * away: remove it from spa_metaslabs_by_flushed and decrement the
  * per-log and per-summary metaslab counts for its unflushed txg. A
  * metaslab whose unflushed txg is 0 has nothing to undo.
  */
 static void
 metaslab_fini_flush_data(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t unflushed_txg = metaslab_unflushed_txg(msp);
 
 	if (unflushed_txg == 0) {
 		ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
 		    ==, NULL);
 		return;
 	}
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	spa_log_sm_decrement_mscount(spa, unflushed_txg);
 	spa_log_summary_decrement_mscount(spa, unflushed_txg,
 	    metaslab_unflushed_dirty(msp));
 }
 
 /*
  * Return the combined segment count of the ms_unflushed_allocs and
  * ms_unflushed_frees trees multiplied by the element size of the
  * ms_unflushed_allocs tree.
  */
 uint64_t
 metaslab_unflushed_changes_memused(metaslab_t *ms)
 {
 	uint64_t nsegs = range_tree_numsegs(ms->ms_unflushed_allocs);
 
 	nsegs += range_tree_numsegs(ms->ms_unflushed_frees);
 
 	return (nsegs * ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 }
 
 /*
  * Tear down a metaslab: drop its log space map bookkeeping, remove it
  * from its group, give back the space it accounted for, close its space
  * map, destroy all of its range trees and synchronization objects, and
  * free the metaslab itself.
  */
 void
 metaslab_fini(metaslab_t *msp)
 {
 	/*
 	 * Capture the group, vdev and spa up front; ms_group is verified to
 	 * be NULL once the metaslab has been removed from its group below.
 	 */
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 
 	metaslab_fini_flush_data(msp);
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 	VERIFY(msp->ms_group == NULL);
 
 	/*
 	 * If this metaslab hasn't been through metaslab_sync_done() yet its
 	 * space hasn't been accounted for in its vdev and doesn't need to be
 	 * subtracted.
 	 */
 	if (!msp->ms_new) {
 		metaslab_space_update(vd, mg->mg_class,
 		    -metaslab_allocated_space(msp), 0, -msp->ms_size);
 
 	}
 	space_map_close(msp->ms_sm);
 	msp->ms_sm = NULL;
 
 	/*
 	 * With ms_group == NULL, metaslab_unload() skips the txg list
 	 * removal and the weight recalculation.
 	 */
 	metaslab_unload(msp);
 
 	range_tree_destroy(msp->ms_allocatable);
 	range_tree_destroy(msp->ms_freeing);
 	range_tree_destroy(msp->ms_freed);
 
 	/*
 	 * Subtract this metaslab's unflushed changes from the pool-wide
 	 * memory accounting before the trees holding them are emptied.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	range_tree_destroy(msp->ms_unflushed_allocs);
 	range_tree_destroy(msp->ms_checkpointing);
 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 	range_tree_destroy(msp->ms_unflushed_frees);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		range_tree_destroy(msp->ms_allocating[t]);
 	}
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_destroy(msp->ms_defer[t]);
 	}
 	ASSERT0(msp->ms_deferspace);
 
 	/* The metaslab must no longer be on any of the vdev's txg lists. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
 	range_tree_vacate(msp->ms_trim, NULL, NULL);
 	range_tree_destroy(msp->ms_trim);
 
 	/* Release the lock before destroying the synchronization objects. */
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	cv_destroy(&msp->ms_flush_cv);
 	mutex_destroy(&msp->ms_lock);
 	mutex_destroy(&msp->ms_sync_lock);
 	/* -1 is the value metaslab_init() gives ms_allocator. */
 	ASSERT3U(msp->ms_allocator, ==, -1);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
 #define	FRAGMENTATION_TABLE_SIZE	17
 
 /*
  * Segment-size based fragmentation metric. Each metaslab derives its own
  * fragmentation value from this table: the space in every bucket of the
  * space map histogram is multiplied by the table entry for that bucket,
  * the products are summed, and the sum is divided by the total free space
  * in the metaslab (i.e. the free space across all buckets). A high result
  * therefore means that most of the free space is made up of small
  * segments, while a low result means most of it is in large segments. A
  * 10% change in fragmentation equates to approximately double the number
  * of segments.
  *
  * 16MB segments are defined as 0% fragmented: testing has shown that
  * segments greater than or equal to 16MB do not suffer from drastic
  * performance problems. The rest of the table is derived from that value.
  * Since the fragmentation value is never stored on disk, these numbers
  * can be changed in the future.
  *
  * Entry n covers segments of 512B << n. metaslab_set_fragmentation()
  * clamps its index to the final entry, so that entry also stands in for
  * every larger segment size.
  */
 static const int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
 	100,	/* 512B	*/
 	100,	/* 1K	*/
 	98,	/* 2K	*/
 	95,	/* 4K	*/
 	90,	/* 8K	*/
 	80,	/* 16K	*/
 	70,	/* 32K	*/
 	60,	/* 64K	*/
 	50,	/* 128K	*/
 	40,	/* 256K	*/
 	30,	/* 512K	*/
 	20,	/* 1M	*/
 	15,	/* 2M	*/
 	10,	/* 4M	*/
 	5,	/* 8M	*/
 	0,	/* 16M	*/
 	0	/* 32M and larger */
 };
 
 /*
  * Calculate the metaslab's fragmentation metric and store it in
  * ms_fragmentation. ZFS_FRAG_INVALID is stored when the metric is not
  * supported: either the space map histogram feature is not enabled or
  * the metaslab's space map has not been upgraded. Otherwise the stored
  * value is in the range [0, 100].
  *
  * When nodirty is set, a metaslab whose space map needs upgrading is not
  * marked for condensing (and thus nothing is dirtied).
  */
 static void
 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * A null space map means that the entire metaslab is free
 	 * and thus is not fragmented.
 	 */
 	space_map_t *sm = msp->ms_sm;
 	if (sm == NULL) {
 		msp->ms_fragmentation = 0;
 		return;
 	}
 
 	/*
 	 * If this metaslab's space map has not been upgraded, flag it
 	 * so that we upgrade next time we encounter it.
 	 */
 	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 		vdev_t *vd = msp->ms_group->mg_vd;
 		uint64_t txg = spa_syncing_txg(spa);
 
 		/*
 		 * Reaching the final dirty txg means the pool must be
 		 * shutting down. Nothing may be dirtied past that point,
 		 * so the condense flag is left alone and the upgrade is
 		 * retried the next time the pool is imported. The same
 		 * applies when the caller has explicitly set nodirty.
 		 */
 		if (!nodirty &&
 		    spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
 			msp->ms_condense_wanted = B_TRUE;
 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 			zfs_dbgmsg("txg %llu, requesting force condense: "
 			    "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
 			    (u_longlong_t)msp->ms_id,
 			    (u_longlong_t)vd->vdev_id);
 		}
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * Weight the space in every non-empty histogram bucket by its
 	 * table entry; the metric is the space-weighted average.
 	 */
 	uint8_t shift = sm->sm_shift;
 	uint64_t weighted = 0;
 	uint64_t total = 0;
 
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t nsegs = sm->sm_phys->smp_histogram[i];
 
 		if (nsegs == 0)
 			continue;
 
 		/* Buckets beyond the table share its final entry. */
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 
 		uint64_t space = nsegs << (i + shift);
 
 		total += space;
 		weighted += space * zfs_frag_table[idx];
 	}
 
 	if (total > 0)
 		weighted /= total;
 	ASSERT3U(weighted, <=, 100);
 
 	msp->ms_fragmentation = weighted;
 }
 
 /*
  * Compute a space-based weight -- a selection preference value -- for the
  * given metaslab. The weight is based on the amount of free space, the
  * level of fragmentation, the LBA range, and whether the metaslab is
  * loaded. The caller must hold ms_lock.
  */
 static uint64_t
 metaslab_space_weight(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* The baseline weight is the metaslab's free space. */
 	uint64_t space = msp->ms_size - metaslab_allocated_space(msp);
 
 	if (metaslab_fragmentation_factor_enabled &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 		/*
 		 * Scale the baseline down inversely to the fragmentation.
 		 * A 100% fragmented metaslab must not be excluded
 		 * completely, so the fragmentation value is reduced by 1
 		 * before it is applied.
 		 */
 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
 
 		/*
 		 * With space < SPA_MINBLOCKSIZE we would never allocate
 		 * from this metaslab again. The fragmentation metric may
 		 * have shrunk the space below SPA_MINBLOCKSIZE, so bump it
 		 * back up to SPA_MINBLOCKSIZE so that any remaining space
 		 * can still be consumed.
 		 */
 		if (space > 0 && space < SPA_MINBLOCKSIZE)
 			space = SPA_MINBLOCKSIZE;
 	}
 
 	uint64_t weight = space;
 
 	/*
 	 * Modern disks have uniform bit density and constant angular
 	 * velocity, which makes the outer recording zones faster (higher
 	 * bandwidth) than the inner zones by the ratio of outer to inner
 	 * track diameter, typically around 2:1. This is accounted for by
 	 * giving lower metaslabs a higher weight (multiplier ranging from
 	 * 2x to 1x), so that in effect the metaslab with the most free
 	 * bandwidth is selected rather than simply the one with the most
 	 * free space.
 	 */
 	if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
 		weight = 2 * space - (msp->ms_id * space) / vd->vdev_ms_count;
 		ASSERT(weight >= space && weight <= 2 * space);
 	}
 
 	/*
 	 * A metaslab that is actively in use keeps its active bits so that
 	 * it stays preferable to any inactive metaslab and gets polished
 	 * off. Once its fragmentation exceeds the threshold it is no
 	 * longer marked active.
 	 */
 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 	}
 
 	WEIGHT_SET_SPACEBASED(weight);
 	return (weight);
 }
 
 /*
  * Return the weight of the specified metaslab according to the
  * segment-based weighting algorithm. The metaslab must be loaded. This
  * can be called within a sync pass since it relies only on the
  * metaslab's range tree, which is always accurate while the metaslab is
  * loaded.
  */
 static uint64_t
 metaslab_weight_from_range_tree(metaslab_t *msp)
 {
 	ASSERT(msp->ms_loaded);
 
 	uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
 	int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 	uint32_t segments = 0;
 	uint64_t weight = 0;
 
 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
 	    i--) {
 		/*
 		 * Carry the count from the bucket above into this one:
 		 * each larger segment counts as two of this size.
 		 */
 		segments = (segments << 1) +
 		    msp->ms_allocatable->rt_histogram[i];
 
 		/*
 		 * The range tree provides more precision than the space
 		 * map and must be downgraded so that all values fit within
 		 * the space map's histogram. This makes loaded and
 		 * unloaded metaslabs comparable when deciding which one is
 		 * "best". Buckets above max_idx are therefore only
 		 * accumulated, and empty buckets are passed over.
 		 */
 		if (i > max_idx || segments == 0)
 			continue;
 
 		WEIGHT_SET_COUNT(weight, segments);
 		WEIGHT_SET_INDEX(weight, i);
 		WEIGHT_SET_ACTIVE(weight, 0);
 		break;
 	}
 	return (weight);
 }
 
 /*
  * Calculate the weight based on the on-disk histogram. This should only
  * be applied to unloaded metaslabs (i.e. ones with no incoming
  * allocations) in order to give results consistent with the on-disk
  * state.
  */
 static uint64_t
 metaslab_weight_from_spacemap(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	uint64_t weight = 0;
 
 	ASSERT(!msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(space_map_object(sm), !=, 0);
 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/*
 	 * Some segments have made it into the space map's histogram but
 	 * are not yet available for allocation because they are still in
 	 * the freeing pipeline; they are tracked by ms_synchist and the
 	 * ms_deferhist[] histograms. Scanning from the largest bucket down,
 	 * subtract those from each space map bucket to get a more accurate
 	 * weight, and encode the first bucket that still has segments left.
 	 */
 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
 		uint64_t pending = msp->ms_synchist[i];
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++)
 			pending += msp->ms_deferhist[t][i];
 
 		ASSERT3U(sm->sm_phys->smp_histogram[i], >=, pending);
 		uint64_t avail = sm->sm_phys->smp_histogram[i] - pending;
 
 		if (avail == 0)
 			continue;
 
 		WEIGHT_SET_COUNT(weight, avail);
 		WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
 		WEIGHT_SET_ACTIVE(weight, 0);
 		break;
 	}
 	return (weight);
 }
 
 /*
  * Compute a segment-based weight for the specified metaslab. The weight
  * is determined by the highest bucket in the histogram, and the
  * information for that bucket is encoded into the weight value. The
  * caller must hold ms_lock.
  */
 static uint64_t
 metaslab_segment_weight(metaslab_t *msp)
 {
 	uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
 	uint64_t weight = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	uint64_t allocated = metaslab_allocated_space(msp);
 
 	/*
 	 * The metaslab is completely free: describe it as one segment of
 	 * its full size, or as the equivalent number of segments in the
 	 * highest bucket when the size is at or beyond that bucket.
 	 */
 	if (allocated == 0) {
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 		int idx = highbit64(msp->ms_size) - 1;
 		uint64_t count = 1ULL;
 
 		if (idx >= max_idx) {
 			count = 1ULL << (idx - max_idx);
 			idx = max_idx;
 		}
 		WEIGHT_SET_COUNT(weight, count);
 		WEIGHT_SET_INDEX(weight, idx);
 		WEIGHT_SET_ACTIVE(weight, 0);
 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
 		return (weight);
 	}
 
 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/* A fully allocated metaslab simply gets a weight of 0. */
 	if (allocated == msp->ms_size)
 		return (0);
 
 	/*
 	 * A loaded metaslab is weighed by its range tree; otherwise we
 	 * rely on the space map information to generate the weight.
 	 */
 	weight = msp->ms_loaded ? metaslab_weight_from_range_tree(msp) :
 	    metaslab_weight_from_spacemap(msp);
 
 	/*
 	 * If the metaslab was active the last time its weight was
 	 * calculated then keep it active. We want to consume the entire
 	 * region that is associated with this weight.
 	 */
 	if (msp->ms_activation_weight != 0 && weight != 0)
 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
 	return (weight);
 }
 
 /*
  * Determine if we should attempt to allocate from this metaslab. For a
  * loaded metaslab the answer comes from the size of its largest free
  * segment. Otherwise the decision is based on the metaslab's weight:
  * for segment-based weights the maximum allocation follows from the
  * index encoded in the weight, while for space-based weights the entire
  * weight (excluding the weight-type bit) is used.
  */
 static boolean_t
 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 {
 	/*
 	 * The checks below usually, but not always, catch this case.
 	 * Metaslabs can be loaded by various means, including the trim and
 	 * initialize code, and once that happens they would be allocatable
 	 * even before they finish their first txg sync.
 	 */
 	if (unlikely(msp->ms_new))
 		return (B_FALSE);
 
 	/*
 	 * For a loaded metaslab ms_max_size is definitive and the fast
 	 * check applies. For an unloaded one ms_max_size is a lower bound
 	 * (once set); the fast check is still used as long as we're not in
 	 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
 	 * seconds since the metaslab was unloaded.
 	 */
 	if (msp->ms_loaded ||
 	    (msp->ms_max_size != 0 && !try_hard && gethrtime() <
 	    msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
 		return (msp->ms_max_size >= asize);
 
 	uint64_t weight = msp->ms_weight;
 
 	if (WEIGHT_IS_SPACEBASED(weight))
 		return (asize <= (weight & ~METASLAB_WEIGHT_TYPE));
 
 	/*
 	 * A segment weight indicates segments in the range
 	 * [2^i, 2^(i+1)), where i is the index in the weight. Since asize
 	 * might fall in the middle of that range, the allocation should be
 	 * attempted whenever asize < 2^(i+1).
 	 */
 	return (asize < 1ULL << (WEIGHT_GET_INDEX(weight) + 1));
 }
 
 /*
  * Refresh the metaslab's fragmentation metric and ms_max_size, then
  * return its weight: segment-based when that scheme is enabled and
  * supported for this metaslab, space-based otherwise. nodirty is passed
  * through to metaslab_set_fragmentation(). The caller must hold ms_lock.
  */
 static uint64_t
 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	metaslab_set_fragmentation(msp, nodirty);
 
 	/*
 	 * Update the maximum size. For a loaded metaslab this yields an
 	 * accurate maximum even if newly freed space has been added back
 	 * into the free tree. For an unloaded one we check whether the
 	 * unflushed frees hold a larger free segment, which gives a lower
 	 * bound on the largest allocatable segment size. Coalescing of
 	 * adjacent entries may reveal larger allocatable segments, but we
 	 * aren't aware of those until the space map is loaded into a range
 	 * tree.
 	 */
 	if (!msp->ms_loaded) {
 		msp->ms_max_size = MAX(msp->ms_max_size,
 		    metaslab_largest_unflushed_free(msp));
 	} else {
 		msp->ms_max_size = metaslab_largest_allocatable(msp);
 	}
 
 	/* Segment-based weighting requires space map histogram support. */
 	boolean_t segment_based = zfs_metaslab_segment_weight_enabled &&
 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
 	    sizeof (space_map_phys_t));
 
 	if (segment_based)
 		return (metaslab_segment_weight(msp));
 
 	return (metaslab_space_weight(msp));
 }
 
 /*
  * Recompute the metaslab's weight and re-sort it within its group. The
  * caller must hold ms_lock.
  */
 void
 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The activation mask (e.g. the indication of primary, etc.) is
 	 * preserved across the recalculation.
 	 */
 	uint64_t active_bits = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	uint64_t new_weight = metaslab_weight(msp, B_FALSE) | active_bits;
 
 	metaslab_group_sort(msp->ms_group, msp, new_weight);
 }
 
 /*
  * Mark msp active with the given activation weight. For
  * METASLAB_WEIGHT_CLAIM only the weight is updated. Otherwise the
  * metaslab is also installed as the primary or secondary metaslab of the
  * given allocator in mg, and EEXIST is returned if that slot is already
  * occupied. Returns 0 on success. The caller must hold ms_lock.
  */
 static int
 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     int allocator, uint64_t activation_weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * When activating for the claim code, we don't want to actually
 	 * set the metaslab up for a specific allocator.
 	 */
 	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
 		ASSERT0(msp->ms_activation_weight);
 		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort(mg, msp, msp->ms_weight |
 		    activation_weight);
 		return (0);
 	}
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	boolean_t primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
 	metaslab_t **slot = primary ? &mga->mga_primary : &mga->mga_secondary;
 
 	mutex_enter(&mg->mg_lock);
 
 	/* Another metaslab already holds this slot. */
 	if (*slot != NULL) {
 		mutex_exit(&mg->mg_lock);
 		return (EEXIST);
 	}
 
 	*slot = msp;
 	ASSERT3S(msp->ms_allocator, ==, -1);
 	msp->ms_allocator = allocator;
 	msp->ms_primary = primary;
 
 	/* Remember the pre-activation weight before adding the mask. */
 	ASSERT0(msp->ms_activation_weight);
 	msp->ms_activation_weight = msp->ms_weight;
 	metaslab_group_sort_impl(mg, msp,
 	    msp->ms_weight | activation_weight);
 
 	mutex_exit(&mg->mg_lock);
 
 	return (0);
 }
 
 /*
  * Load the metaslab if necessary and activate it for the given allocator
  * with the given activation weight. The caller must hold ms_lock.
  *
  * Returns 0 if the metaslab is active on return, the error from
  * metaslab_load() or metaslab_activate_allocator() if either fails, EBUSY
  * if the metaslab turned out to be activated for another allocator or
  * with a different activation weight, or ENOSPC if its weight is 0.
  */
 static int
 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The metaslab is already activated, so there is nothing to do.
 	 * Already activated doesn't mean that it is activated for our
 	 * allocator or with our requested activation weight, though. It
 	 * could have started out active for our allocator and changed
 	 * allocators while we were waiting to grab its ms_lock, or we may
 	 * have stolen it [see find_valid_metaslab()]. As a consequence this
 	 * thread may end up passivating a metaslab of another allocator or
 	 * of a different activation mask.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		ASSERT(msp->ms_loaded);
 		return (0);
 	}
 
 	int err = metaslab_load(msp);
 	if (err != 0) {
 		metaslab_group_sort(msp->ms_group, msp, 0);
 		return (err);
 	}
 
 	/*
 	 * Inside metaslab_load() the ms_lock may have been dropped, either
 	 * because we were loading the metaslab or because we were waiting
 	 * for another thread to load it for us. Recheck the weight to see
 	 * whether another thread activated the metaslab in the meantime.
 	 *
 	 * If it was activated for another allocator, or with a different
 	 * activation weight (e.g. we wanted a primary but it was activated
 	 * as secondary), return an error (EBUSY).
 	 *
 	 * If it was activated for the same allocator and the requested
 	 * activation mask, skip activating it.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		if (msp->ms_allocator != allocator)
 			return (EBUSY);
 
 		if ((msp->ms_weight & activation_weight) == 0)
 			return (SET_ERROR(EBUSY));
 
 		EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
 		    msp->ms_primary);
 		return (0);
 	}
 
 	/*
 	 * A metaslab with literally 0 space has weight 0; don't bother
 	 * activating it. This can happen if the metaslab had space during
 	 * find_valid_metaslab, but another thread loaded it and used all
 	 * that space while we were waiting to grab the lock.
 	 */
 	if (msp->ms_weight == 0) {
 		ASSERT0(range_tree_space(msp->ms_allocatable));
 		return (SET_ERROR(ENOSPC));
 	}
 
 	err = metaslab_activate_allocator(msp->ms_group, msp, allocator,
 	    activation_weight);
 	if (err != 0)
 		return (err);
 
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
 
 /*
  * Detach msp from the allocator slot it occupies in mg and re-sort it in
  * the group with the given (passive) weight. A metaslab whose weight has
  * METASLAB_WEIGHT_CLAIM set is only re-sorted. The caller must hold
  * msp->ms_lock and the metaslab must be loaded.
  */
 static void
 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	if ((msp->ms_weight & METASLAB_WEIGHT_CLAIM) != 0) {
 		metaslab_group_sort(mg, msp, weight);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT3P(msp->ms_group, ==, mg);
 	ASSERT3S(0, <=, msp->ms_allocator);
 	ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
 
 	/* Locate the slot (primary or secondary) that points at msp. */
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
 	metaslab_t **slot;
 	if (msp->ms_primary) {
 		ASSERT3P(mga->mga_primary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		slot = &mga->mga_primary;
 	} else {
 		ASSERT3P(mga->mga_secondary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		slot = &mga->mga_secondary;
 	}
 
 	/* Free the slot and mark the metaslab as owned by no allocator. */
 	*slot = NULL;
 	msp->ms_allocator = -1;
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Passivate an active metaslab: clear its recorded activation weight and
  * hand it to metaslab_passivate_allocator() to be re-sorted with the given
  * weight. The supplied weight must not contain any METASLAB_ACTIVE_MASK
  * bits, and the metaslab must currently have a non-zero activation weight.
  */
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
 	/* Only consumed by the assertion below, hence __maybe_unused. */
 	uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
 
 	/*
 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
 	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
 	    size >= SPA_MINBLOCKSIZE ||
 	    range_tree_space(msp->ms_allocatable) == 0);
 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
 	ASSERT(msp->ms_activation_weight != 0);
 	msp->ms_activation_weight = 0;
 	metaslab_passivate_allocator(msp->ms_group, msp, weight);
 	/* The re-sort must have left the metaslab without any active bits. */
 	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
 }
 
 /*
  * A segment-based metaslab is activated once and then stays active until
  * either an allocation attempt fails (as with space-based metaslabs) or the
  * free space in zfs_metaslab_switch_threshold buckets has been exhausted
  * since activation. This function detects the latter condition and
  * passivates the metaslab proactively, which lets us pick a metaslab with
  * a larger contiguous region, if one remains in this metaslab group.
  *
  * Nothing is done for space-based metaslabs, nor in sync pass > 1, where we
  * keep using the current metaslab so that we don't dirty more blocks and
  * cause additional sync passes.
  */
 static void
 metaslab_segment_may_passivate(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight))
 		return;
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * We are in the middle of a sync pass, so the in-core range tree
 	 * histogram is the most accurate information available; derive the
 	 * new weight from it.
 	 */
 	uint64_t current_weight = metaslab_weight_from_range_tree(msp);
 	int idx_at_activation = WEIGHT_GET_INDEX(msp->ms_activation_weight);
 	int idx_now = WEIGHT_GET_INDEX(current_weight);
 
 	if (idx_now <= idx_at_activation - zfs_metaslab_switch_threshold)
 		metaslab_passivate(msp, current_weight);
 }
 
 /*
  * Taskq callback (see metaslab_group_preload()): load the metaslab passed
  * in arg and record the currently syncing txg as its selected txg. The
  * result of metaslab_load() is deliberately ignored.
  */
 static void
 metaslab_preload(void *arg)
 {
 	metaslab_t *msp = arg;
 	spa_t *spa = msp->ms_group->mg_class->mc_spa;
 	fstrans_cookie_t fstrans = spl_fstrans_mark();
 
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
 	(void) metaslab_load(msp);
 	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_lock);
 
 	spl_fstrans_unmark(fstrans);
 }
 
 /*
  * Dispatch metaslab_preload() tasks for the metaslabs of mg, walking the
  * group's metaslab tree from its first entry. Does nothing if the pool is
  * shutting down or preloading is disabled.
  */
 static void
 metaslab_group_preload(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 	int m = 0;
 
 	if (spa_shutting_down(spa))
 		return;
 	if (!metaslab_preload_enabled)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 
 	/*
 	 * Load the next potential metaslabs
 	 */
 	metaslab_t *msp = avl_first(tree);
 	while (msp != NULL) {
 		ASSERT3P(msp->ms_group, ==, mg);
 
 		/*
 		 * At most metaslab_preload_limit metaslabs are preloaded,
 		 * with one exception: a metaslab that is being forced to
 		 * condense is always preloaded, which ensures that the
 		 * forced condense happens in the next txg.
 		 */
 		if (++m <= metaslab_preload_limit ||
 		    msp->ms_condense_wanted) {
 			VERIFY(taskq_dispatch(spa->spa_metaslab_taskq,
 			    metaslab_preload, msp, TQ_SLEEP |
 			    (m <= mg->mg_allocators ? TQ_FRONT : 0))
 			    != TASKQID_INVALID);
 		}
 
 		msp = AVL_NEXT(tree, msp);
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Decide whether the on-disk footprint of a metaslab's space map has grown
  * beyond our tolerance for inefficiency. Ideally the decision would follow
  * these criteria:
  *
  * 1. Do not condense if writing out the free space range tree would
  *    dramatically increase the size of the space map object.
  *
  * 2. Condense if the on-disk space map representation is at least
  *    zfs_condense_pct/100 times the size of the optimal representation
  *    (e.g. with zfs_condense_pct = 110, once the on-disk size reaches
  *    1.1 times the optimal size).
  *
  * 3. Do not condense if the on-disk size of the space map would not
  *    actually decrease.
  *
  * Unfortunately the on-disk size of the space map cannot be computed in
  * this context, because the effects of compression, etc. cannot be
  * predicted accurately. Instead we apply the heuristic described in the
  * block comment for zfs_metaslab_condense_block_threshold: we only condense
  * if the space used is greater than a threshold number of blocks.
  *
  * Must be called in sync pass 1 with ms_lock held, on a loaded metaslab
  * that has a space map.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	uint64_t min_blksz = 1ULL << vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
 
 	/*
 	 * Empty metaslabs are always condensed, and so are metaslabs for
 	 * which a condense has been explicitly requested.
 	 */
 	if (range_tree_numsegs(msp->ms_allocatable) == 0)
 		return (B_TRUE);
 	if (msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	uint64_t blk_size = MAX(sm->sm_blksz, min_blksz);
 	uint64_t cur_size = space_map_length(sm);
 	uint64_t best_size = space_map_estimate_optimal_size(sm,
 	    msp->ms_allocatable, SM_NO_VDEVID);
 
 	/* Not yet inefficient enough relative to the optimal size. */
 	if (cur_size < (best_size * zfs_condense_pct / 100))
 		return (B_FALSE);
 
 	/* Inefficient; condense only if it also spans enough blocks. */
 	return (cur_size > zfs_metaslab_condense_block_threshold * blk_size);
 }
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed
  * by the entries of the free range tree (ms_allocatable). The condensed
  * spacemap contains all the entries of previous TXGs (including those in
  * the pool-wide log spacemaps; thus this is effectively a superset of
  * metaslab_flush()), but this TXG's entries still need to be written.
  *
  * The caller must hold ms_lock on a loaded metaslab that has a space map.
  * The lock is dropped while the space map is rewritten and is held again
  * on return; ms_condensing is set for that window.
  */
 static void
 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 {
 	range_tree_t *condense_tree;
 	space_map_t *sm = msp->ms_sm;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_sm != NULL);
 
 	/*
 	 * In order to condense the space map, we need to change it so it
 	 * only describes which segments are currently allocated and free.
 	 *
 	 * All the current free space resides in the ms_allocatable, all
 	 * the ms_defer trees, and all the ms_allocating trees. We ignore
 	 * ms_freed because it is empty because we're in sync pass 1. We
 	 * ignore ms_freeing because these changes are not yet reflected
 	 * in the spacemap (they will be written later this txg).
 	 *
 	 * So to truncate the space map to represent all the entries of
 	 * previous TXGs we do the following:
 	 *
 	 * 1] We create a range tree (condense tree) that is 100% empty.
 	 * 2] We add to it all segments found in the ms_defer trees
 	 *    as those segments are marked as free in the original space
 	 *    map. We do the same with the ms_allocating trees for the same
 	 *    reason. Adding these segments should be a relatively
 	 *    inexpensive operation since we expect these trees to have a
 	 *    small number of nodes.
 	 * 3] We vacate any unflushed allocs, since they are not frees we
 	 *    need to add to the condense tree. Then we vacate any
 	 *    unflushed frees as they should already be part of ms_allocatable.
 	 * 4] At this point, we would ideally like to add all segments
 	 *    in the ms_allocatable tree from the condense tree. This way
 	 *    we would write all the entries of the condense tree as the
 	 *    condensed space map, which would only contain freed
 	 *    segments with everything else assumed to be allocated.
 	 *
 	 *    Doing so can be prohibitively expensive as ms_allocatable can
 	 *    be large, and therefore computationally expensive to add to
 	 *    the condense_tree. Instead we first sync out an entry marking
 	 *    everything as allocated, then the condense_tree and then the
 	 *    ms_allocatable, in the condensed space map. While this is not
 	 *    optimal, it is typically close to optimal and more importantly
 	 *    much cheaper to compute.
 	 *
 	 * 5] Finally, as both of the unflushed trees were written to our
 	 *    new and condensed metaslab space map, we basically flushed
 	 *    all the unflushed changes to disk, thus we call
 	 *    metaslab_flush_update().
 	 */
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
 
 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
 	    "spa %s, smp size %llu, segments %llu, forcing condense=%s",
 	    (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)range_tree_numsegs(msp->ms_allocatable),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	/* Any pending condense request is satisfied by this call. */
 	msp->ms_condense_wanted = B_FALSE;
 
 	range_seg_type_t type;
 	uint64_t shift, start;
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);
 
 	/* Steps 1] and 2] of the comment above. */
 	condense_tree = range_tree_create(NULL, type, NULL, start, shift);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_add, condense_tree);
 	}
 
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
 		    range_tree_add, condense_tree);
 	}
 
 	/*
 	 * Step 3]: the unflushed trees are about to be emptied, so remove
 	 * their memory usage from the pool-wide accounting first.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing other
 	 * consumers to change its contents. Set the metaslab's ms_condensing
 	 * flag to ensure that allocations on this metaslab do not occur
 	 * while we're in the middle of committing it to disk. This is only
 	 * critical for ms_allocatable as all other range trees use per TXG
 	 * views of their content.
 	 */
 	msp->ms_condensing = B_TRUE;
 
 	mutex_exit(&msp->ms_lock);
 	/* Remember the object number so a reallocation can be detected. */
 	uint64_t object = space_map_object(msp->ms_sm);
 	space_map_truncate(sm,
 	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
 
 	/*
 	 * space_map_truncate() may have reallocated the spacemap object.
 	 * If so, update the vdev_ms_array.
 	 */
 	if (space_map_object(msp->ms_sm) != object) {
 		object = space_map_object(msp->ms_sm);
 		dmu_write(spa->spa_meta_objset,
 		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &object, tx);
 	}
 
 	/*
 	 * Note:
 	 * When the log space map feature is enabled, each space map will
 	 * always have ALLOCS followed by FREES for each sync pass. This is
 	 * typically true even when the log space map feature is disabled,
 	 * except from the case where a metaslab goes through metaslab_sync()
 	 * and gets condensed. In that case the metaslab's space map will have
 	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
 	/* Step 4]: one ALLOC entry for the whole metaslab, then the frees. */
 	range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
 	    shift);
 	range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
 
 	/* The temporary trees have been written out; empty and free them. */
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 	range_tree_vacate(tmp_tree, NULL, NULL);
 	range_tree_destroy(tmp_tree);
 	mutex_enter(&msp->ms_lock);
 
 	/* Step 5]: condensing is done; account for the implied flush. */
 	msp->ms_condensing = B_FALSE;
 	metaslab_flush_update(msp, tx);
 }
 
 /*
  * Begin tracking a metaslab in the pool's set of metaslabs ordered by
  * flushed txg. Under spa_flushed_ms_lock, the metaslab's unflushed txg is
  * set to the currently syncing txg, it is marked dirty, and it is inserted
  * into spa_metaslabs_by_flushed. Afterwards the metaslab count of the
  * current log space map and the log summary are updated.
  *
  * Requires a syncing log space map, a metaslab space map, and empty
  * ms_unflushed_allocs / ms_unflushed_frees trees.
  */
 static void
 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 	metaslab_set_unflushed_dirty(msp, B_TRUE);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* Pool-wide log space map bookkeeping for the new entry. */
 	spa_log_sm_increment_current_mscount(spa);
 	spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
 }
 
 /*
  * Move an already-tracked metaslab to the currently syncing txg in the
  * pool's spa_metaslabs_by_flushed tree and set its unflushed-dirty state to
  * the given value. The metaslab counts of the log space maps and the log
  * summary are moved from the previous unflushed txg to the current one,
  * and spa_cleanup_old_sm_logs() is called afterwards.
  *
  * Requires a syncing log space map, a metaslab that is already present in
  * spa_metaslabs_by_flushed with a non-zero unflushed txg, and empty
  * ms_unflushed_allocs / ms_unflushed_frees trees. The tx must not be past
  * the pool's final dirty txg.
  */
 void
 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
 
 	/* update metaslab's position in our flushing tree */
 	/* Capture the old txg/dirty state; it is needed after the update. */
 	uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
 	boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp);
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	/* Remove before changing the txg and re-add once it is updated. */
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 	metaslab_set_unflushed_dirty(msp, dirty);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* update metaslab counts of spa_log_sm_t nodes */
 	spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
 	spa_log_sm_increment_current_mscount(spa);
 
 	/* update log space map summary */
 	spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg,
 	    ms_prev_flushed_dirty);
 	spa_log_summary_add_flushed_metaslab(spa, dirty);
 
 	/* cleanup obsolete logs if any */
 	spa_cleanup_old_sm_logs(spa, tx);
 }
 
 /*
  * Invoked once a metaslab has been flushed, i.e. its own spacemap now
  * reflects all the contents of the pool-wide spacemap log. Brings the
  * metaslab's metadata and the related pool-wide log space map data (e.g.
  * summary, obsolete logs, etc..) up to date.
  *
  * Must be called in sync pass 1 with ms_lock held.
  */
 static void
 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 
 	/*
 	 * A flushed metaslab is not guaranteed to pass through
 	 * metaslab_sync_done(), so refresh ms_synced_length right here
 	 * in case it doesn't.
 	 */
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	/*
 	 * metaslab_condense() may bring us here while the log spacemap
 	 * feature is not active; there is nothing more to do in that case,
 	 * nor when the metaslab has no unflushed txg recorded.
 	 */
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) &&
 	    metaslab_unflushed_txg(msp) != 0)
 		metaslab_unflushed_bump(msp, tx, B_FALSE);
 }
 
 /*
  * Flush a metaslab's unflushed changes (ms_unflushed_allocs and
  * ms_unflushed_frees) into its own space map, or condense the metaslab
  * instead when it is loaded and metaslab_should_condense() agrees.
  *
  * Must be called in sync pass 1 with ms_lock held and the log spacemap
  * feature active. The ms_lock is dropped temporarily while writing.
  *
  * Returns B_FALSE without doing anything if the metaslab is currently
  * being loaded, B_TRUE otherwise.
  */
 boolean_t
 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
 
 	/*
 	 * There is nothing wrong with flushing the same metaslab twice, as
 	 * this codepath should work on that case. However, the current
 	 * flushing scheme makes sure to avoid this situation as we would be
 	 * making all these calls without having anything meaningful to write
 	 * to disk. We assert this behavior here.
 	 */
 	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
 
 	/*
 	 * We can not flush while loading, because then we would
 	 * not load the ms_unflushed_{allocs,frees}.
 	 */
 	if (msp->ms_loading)
 		return (B_FALSE);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	/*
 	 * Metaslab condensing is effectively flushing. Therefore if the
 	 * metaslab can be condensed we can just condense it instead of
 	 * flushing it.
 	 *
 	 * Note that metaslab_condense() does call metaslab_flush_update()
 	 * so we can just return immediately after condensing. We also
 	 * don't need to care about setting ms_flushing or broadcasting
 	 * ms_flush_cv, even if we temporarily drop the ms_lock in
 	 * metaslab_condense(), as the metaslab is already loaded.
 	 */
 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
 		metaslab_group_t *mg = msp->ms_group;
 
 		/*
 		 * For all histogram operations below refer to the
 		 * comments of metaslab_sync() where we follow a
 		 * similar procedure.
 		 */
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 		metaslab_group_histogram_remove(mg, msp);
 
 		metaslab_condense(msp, tx);
 
 		/* Rebuild the space map histogram from the in-core trees. */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 		ASSERT(range_tree_is_empty(msp->ms_freed));
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 		metaslab_aux_histograms_update(msp);
 
 		metaslab_group_histogram_add(mg, msp);
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 
 		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 
 		/*
 		 * Since we recreated the histogram (and potentially
 		 * the ms_sm too while condensing) ensure that the
 		 * weight is updated too because we are not guaranteed
 		 * that this metaslab is dirty and will go through
 		 * metaslab_sync_done().
 		 */
 		metaslab_recalculate_weight_and_sort(msp);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Regular flush: ms_flushing stays set for as long as the ms_lock
 	 * is dropped below and is cleared, with a broadcast on
 	 * ms_flush_cv, at the end of this function.
 	 */
 	msp->ms_flushing = B_TRUE;
 	uint64_t sm_len_before = space_map_length(msp->ms_sm);
 
 	mutex_exit(&msp->ms_lock);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
 	    SM_NO_VDEVID, tx);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
 	    SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 
 	uint64_t sm_len_after = space_map_length(msp->ms_sm);
 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
 		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
 		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
 		    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
 		    spa_name(spa),
 		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 		    (u_longlong_t)msp->ms_id,
 		    (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs),
 		    (u_longlong_t)range_tree_space(msp->ms_unflushed_frees),
 		    (u_longlong_t)(sm_len_after - sm_len_before));
 	}
 
 	/*
 	 * The unflushed trees have been written out and are emptied below;
 	 * remove their memory usage from the pool-wide accounting first.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	metaslab_flush_update(msp, tx);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	/* Flushing is complete; wake up anyone waiting on ms_flush_cv. */
 	msp->ms_flushing = B_FALSE;
 	cv_broadcast(&msp->ms_flush_cv);
 	return (B_TRUE);
 }
 
 /*
  * Write a metaslab to disk in the context of the specified transaction group.
  *
  * This txg's allocations (ms_allocating[txg & TXG_MASK]) and frees
  * (ms_freeing) are written either to the pool-wide syncing log space map,
  * when one exists, or directly to the metaslab's own space map. Frees
  * recorded in ms_checkpointing go to the vdev's checkpoint space map. The
  * space map histogram is refreshed, and the per-txg trees are emptied or
  * handed over to ms_freed.
  *
  * Returns early, without creating a transaction, for new metaslabs and for
  * metaslabs with nothing to write (unless a forced condense is pending).
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 	dmu_tx_t *tx;
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
 	if (msp->ms_new) {
 		ASSERT0(range_tree_space(alloctree));
 		ASSERT0(range_tree_space(msp->ms_freeing));
 		ASSERT0(range_tree_space(msp->ms_freed));
 		ASSERT0(range_tree_space(msp->ms_checkpointing));
 		ASSERT0(range_tree_space(msp->ms_trim));
 		return;
 	}
 
 	/*
 	 * Normally, we don't want to process a metaslab if there are no
 	 * allocations or frees to perform. However, if the metaslab is being
 	 * forced to condense, it's loaded and we're not beyond the final
 	 * dirty txg, we need to let it through. Not condensing beyond the
 	 * final dirty txg prevents an issue where metaslabs that need to be
 	 * condensed but were loaded for other reasons could cause a panic
 	 * here. By only checking the txg in that branch of the conditional,
 	 * we preserve the utility of the VERIFY statements in all other
 	 * cases.
 	 */
 	if (range_tree_is_empty(alloctree) &&
 	    range_tree_is_empty(msp->ms_freeing) &&
 	    range_tree_is_empty(msp->ms_checkpointing) &&
 	    !(msp->ms_loaded && msp->ms_condense_wanted &&
 	    txg <= spa_final_dirty_txg(spa)))
 		return;
 
 
 	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
 
 	/*
 	 * The only state that can actually be changing concurrently
 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
 	 * other thread can be modifying this txg's alloc, freeing,
 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
 	 * could call into the DMU, because the DMU can call down to
 	 * us (e.g. via zio_free()) at any time.
 	 *
 	 * The spa_vdev_remove_thread() can be reading metaslab state
 	 * concurrently, and it is locked out by the ms_sync_lock.
 	 * Note that the ms_lock is insufficient for this, because it
 	 * is dropped by space_map_write().
 	 */
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	/*
 	 * Generate a log space map if one doesn't exist already.
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	/*
 	 * First write to this metaslab: allocate its space map object,
 	 * record the object number in the vdev's ms_array and open it.
 	 */
 	if (msp->ms_sm == NULL) {
 		uint64_t new_object = space_map_alloc(mos,
 		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 		    zfs_metaslab_sm_blksz_with_log :
 		    zfs_metaslab_sm_blksz_no_log, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
 
 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
 		ASSERT(msp->ms_sm != NULL);
 
 		ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
 		ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
 		ASSERT0(metaslab_allocated_space(msp));
 	}
 
 	/*
 	 * There are checkpointed frees to write but the vdev has no
 	 * checkpoint space map yet: create and open one.
 	 */
 	if (!range_tree_is_empty(msp->ms_checkpointing) &&
 	    vd->vdev_checkpoint_sm == NULL) {
 		ASSERT(spa_has_checkpoint(spa));
 
 		uint64_t new_object = space_map_alloc(mos,
 		    zfs_vdev_standard_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * We save the space map object as an entry in vdev_top_zap
 		 * so it can be retrieved when the pool is reopened after an
 		 * export or through zdb.
 		 */
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (new_object), 1, &new_object, tx));
 	}
 
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Note: metaslab_condense() clears the space map's histogram.
 	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 	metaslab_group_histogram_remove(mg, msp);
 
 	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
 	    metaslab_should_condense(msp))
 		metaslab_condense(msp, tx);
 
 	/*
 	 * We'll be going to disk to sync our space accounting, thus we
 	 * drop the ms_lock during that time so allocations coming from
 	 * open-context (ZIL) for future TXGs do not block.
 	 */
 	mutex_exit(&msp->ms_lock);
 	space_map_t *log_sm = spa_syncing_log_sm(spa);
 	if (log_sm != NULL) {
 		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 		/*
 		 * Make sure the metaslab is tracked as unflushed and dirty
 		 * for the syncing txg before its changes are logged.
 		 */
 		if (metaslab_unflushed_txg(msp) == 0)
 			metaslab_unflushed_add(msp, tx);
 		else if (!metaslab_unflushed_dirty(msp))
 			metaslab_unflushed_bump(msp, tx, B_TRUE);
 
 		space_map_write(log_sm, alloctree, SM_ALLOC,
 		    vd->vdev_id, tx);
 		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
 		    vd->vdev_id, tx);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Merge this txg's changes into the unflushed trees; the
 		 * pool-wide memory accounting is taken out before the
 		 * merge and added back with the new footprint after it.
 		 */
 		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 		    metaslab_unflushed_changes_memused(msp));
 		spa->spa_unflushed_stats.sus_memused -=
 		    metaslab_unflushed_changes_memused(msp);
 		range_tree_remove_xor_add(alloctree,
 		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
 		range_tree_remove_xor_add(msp->ms_freeing,
 		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(msp);
 	} else {
 		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 		/* No log space map: write to the metaslab's own space map. */
 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
 		    SM_NO_VDEVID, tx);
 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
 		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
 	/* Apply this txg's allocations and frees to the running total. */
 	msp->ms_allocated_space += range_tree_space(alloctree);
 	ASSERT3U(msp->ms_allocated_space, >=,
 	    range_tree_space(msp->ms_freeing));
 	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
 
 	if (!range_tree_is_empty(msp->ms_checkpointing)) {
 		ASSERT(spa_has_checkpoint(spa));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * Since we are doing writes to disk and the ms_checkpointing
 		 * tree won't be changing during that time, we drop the
 		 * ms_lock while writing to the checkpoint space map, for the
 		 * same reason mentioned above.
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 
 		spa->spa_checkpoint_info.sci_dspace +=
 		    range_tree_space(msp->ms_checkpointing);
 		vd->vdev_stat.vs_checkpoint_space +=
 		    range_tree_space(msp->ms_checkpointing);
 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
 		    -space_map_allocated(vd->vdev_checkpoint_sm));
 
 		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
 	}
 
 	if (msp->ms_loaded) {
 		/*
 		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 * to bring the space map's histogram up-to-date so we clear
 		 * it first before updating it.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
 		/*
 		 * Since we've cleared the histogram we need to add back
 		 * any free space that has already been processed, plus
 		 * any deferred space. This allows the on-disk histogram
 		 * to accurately reflect all free space even if some space
 		 * is not yet available for allocation (i.e. deferred).
 		 */
 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
 		/*
 		 * Add back any deferred free space that has not been
 		 * added back into the in-core free tree yet. This will
 		 * ensure that we don't end up with a space map histogram
 		 * that is completely empty unless the metaslab is fully
 		 * allocated.
 		 */
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 	}
 
 	/*
 	 * Always add the free space from this sync pass to the space
 	 * map histogram. We want to make sure that the on-disk histogram
 	 * accounts for all free space. If the space map is not loaded,
 	 * then we will lose some accuracy but will correct it the next
 	 * time we load the space map.
 	 */
 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 	metaslab_aux_histograms_update(msp);
 
 	/* Re-add the histogram that was removed before the sync. */
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
 	 * For sync pass 1, we avoid traversing this txg's free range tree
 	 * and instead will just swap the pointers for freeing and freed.
 	 * We can safely do this since the freed_tree is guaranteed to be
 	 * empty on the initial pass.
 	 *
 	 * Keep in mind that even if we are currently using a log spacemap
 	 * we want current frees to end up in the ms_allocatable (but not
 	 * get appended to the ms_sm) so their ranges can be reused as usual.
 	 */
 	if (spa_sync_pass(spa) == 1) {
 		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
 		ASSERT0(msp->ms_allocated_this_txg);
 	} else {
 		range_tree_vacate(msp->ms_freeing,
 		    range_tree_add, msp->ms_freed);
 	}
 	/* Record the allocated amount before the tree is emptied. */
 	msp->ms_allocated_this_txg += range_tree_space(alloctree);
 	range_tree_vacate(alloctree, NULL, NULL);
 
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
 	    & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	mutex_exit(&msp->ms_lock);
 
 	/*
 	 * Verify that the space map object ID has been recorded in the
 	 * vdev_ms_array.
 	 */
 	uint64_t object;
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
 	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
 	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
 
 	mutex_exit(&msp->ms_sync_lock);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Passivate (when an allocator owns it) and unload a metaslab.  Nothing
  * is done for metaslabs that are not loaded or that are disabled.  The
  * allocating trees for the slots (txg + t) & TXG_MASK, with t ranging
  * over [1, TXG_CONCURRENT_STATES), must already be empty.  The unload
  * itself is skipped while metaslab_debug_unload is set.
  */
 static void
 metaslab_evict(metaslab_t *msp, uint64_t txg)
 {
 	if (msp->ms_loaded && msp->ms_disabled == 0) {
 		int t = 1;
 
 		while (t < TXG_CONCURRENT_STATES) {
 			VERIFY0(range_tree_space(
 			    msp->ms_allocating[(txg + t) & TXG_MASK]));
 			t++;
 		}
 
 		/* Drop the activation bits if an allocator holds us. */
 		if (msp->ms_allocator != -1) {
 			metaslab_passivate(msp,
 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 		}
 
 		if (!metaslab_debug_unload)
 			metaslab_unload(msp);
 	}
 }
 
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  *
  * With ms_lock held this function:
  *  - adds a new metaslab's capacity (ms_size) to the vdev,
  *  - computes this txg's allocation and deferred-space deltas and
  *    reports them through metaslab_space_update(),
  *  - feeds ms_trim when autotrim is on (and empties it otherwise),
  *  - drains the defer tree selected by txg % TXG_DEFER_SIZE into
  *    ms_allocatable (only when the metaslab is loaded), and
  *  - either swaps ms_freed into that defer slot or, when deferring is
  *    not allowed, drains ms_freed the same way.
  */
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 	boolean_t defer_allowed = B_TRUE;
 
 	ASSERT(!vd->vdev_ishole);
 
 	mutex_enter(&msp->ms_lock);
 
 	if (msp->ms_new) {
 		/* this is a new metaslab, add its capacity to the vdev */
 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 
 		/* there should be no allocations nor frees at this point */
 		VERIFY0(msp->ms_allocated_this_txg);
 		VERIFY0(range_tree_space(msp->ms_freed));
 	}
 
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	/* Pick the defer slot that corresponds to this txg. */
 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
 	/*
 	 * Deferring frees is turned off when the normal class's free space
 	 * is at or below the slop space, or when vdev_removing or
 	 * vdev_rz_expanding is set on this vdev.
 	 */
 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 	    metaslab_class_get_alloc(spa_normal_class(spa));
 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
 	    vd->vdev_rz_expanding) {
 		defer_allowed = B_FALSE;
 	}
 
 	defer_delta = 0;
 	alloc_delta = msp->ms_allocated_this_txg -
 	    range_tree_space(msp->ms_freed);
 
 	/*
 	 * The defer slot is about to be emptied, so its space always leaves
 	 * the deferred total; this txg's frees only enter it when deferring
 	 * is allowed.
 	 */
 	if (defer_allowed) {
 		defer_delta = range_tree_space(msp->ms_freed) -
 		    range_tree_space(*defer_tree);
 	} else {
 		defer_delta -= range_tree_space(*defer_tree);
 	}
 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 	    defer_delta, 0);
 
 	if (spa_syncing_log_sm(spa) == NULL) {
 		/*
 		 * If there's a metaslab_load() in progress and we don't have
 		 * a log space map, it means that we probably wrote to the
 		 * metaslab's space map. If this is the case, we need to
 		 * make sure that we wait for the load to complete so that we
 		 * have a consistent view at the in-core side of the metaslab.
 		 */
 		metaslab_load_wait(msp);
 	} else {
 		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	}
 
 	/*
 	 * When auto-trimming is enabled, free ranges which are added to
 	 * ms_allocatable are also added to ms_trim.  The ms_trim tree is
 	 * periodically consumed by the vdev_autotrim_thread() which issues
 	 * trims for all ranges and then vacates the tree.  The ms_trim tree
 	 * can be discarded at any time with the sole consequence of recent
 	 * frees not being trimmed.
 	 */
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
 		range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
 		if (!defer_allowed) {
 			range_tree_walk(msp->ms_freed, range_tree_add,
 			    msp->ms_trim);
 		}
 	} else {
 		range_tree_vacate(msp->ms_trim, NULL, NULL);
 	}
 
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and
 	 * the defer_tree -- this is safe to do because we've
 	 * just emptied out the defer_tree.
 	 */
 	range_tree_vacate(*defer_tree,
 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
 	if (defer_allowed) {
 		range_tree_swap(&msp->ms_freed, defer_tree);
 	} else {
 		range_tree_vacate(msp->ms_freed,
 		    msp->ms_loaded ? range_tree_add : NULL,
 		    msp->ms_allocatable);
 	}
 
 	/* Remember the space map length as of this sync. */
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
 		 * are back in circulation.
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 	metaslab_aux_histograms_update_done(msp, defer_allowed);
 
 	/*
 	 * The metaslab has now been through a sync_done; clear ms_new and
 	 * count it in the group's mg_ms_ready under mg_lock.
 	 */
 	if (msp->ms_new) {
 		msp->ms_new = B_FALSE;
 		mutex_enter(&mg->mg_lock);
 		mg->mg_ms_ready++;
 		mutex_exit(&mg->mg_lock);
 	}
 
 	/*
 	 * Re-sort metaslab within its group now that we've adjusted
 	 * its allocatable space.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_freed));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 	/* This txg's allocations are synced; retire them from the totals. */
 	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
 	msp->ms_allocated_this_txg = 0;
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Refresh a metaslab group's allocation state and fragmentation value
  * and, when the group is active, start preloading its metaslabs.  The
  * whole sequence runs with SCL_ALLOC held as reader.
  */
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	spa_t *const pool = mg->mg_class->mc_spa;
 
 	spa_config_enter(pool, SCL_ALLOC, FTAG, RW_READER);
 
 	metaslab_group_alloc_update(mg);
 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
 	 * Preloading is restricted to active metaslab groups.  Metaslabs
 	 * are dirtied while a device is being removed, so we can get here
 	 * for a group that is no longer active; without this check such a
 	 * group would still be considered for preloading.
 	 */
 	if (mg->mg_activation_count > 0)
 		metaslab_group_preload(mg);
 
 	spa_config_exit(pool, SCL_ALLOC, FTAG);
 }
 
 /*
  * Ditto-block helper: report whether allocating from `msp' would land on
  * a different metaslab than the one already used by `dva'.  A DVA with a
  * zero asize, or one that lives on another vdev, never conflicts; only a
  * DVA on this vdev whose offset maps to msp's own metaslab id does.
  */
 static boolean_t
 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 {
 	if (DVA_GET_ASIZE(dva) != 0) {
 		vdev_t *vd = msp->ms_group->mg_vd;
 
 		if (vd->vdev_id == DVA_GET_VDEV(dva)) {
 			/* Same vdev: compare metaslab indices. */
 			uint64_t dva_ms_id =
 			    DVA_GET_OFFSET(dva) >> vd->vdev_ms_shift;
 
 			return (msp->ms_id != dva_ms_id);
 		}
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * ==========================================================================
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
 
 /*
  * Record one allocation attempt at the tail of the trace list `zal'.
  * Does nothing unless metaslab_trace_enabled is set.  `msp' may be NULL,
  * in which case the recorded weight is 0.
  */
 static void
 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
     int allocator)
 {
 	if (!metaslab_trace_enabled)
 		return;
 
 	/*
 	 * A full list makes room by dropping its second entry.  The head
 	 * is kept on purpose: it is the earliest clue about which
 	 * allocation steps were already taken.  Debug builds panic
 	 * instead of trimming.
 	 */
 	if (zal->zal_size == metaslab_trace_max_entries) {
 #ifdef ZFS_DEBUG
 		panic("too many entries in allocation list");
 #endif
 		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 		zal->zal_size--;
 
 		metaslab_alloc_trace_t *second =
 		    list_next(&zal->zal_list, list_head(&zal->zal_list));
 		list_remove(&zal->zal_list, second);
 		kmem_cache_free(metaslab_alloc_trace_cache, second);
 	}
 
 	metaslab_alloc_trace_t *mat =
 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
 
 	list_link_init(&mat->mat_list_node);
 	mat->mat_mg = mg;
 	mat->mat_msp = msp;
 	mat->mat_size = psize;
 	mat->mat_dva_id = dva_id;
 	mat->mat_offset = offset;
 	mat->mat_allocator = allocator;
 	mat->mat_weight = (msp != NULL) ? msp->ms_weight : 0;
 
 	/*
 	 * No locking: the list belongs to the zio, and a given zio has
 	 * its allocations performed by a single thread.
 	 */
 	list_insert_tail(&zal->zal_list, mat);
 	zal->zal_size++;
 
 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 }
 
 /*
  * Prepare an empty allocation trace list: zero entries, with elements
  * linked through metaslab_alloc_trace_t's mat_list_node.
  */
 void
 metaslab_trace_init(zio_alloc_list_t *zal)
 {
 	zal->zal_size = 0;
 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
 }
 
 /*
  * Tear down an allocation trace list: return every entry to
  * metaslab_alloc_trace_cache, destroy the list and reset its size.
  */
 void
 metaslab_trace_fini(zio_alloc_list_t *zal)
 {
 	for (;;) {
 		metaslab_alloc_trace_t *mat = list_remove_head(&zal->zal_list);
 
 		if (mat == NULL)
 			break;
 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
 	}
 
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
 
 /*
  * ==========================================================================
  * Metaslab block operations
  * ==========================================================================
  */
 
 /*
  * Take a hold, tagged with `tag', on the allocation queue depth of the
  * given allocator in the metaslab group of top-level vdev `vdev'.  Only
  * async allocations that have not opted out of throttling are counted,
  * and only when the group's class has the allocation throttle enabled.
  */
 static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, const void *tag,
     int flags, int allocator)
 {
 	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
 	    (flags & METASLAB_DONT_THROTTLE) == 0;
 
 	if (!throttled)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 
 	if (mg->mg_class->mc_alloc_throttle_enabled) {
 		(void) zfs_refcount_add(
 		    &mg->mg_allocator[allocator].mga_alloc_queue_depth, tag);
 	}
 }
 
 /*
  * Raise this allocator's current maximum queue depth by one, as long as
  * it is still below the group's mg_max_alloc_queue_depth.  The bump is a
  * compare-and-swap that is retried with a freshly read value whenever
  * it fails; a successful bump also increments the class allocator's
  * mca_alloc_max_slots.
  */
 static void
 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
 {
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	metaslab_class_allocator_t *mca =
 	    &mg->mg_class->mc_allocator[allocator];
 	const uint64_t limit = mg->mg_max_alloc_queue_depth;
 
 	for (uint64_t seen = mga->mga_cur_max_alloc_queue_depth; seen < limit;
 	    seen = mga->mga_cur_max_alloc_queue_depth) {
 		if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
 		    seen, seen + 1) != seen)
 			continue;	/* value changed under us; retry */
 
 		atomic_inc_64(&mca->mca_alloc_max_slots);
 		return;
 	}
 }
 
 /*
  * Release the queue-depth hold tagged with `tag' for the given
  * allocator, using the same filter as the increment path: async,
  * throttle-eligible allocations in a class with the throttle enabled.
  * When `io_complete' is set, the allocator's current maximum queue depth
  * is additionally given a chance to grow.
  */
 void
 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, const void *tag,
     int flags, int allocator, boolean_t io_complete)
 {
 	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
 	    (flags & METASLAB_DONT_THROTTLE) == 0;
 
 	if (!throttled)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 
 	if (mg->mg_class->mc_alloc_throttle_enabled) {
 		(void) zfs_refcount_remove(
 		    &mg->mg_allocator[allocator].mga_alloc_queue_depth, tag);
 		if (io_complete)
 			metaslab_group_increment_qdepth(mg, allocator);
 	}
 }
 
 /*
  * Debug-only check: for every DVA of `bp', verify that `tag' holds no
  * reference on the queue depth of `allocator' in that DVA's metaslab
  * group.  Compiles to an empty function without ZFS_DEBUG.
  */
 void
 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, const void *tag,
     int allocator)
 {
 #ifdef ZFS_DEBUG
 	const dva_t *dva = bp->blk_dva;
 	const dva_t *const end = dva + BP_GET_NDVAS(bp);
 
 	for (; dva < end; dva++) {
 		metaslab_group_t *mg =
 		    vdev_lookup_top(spa, DVA_GET_VDEV(dva))->vdev_mg;
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 
 		VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
 	}
 #endif
 }
 
 /*
  * Ask the class's allocator (msop_alloc) for `size' bytes from `msp' and,
  * on success, move the range from ms_allocatable into this txg's
  * allocating tree.  Returns the start offset, or -1ULL when the
  * allocator found nothing.  The caller must hold ms_lock; the metaslab
  * must not be condensing, disabled or new.  ms_max_size is refreshed
  * whether or not the allocation succeeded.
  */
 static uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(msp->ms_disabled);
 	VERIFY0(msp->ms_new);
 
 	uint64_t start = mc->mc_ops->msop_alloc(msp, size);
 
 	if (start != -1ULL) {
 		metaslab_group_t *mg = msp->ms_group;
 		vdev_t *vd = mg->mg_vd;
 
 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
 
 		/* The range is no longer free, nor a trim candidate. */
 		range_tree_remove(rt, start, size);
 		range_tree_clear(msp->ms_trim, start, size);
 
 		/* First allocation in this txg dirties the metaslab. */
 		range_tree_t *allocating = msp->ms_allocating[txg & TXG_MASK];
 		if (range_tree_is_empty(allocating))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 
 		range_tree_add(allocating, start, size);
 		msp->ms_allocating_total += size;
 
 		/* Remember the txg of the most recent success. */
 		msp->ms_alloc_txg = txg;
 		metaslab_verify_space(msp, txg);
 	}
 
 	/*
 	 * The attempt may have changed the largest allocatable segment,
 	 * so recompute the cached maximum block size.
 	 */
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 
 	return (start);
 }
 
 /*
  * Find the metaslab with the highest weight that is less than what we've
  * already tried.  In the common case, this means that we will examine each
  * metaslab at most once. Note that concurrent callers could reorder metaslabs
  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
  * activated by another thread, and we fail to allocate from the metaslab we
  * have selected, we may not try the newly-activated metaslab, and instead
  * activate another metaslab.  This is not optimal, but generally does not cause
  * any problems (a possible exception being if every metaslab is completely full
  * except for the newly-activated metaslab which we fail to examine).
  *
  * `search' is the AVL lookup key marking where to resume; when a metaslab
  * is returned it is updated so that the next lookup starts just past that
  * metaslab.  `*was_active' is set to whether the examined metaslab
  * currently belongs to an allocator (ms_allocator != -1).  Metaslabs
  * rejected by metaslab_should_allocate() are recorded in `zal' as
  * TRACE_TOO_SMALL.
  *
  * Returns NULL when the tree is exhausted, or, unless `try_hard' is set,
  * once more than zfs_metaslab_find_max_tries candidates were examined.
  */
 static metaslab_t *
 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
     boolean_t *was_active)
 {
 	avl_index_t idx;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	metaslab_t *msp = avl_find(t, search, &idx);
 	/* No exact match for the key: start at the nearest entry after it. */
 	if (msp == NULL)
 		msp = avl_nearest(t, idx, AVL_AFTER);
 
 	uint_t tries = 0;
 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 		int i;
 
 		/* Bound the search unless the caller is trying hard. */
 		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
 			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
 			return (NULL);
 		}
 		tries++;
 
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			continue;
 		}
 
 		/*
 		 * If the selected metaslab is condensing or disabled, or
 		 * hasn't gone through a metaslab_sync_done(), then skip it.
 		 */
 		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
 			continue;
 
 		*was_active = msp->ms_allocator != -1;
 		/*
 		 * If we're activating as primary, this is our first allocation
 		 * from this disk, so we don't need to check how close we are.
 		 * If the metaslab under consideration was already active,
 		 * we're getting desperate enough to steal another allocator's
 		 * metaslab, so we still don't care about distances.
 		 */
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
 			break;
 
 		for (i = 0; i < d; i++) {
 			if (want_unique &&
 			    !metaslab_is_unique(msp, &dva[i]))
 				break;  /* try another metaslab */
 		}
 		/*
 		 * i == d means no earlier DVA conflicted with this metaslab
 		 * (trivially so when uniqueness was not requested).
 		 */
 		if (i == d)
 			break;
 	}
 
 	/* Advance the search key just past the metaslab being returned. */
 	if (msp != NULL) {
 		search->ms_weight = msp->ms_weight;
 		search->ms_start = msp->ms_start + 1;
 		search->ms_allocator = msp->ms_allocator;
 		search->ms_primary = msp->ms_primary;
 	}
 	return (msp);
 }
 
 /*
  * Consistency check of a metaslab's activation state, performed only
  * when ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags and the weight has
  * at least one activation bit.  Exactly one of PRIMARY, SECONDARY and
  * CLAIM may be set; PRIMARY and SECONDARY require an owning allocator
  * with ms_primary matching, while CLAIM requires that there is none.
  * The caller must hold ms_lock.
  */
 static void
 metaslab_active_mask_verify(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0 ||
 	    (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
 		return;
 
 	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(msp->ms_primary);
 	} else if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(!msp->ms_primary);
 	} else if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY3S(msp->ms_allocator, ==, -1);
 	}
 }
 
 /*
  * Try to allocate `asize' bytes in `txg' from one metaslab of group `mg'
  * on behalf of `allocator'.  Returns the offset produced by
  * metaslab_block_alloc(), or -1ULL when no further metaslab could be
  * selected.
  *
  * `dva' holds the DVAs already allocated for this block and `d' is the
  * index being allocated.  The number of earlier DVAs on this vdev picks
  * the activation weight (none: primary, one: secondary, more: claim);
  * together with `want_unique' they also constrain which metaslabs
  * find_valid_metaslab() accepts.  Attempts are recorded in `zal'.
  */
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 
 	/* Step the weight down once per earlier DVA found on this vdev. */
 	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
 	for (int i = 0; i < d; i++) {
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_CLAIM;
 			break;
 		}
 	}
 
 	/*
 	 * If we don't have enough metaslabs active to fill the entire array, we
 	 * just use the 0th slot.
 	 */
 	if (mg->mg_ms_ready < mg->mg_allocators * 3)
 		allocator = 0;
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 
 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
 
 	/*
 	 * Scratch metaslab used only as the AVL search key; it is freed on
 	 * both exits from this function.
 	 */
 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
 	search->ms_weight = UINT64_MAX;
 	search->ms_start = 0;
 	/*
 	 * At the end of the metaslab tree are the already-active metaslabs,
 	 * first the primaries, then the secondaries. When we resume searching
 	 * through the tree, we need to consider ms_allocator and ms_primary so
 	 * we start in the location right after where we left off, and don't
 	 * accidentally loop forever considering the same metaslabs.
 	 */
 	search->ms_allocator = -1;
 	search->ms_primary = B_TRUE;
 	for (;;) {
 		boolean_t was_active = B_FALSE;
 
 		mutex_enter(&mg->mg_lock);
 
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    mga->mga_primary != NULL) {
 			msp = mga->mga_primary;
 
 			/*
 			 * Even though we don't hold the ms_lock for the
 			 * primary metaslab, those fields should not
 			 * change while we hold the mg_lock. Thus it is
 			 * safe to make assertions on them.
 			 */
 			ASSERT(msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    mga->mga_secondary != NULL) {
 			msp = mga->mga_secondary;
 
 			/*
 			 * See comment above about the similar assertions
 			 * for the primary metaslab.
 			 */
 			ASSERT(!msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else {
 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
 			    want_unique, asize, allocator, try_hard, zal,
 			    search, &was_active);
 		}
 
 		mutex_exit(&mg->mg_lock);
 		/* Nothing left to try in this group. */
 		if (msp == NULL) {
 			kmem_free(search, sizeof (*search));
 			return (-1ULL);
 		}
 		mutex_enter(&msp->ms_lock);
 
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * This code is disabled because of issues with
 		 * tracepoints in non-gpl kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE3(ms__activation__attempt,
 		    metaslab_t *, msp, uint64_t, activation_weight,
 		    boolean_t, was_active);
 #endif
 
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock. We check the
 		 * active status first to see if we need to select
 		 * a new metaslab.
 		 */
 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If the metaslab was activated for another allocator
 		 * while we were waiting in the ms_lock above, or it's
 		 * a primary and we're seeking a secondary (or vice versa),
 		 * we go back and select a new metaslab.
 		 */
 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    (msp->ms_allocator != -1) &&
 		    (msp->ms_allocator != allocator || ((activation_weight ==
 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
 			ASSERT(msp->ms_loaded);
 			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
 			    msp->ms_allocator != -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * This metaslab was used for claiming regions allocated
 		 * by the ZIL during pool import. Once these regions are
 		 * claimed we don't need to keep the CLAIM bit set
 		 * anymore. Passivate this metaslab to zero its activation
 		 * mask.
 		 */
 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
 			ASSERT(msp->ms_loaded);
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_WEIGHT_CLAIM);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		metaslab_set_selected_txg(msp, txg);
 
 		int activation_error =
 		    metaslab_activate(msp, allocator, activation_weight);
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * If the metaslab was activated by another thread for
 		 * another allocator or activation_weight (EBUSY), or it
 		 * failed because another metaslab was assigned as primary
 		 * for this allocator (EEXIST) we continue using this
 		 * metaslab for our allocation, rather than going on to a
 		 * worse metaslab (we waited for that metaslab to be loaded
 		 * after all).
 		 *
 		 * If the activation failed due to an I/O error or ENOSPC we
 		 * skip to the next metaslab.
 		 */
 		boolean_t activated;
 		if (activation_error == 0) {
 			activated = B_TRUE;
 		} else if (activation_error == EBUSY ||
 		    activation_error == EEXIST) {
 			activated = B_FALSE;
 		} else {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * Now that we have the lock, recheck to see if we should
 		 * continue to use this metaslab for this allocation. The
 		 * metaslab is now loaded so metaslab_should_allocate()
 		 * can accurately determine if the allocation attempt should
 		 * proceed.
 		 */
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			/* Passivate this metaslab and select a new one. */
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			goto next;
 		}
 
 		/*
 		 * If this metaslab is currently condensing then pick again
 		 * as we can't manipulate this metaslab until it's committed
 		 * to disk. If this metaslab is being initialized, we shouldn't
 		 * allocate from it since the allocated region might be
 		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_CONDENSING, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		} else if (msp->ms_disabled > 0) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_DISABLED, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		offset = metaslab_block_alloc(msp, asize, txg);
 		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 
 		/* Success: leave the loop with ms_lock still held. */
 		if (offset != -1ULL) {
 			/* Proactively passivate the metaslab, if needed */
 			if (activated)
 				metaslab_segment_may_passivate(msp);
 			break;
 		}
 next:
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * This code is disabled because of issues with
 		 * tracepoints in non-gpl kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
 		    uint64_t, asize);
 #endif
 
 		/*
 		 * We were unable to allocate from this metaslab so determine
 		 * a new weight for this metaslab. Now that we have loaded
 		 * the metaslab we can provide a better hint to the metaslab
 		 * selector.
 		 *
 		 * For space-based metaslabs, we use the maximum block size.
 		 * This information is only available when the metaslab
 		 * is loaded and is more accurate than the generic free
 		 * space weight that was calculated by metaslab_weight().
 		 * This information allows us to quickly compare the maximum
 		 * available allocation in the metaslab to the allocation
 		 * size being requested.
 		 *
 		 * For segment-based metaslabs, determine the new weight
 		 * based on the highest bucket in the range tree. We
 		 * explicitly use the loaded segment weight (i.e. the range
 		 * tree histogram) since it contains the space that is
 		 * currently available for allocation and is accurate
 		 * even within a sync pass.
 		 */
 		uint64_t weight;
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 			weight = metaslab_largest_allocatable(msp);
 			WEIGHT_SET_SPACEBASED(weight);
 		} else {
 			weight = metaslab_weight_from_range_tree(msp);
 		}
 
 		if (activated) {
 			metaslab_passivate(msp, weight);
 		} else {
 			/*
 			 * For the case where we use the metaslab that is
 			 * active for another allocator we want to make
 			 * sure that we retain the activation mask.
 			 *
 			 * Note that we could attempt to use something like
 			 * metaslab_recalculate_weight_and_sort() that
 			 * retains the activation mask here. That function
 			 * uses metaslab_weight() to set the weight though
 			 * which is not as accurate as the calculations
 			 * above.
 			 */
 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 			metaslab_group_sort(mg, msp, weight);
 		}
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * We have just failed an allocation attempt, check
 		 * that metaslab_should_allocate() agrees. Otherwise,
 		 * we may end up in an infinite loop retrying the same
 		 * metaslab.
 		 */
 		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
 
 		mutex_exit(&msp->ms_lock);
 	}
 	/* Reached only via the success break above, with ms_lock held. */
 	mutex_exit(&msp->ms_lock);
 	kmem_free(search, sizeof (*search));
 	return (offset);
 }
 
 /*
  * Wrapper around metaslab_group_alloc_normal() that maintains the
  * group's allocation statistics under mg_lock.  Every call counts as an
  * allocation; a failed one (-1ULL) is also counted as a failure and
  * traced as TRACE_GROUP_FAILURE.  Returns the offset unchanged.
  */
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
     int allocator, boolean_t try_hard)
 {
 	uint64_t offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
 	    want_unique, dva, d, allocator, try_hard);
 
 	mutex_enter(&mg->mg_lock);
 
 	if (offset == -1ULL) {
 		mg->mg_failed_allocations++;
 		metaslab_trace_add(zal, mg, NULL, asize, d,
 		    TRACE_GROUP_FAILURE, allocator);
 
 		/*
 		 * Failing to allocate even the minimum gang block size
 		 * means the group must be out of space.  Flag it so the
 		 * allocation throttle starts skipping this group until
 		 * more space becomes available.  The throttle cannot be
 		 * the cause of this failure: it only skips devices, it
 		 * never fails block allocations.
 		 */
 		if (asize == SPA_GANGBLOCKSIZE)
 			mg->mg_no_free_space = B_TRUE;
 	}
 
 	mg->mg_allocations++;
 
 	mutex_exit(&mg->mg_lock);
 
 	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  *
  * Fills in dva[d] (vdev, offset, gang bit, asize) and returns 0 on
  * success.  Returns ENOSPC when every metaslab group reachable from the
  * starting point has been tried without success, in which case dva[d] is
  * zeroed, or when ganging is being forced for testing, in which case
  * dva[d] is left untouched.  Each decision along the way is recorded in
  * `zal'.
  */
 int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
     zio_alloc_list_t *zal, int allocator)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
 	boolean_t try_hard = B_FALSE;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 * This will result in more split blocks when using device removal,
 	 * and a large number of split blocks coupled with ztest-induced
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
 	if (psize >= metaslab_force_ganging &&
 	    metaslab_force_ganging_pct > 0 &&
 	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mca_rotor or mca_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
 	 * If we are doing ditto or log blocks, try to spread them across
 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
 	 * allocated all of our ditto blocks, then try and spread them out on
 	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
 	 * able to reason about.  Otherwise, any two top-level vdev failures
 	 * will guarantee the loss of data.  With consecutive allocation,
 	 * only two adjacent top-level vdev failures will result in data loss.
 	 *
 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
 	 * ourselves on the same vdev as our gang block header.  That
 	 * way, we can hope for locality in vdev_cache, plus it makes our
 	 * fault domains something tractable.
 	 */
 	if (hintdva) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 
 		/*
 		 * It's possible the vdev we're using as the hint no
 		 * longer exists or its mg has been closed (e.g. by
 		 * device removal).  Consult the rotor when
 		 * all else fails.
 		 */
 		if (vd != NULL && vd->vdev_mg != NULL) {
 			mg = vdev_get_mg(vd, mc);
 
 			if (flags & METASLAB_HINTBP_AVOID)
 				mg = mg->mg_next;
 		} else {
 			mg = mca->mca_rotor;
 		}
 	} else if (d != 0) {
 		/* Continue with the group after the previous DVA's vdev. */
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
 	} else {
 		ASSERT(mca->mca_rotor != NULL);
 		mg = mca->mca_rotor;
 	}
 
 	/*
 	 * If the hint put us into the wrong metaslab class, or into a
 	 * metaslab group that has been passivated, just follow the rotor.
 	 */
 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mca->mca_rotor;
 
 	/* `rotor' marks where the walk started, so one full lap can end. */
 	rotor = mg;
 top:
 	do {
 		boolean_t allocatable;
 
 		ASSERT(mg->mg_activation_count == 1);
 		vd = mg->mg_vd;
 
 		/*
 		 * Don't allocate from faulted devices.
 		 */
 		if (try_hard) {
 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 			allocatable = vdev_allocatable(vd);
 			spa_config_exit(spa, SCL_ZIO, FTAG);
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
 
 		/*
 		 * Determine if the selected metaslab group is eligible
 		 * for allocations. If we're ganging then don't allow
 		 * this metaslab group to skip allocations since that would
 		 * inadvertently return ENOSPC and suspend the pool
 		 * even though space is still available.
 		 */
 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
 			allocatable = metaslab_group_allocatable(mg, rotor,
 			    flags, psize, allocator, d);
 		}
 
 		if (!allocatable) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_NOT_ALLOCATABLE, allocator);
 			goto next;
 		}
 
 		/*
 		 * Avoid writing single-copy data to an unhealthy,
 		 * non-redundant vdev, unless we've already tried all
 		 * other vdevs.
 		 */
 		if (vd->vdev_state < VDEV_STATE_HEALTHY &&
 		    d == 0 && !try_hard && vd->vdev_children == 0) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_VDEV_ERROR, allocator);
 			goto next;
 		}
 
 		ASSERT(mg->mg_class == mc);
 
 		/* Convert to this vdev's allocated size (ashift-aligned). */
 		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
 		/*
 		 * If we don't need to try hard, then require that the
 		 * block be on a different metaslab from any other DVAs
 		 * in this BP (unique=true).  If we are trying hard, then
 		 * allow any metaslab to be used (unique=false).
 		 */
 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
 		    !try_hard, dva, d, allocator, try_hard);
 
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 *
 			 * Bias is also used to compensate for unequally
 			 * sized vdevs so that space is allocated fairly.
 			 */
 			if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vs_free = vs->vs_space - vs->vs_alloc;
 				int64_t mc_free = mc->mc_space - mc->mc_alloc;
 				int64_t ratio;
 
 				/*
 				 * Calculate how much more or less we should
 				 * try to allocate from this device during
 				 * this iteration around the rotor.
 				 *
 				 * This basically introduces a zero-centered
 				 * bias towards the devices with the most
 				 * free space, while compensating for vdev
 				 * size differences.
 				 *
 				 * Examples:
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 16M/128M
 				 *  ratio(V1) = 100% ratio(V2) = 100%
 				 *
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 64M/128M
 				 *  ratio(V1) = 127% ratio(V2) =  72%
 				 *
 				 *  vdev V1 = 16M/128M
 				 *  vdev V2 = 64M/512M
 				 *  ratio(V1) =  40% ratio(V2) = 160%
 				 */
 				/* The + 1 keeps the divisor non-zero. */
 				ratio = (vs_free * mc->mc_alloc_groups * 100) /
 				    (mc_free + 1);
 				mg->mg_bias = ((ratio - 100) *
 				    (int64_t)mg->mg_aliquot) / 100;
 			} else if (!metaslab_bias_enabled) {
 				mg->mg_bias = 0;
 			}
 
 			/*
 			 * Advance the rotor after every ZIL allocation, or
 			 * once this group's (biased) aliquot is used up.
 			 */
 			if ((flags & METASLAB_ZIL) ||
 			    atomic_add_64_nv(&mca->mca_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mca->mca_rotor = mg->mg_next;
 				mca->mca_aliquot = 0;
 			}
 
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
 			DVA_SET_OFFSET(&dva[d], offset);
 			DVA_SET_GANG(&dva[d],
 			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
 			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
 next:
 		/* This group did not work out; move the rotor past it. */
 		mca->mca_rotor = mg->mg_next;
 		mca->mca_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
 	 * If we haven't tried hard, perhaps do so now.
 	 */
 	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
 	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
 	    psize <= 1 << spa->spa_min_ashift)) {
 		METASLABSTAT_BUMP(metaslabstat_try_hard);
 		try_hard = B_TRUE;
 		goto top;
 	}
 
 	/* Out of options: hand back a zeroed DVA along with ENOSPC. */
 	memset(&dva[d], 0, sizeof (dva_t));
 
 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
 	return (SET_ERROR(ENOSPC));
 }
 
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
     boolean_t checkpoint)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
 	metaslab_check_free_impl(vd, offset, asize);
 
 	mutex_enter(&msp->ms_lock);
 	if (range_tree_is_empty(msp->ms_freeing) &&
 	    range_tree_is_empty(msp->ms_checkpointing)) {
 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
 	}
 
 	if (checkpoint) {
 		ASSERT(spa_has_checkpoint(spa));
 		range_tree_add(msp->ms_checkpointing, offset, asize);
 	} else {
 		range_tree_add(msp->ms_freeing, offset, asize);
 	}
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	boolean_t *checkpoint = arg;
 
 	ASSERT3P(checkpoint, !=, NULL);
 
 	if (vd->vdev_ops->vdev_op_remap != NULL)
 		vdev_indirect_mark_obsolete(vd, offset, size);
 	else
 		metaslab_free_impl(vd, offset, size, *checkpoint);
 }
 
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
     boolean_t checkpoint)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
 		return;
 
 	if (spa->spa_vdev_removal != NULL &&
 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
 	    vdev_is_concrete(vd)) {
 		/*
 		 * Note: we check if the vdev is concrete because when
 		 * we complete the removal, we first change the vdev to be
 		 * an indirect vdev (in open context), and then (in syncing
 		 * context) clear spa_vdev_removal.
 		 */
 		free_from_removing_vdev(vd, offset, size);
 	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vdev_indirect_mark_obsolete(vd, offset, size);
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_free_impl_cb, &checkpoint);
 	} else {
 		metaslab_free_concrete(vd, offset, size, checkpoint);
 	}
 }
 
 typedef struct remap_blkptr_cb_arg {
 	blkptr_t *rbca_bp;
 	spa_remap_cb_t rbca_cb;
 	vdev_t *rbca_remap_vd;
 	uint64_t rbca_remap_offset;
 	void *rbca_cb_arg;
 } remap_blkptr_cb_arg_t;
 
 static void
 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	remap_blkptr_cb_arg_t *rbca = arg;
 	blkptr_t *bp = rbca->rbca_bp;
 
 	/* We can not remap split blocks. */
 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
 		return;
 	ASSERT0(inner_offset);
 
 	if (rbca->rbca_cb != NULL) {
 		/*
 		 * At this point we know that we are not handling split
 		 * blocks and we invoke the callback on the previous
 		 * vdev which must be indirect.
 		 */
 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
 
 		/* set up remap_blkptr_cb_arg for the next call */
 		rbca->rbca_remap_vd = vd;
 		rbca->rbca_remap_offset = offset;
 	}
 
 	/*
 	 * The phys birth time is that of dva[0].  This ensures that we know
 	 * when each dva was written, so that resilver can determine which
 	 * blocks need to be scrubbed (i.e. those written during the time
 	 * the vdev was offline).  It also ensures that the key used in
 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
 	 * we didn't change the phys_birth, a lookup in the ARC for a
 	 * remapped BP could find the data that was previously stored at
 	 * this vdev + offset.
 	 */
 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
 	    DVA_GET_VDEV(&bp->blk_dva[0]));
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
 	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
 	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
 }
 
 /*
  * If the block pointer contains any indirect DVAs, modify them to refer to
  * concrete DVAs.  Note that this will sometimes not be possible, leaving
  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
  * segments in the mapping (i.e. it is a "split block").
  *
  * If the BP was remapped, calls the callback on the original dva (note the
  * callback can be called multiple times if the original indirect DVA refers
  * to another indirect DVA, etc).
  *
  * Returns TRUE if the BP was remapped.
  */
 boolean_t
 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
 {
 	remap_blkptr_cb_arg_t rbca;
 
 	if (!zfs_remap_blkptr_enable)
 		return (B_FALSE);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return (B_FALSE);
 
 	/*
 	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
 	 */
 	if (BP_GET_DEDUP(bp))
 		return (B_FALSE);
 
 	/*
 	 * Gang blocks can not be remapped, because
 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
 	 * the BP used to read the gang block header (GBH) being the same
 	 * as the DVA[0] that we allocated for the GBH.
 	 */
 	if (BP_IS_GANG(bp))
 		return (B_FALSE);
 
 	/*
 	 * Embedded BP's have no DVA to remap.
 	 */
 	if (BP_GET_NDVAS(bp) < 1)
 		return (B_FALSE);
 
 	/*
 	 * Note: we only remap dva[0].  If we remapped other dvas, we
 	 * would no longer know what their phys birth txg is.
 	 */
 	dva_t *dva = &bp->blk_dva[0];
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	if (vd->vdev_ops->vdev_op_remap == NULL)
 		return (B_FALSE);
 
 	rbca.rbca_bp = bp;
 	rbca.rbca_cb = callback;
 	rbca.rbca_remap_vd = vd;
 	rbca.rbca_remap_offset = offset;
 	rbca.rbca_cb_arg = arg;
 
 	/*
 	 * remap_blkptr_cb() will be called in order for each level of
 	 * indirection, until a concrete vdev is reached or a split block is
 	 * encountered. old_vd and old_offset are updated within the callback
 	 * as we go from the one indirect vdev to the next one (either concrete
 	 * or indirect again) in that order.
 	 */
 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
 
 	/* Check if the DVA wasn't remapped because it is a split block */
 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Undo the allocation of a DVA which happened in the given transaction group.
  */
 void
 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	metaslab_t *msp;
 	vdev_t *vd;
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (txg > spa_freeze_txg(spa))
 		return;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 		zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
 		    (u_longlong_t)vdev, (u_longlong_t)offset,
 		    (u_longlong_t)size);
 		return;
 	}
 
 	ASSERT(!vd->vdev_removing);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
 	    offset, size);
 	msp->ms_allocating_total -= size;
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
 	    msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	range_tree_add(msp->ms_allocatable, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Free the block represented by the given DVA.
  */
 void
 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, vdev);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (DVA_GET_GANG(dva)) {
 		size = vdev_gang_header_asize(vd);
 	}
 
 	metaslab_free_impl(vd, offset, size, checkpoint);
 }
 
 /*
  * Reserve some allocation slots. The reservation system must be called
  * before we call into the allocator. If there aren't any available slots
  * then the I/O will be throttled until an I/O completes and its slots are
  * freed up. The function returns true if it was successful in placing
  * the reservation.
  */
 boolean_t
 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
     zio_t *zio, int flags)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	uint64_t max = mca->mca_alloc_max_slots;
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
 	    zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
 		/*
 		 * The potential race between _count() and _add() is covered
 		 * by the allocator lock in most cases, or irrelevant due to
 		 * GANG_ALLOCATION() or METASLAB_MUST_RESERVE set in others.
 		 * But even if we assume some other non-existing scenario, the
 		 * worst that can happen is few more I/Os get to allocation
 		 * earlier, that is not a problem.
 		 *
 		 * We reserve the slots individually so that we can unreserve
 		 * them individually when an I/O completes.
 		 */
 		zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 void
 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
     int allocator, zio_t *zio)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
 }
 
 static int
 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
     uint64_t txg)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3P(vd->vdev_ms, !=, NULL);
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
 		if (error == EBUSY) {
 			ASSERT(msp->ms_loaded);
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 			error = 0;
 		}
 	}
 
 	if (error == 0 &&
 	    !range_tree_contains(msp->ms_allocatable, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
 	}
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
 	    msp->ms_size);
 	range_tree_remove(msp->ms_allocatable, offset, size);
 	range_tree_clear(msp->ms_trim, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (!multilist_link_active(&msp->ms_class_txg_node)) {
 			msp->ms_selected_txg = txg;
 			multilist_sublist_insert_head(mls, msp);
 		}
 		multilist_sublist_unlock(mls);
 
 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
 		    offset, size);
 		msp->ms_allocating_total += size;
 	}
 
 	mutex_exit(&msp->ms_lock);
 
 	return (0);
 }
 
 typedef struct metaslab_claim_cb_arg_t {
 	uint64_t	mcca_txg;
 	int		mcca_error;
 } metaslab_claim_cb_arg_t;
 
 static void
 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	metaslab_claim_cb_arg_t *mcca_arg = arg;
 
 	if (mcca_arg->mcca_error == 0) {
 		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
 		    size, mcca_arg->mcca_txg);
 	}
 }
 
 int
 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
 {
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		metaslab_claim_cb_arg_t arg;
 
 		/*
 		 * Only zdb(8) can claim on indirect vdevs.  This is used
 		 * to detect leaks of mapped space (that are not accounted
 		 * for in the obsolete counts, spacemap, or bpobj).
 		 */
 		ASSERT(!spa_writeable(vd->vdev_spa));
 		arg.mcca_error = 0;
 		arg.mcca_txg = txg;
 
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_claim_impl_cb, &arg);
 
 		if (arg.mcca_error == 0) {
 			arg.mcca_error = metaslab_claim_concrete(vd,
 			    offset, size, txg);
 		}
 		return (arg.mcca_error);
 	} else {
 		return (metaslab_claim_concrete(vd, offset, size, txg));
 	}
 }
 
 /*
  * Intent log support: upon opening the pool after a crash, notify the SPA
  * of blocks that the intent log has allocated for immediate write, but
  * which are still considered free by the SPA because the last transaction
  * group didn't commit yet.
  */
 static int
 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	return (metaslab_claim_impl(vd, offset, size, txg));
 }
 
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
     zio_alloc_list_t *zal, zio_t *zio, int allocator)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
 	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
 	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	if (mc->mc_allocator[allocator].mca_rotor == NULL) {
 		/* no vdevs in this class */
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 		    txg, flags, zal, allocator);
 		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
 				metaslab_group_alloc_decrement(spa,
 				    DVA_GET_VDEV(&dva[d]), zio, flags,
 				    allocator, B_FALSE);
 				memset(&dva[d], 0, sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
 		} else {
 			/*
 			 * Update the metaslab group's queue depth
 			 * based on the newly allocated dva.
 			 */
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	BP_SET_BIRTH(bp, txg, 0);
 
 	return (0);
 }
 
 void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
 
 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
 	 * the blocks that we free that are part of the checkpoint won't be
 	 * reused until the checkpoint is discarded or we revert to it.
 	 *
 	 * The checkpoint flag is passed down the metaslab_free code path
 	 * and is set whenever we want to add a block to the checkpoint's
 	 * accounting. That is, we "checkpoint" blocks that existed at the
 	 * time the checkpoint was created and are therefore referenced by
 	 * the checkpointed uberblock.
 	 *
 	 * Note that, we don't checkpoint any blocks if the current
 	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
 	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
 		 * there is no way it was created in the current txg.
 		 */
 		ASSERT(!now);
 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
 		checkpoint = B_TRUE;
 	}
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
 	for (int d = 0; d < ndvas; d++) {
 		if (now) {
 			metaslab_unalloc_dva(spa, &dva[d], txg);
 		} else {
 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
 			metaslab_free_dva(spa, &dva[d], checkpoint);
 		}
 	}
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
 }
 
 int
 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 	int error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	if (txg != 0) {
 		/*
 		 * First do a dry run to make sure all DVAs are claimable,
 		 * so we don't have to unwind from partial failures below.
 		 */
 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
 			return (error);
 	}
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_claim_dva(spa, &dva[d], txg);
 		if (error != 0)
 			break;
 	}
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	ASSERT(error == 0 || txg == 0);
 
 	return (error);
 }
 
 static void
 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner, (void) arg;
 
 	if (vd->vdev_ops == &vdev_indirect_ops)
 		return;
 
 	metaslab_check_free_impl(vd, offset, size);
 }
 
 static void
 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	metaslab_t *msp;
 	spa_t *spa __maybe_unused = vd->vdev_spa;
 
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_check_free_impl_cb, NULL);
 		return;
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	if (msp->ms_loaded) {
 		range_tree_verify_not_present(msp->ms_allocatable,
 		    offset, size);
 	}
 
 	/*
 	 * Check all segments that currently exist in the freeing pipeline.
 	 *
 	 * It would intuitively make sense to also check the current allocating
 	 * tree since metaslab_unalloc_dva() exists for extents that are
 	 * allocated and freed in the same sync pass within the same txg.
 	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
 	 * segment but then we free part of it within the same txg
 	 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
 	 * current allocating tree.
 	 */
 	range_tree_verify_not_present(msp->ms_freeing, offset, size);
 	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
 	range_tree_verify_not_present(msp->ms_freed, offset, size);
 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
 		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
 	range_tree_verify_not_present(msp->ms_trim, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		vdev_t *vd = vdev_lookup_top(spa, vdev);
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
 
 		if (DVA_GET_GANG(&bp->blk_dva[i]))
 			size = vdev_gang_header_asize(vd);
 
 		ASSERT3P(vd, !=, NULL);
 
 		metaslab_check_free_impl(vd, offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
 static void
 metaslab_group_disable_wait(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	while (mg->mg_disabled_updating) {
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 	}
 }
 
 static void
 metaslab_group_disabled_increment(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	ASSERT(mg->mg_disabled_updating);
 
 	while (mg->mg_ms_disabled >= max_disabled_ms) {
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 	}
 	mg->mg_ms_disabled++;
 	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
 }
 
 /*
  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
  * We must also track how many metaslabs are currently disabled within a
  * metaslab group and limit them to prevent allocation failures from
  * occurring because all metaslabs are disabled.
  */
 void
 metaslab_disable(metaslab_t *msp)
 {
 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
 	metaslab_group_t *mg = msp->ms_group;
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 
 	/*
 	 * To keep an accurate count of how many threads have disabled
 	 * a specific metaslab group, we only allow one thread to mark
 	 * the metaslab group at a time. This ensures that the value of
 	 * ms_disabled will be accurate when we decide to mark a metaslab
 	 * group as disabled. To do this we force all other threads
 	 * to wait till the metaslab's mg_disabled_updating flag is no
 	 * longer set.
 	 */
 	metaslab_group_disable_wait(mg);
 	mg->mg_disabled_updating = B_TRUE;
 	if (msp->ms_disabled == 0) {
 		metaslab_group_disabled_increment(mg);
 	}
 	mutex_enter(&msp->ms_lock);
 	msp->ms_disabled++;
 	mutex_exit(&msp->ms_lock);
 
 	mg->mg_disabled_updating = B_FALSE;
 	cv_broadcast(&mg->mg_ms_disabled_cv);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 void
 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	spa_t *spa = mg->mg_vd->vdev_spa;
 
 	/*
 	 * Wait for the outstanding IO to be synced to prevent newly
 	 * allocated blocks from being overwritten.  This used by
 	 * initialize and TRIM which are modifying unallocated space.
 	 */
 	if (sync)
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 	mutex_enter(&msp->ms_lock);
 	if (--msp->ms_disabled == 0) {
 		mg->mg_ms_disabled--;
 		cv_broadcast(&mg->mg_ms_disabled_cv);
 		if (unload)
 			metaslab_unload(msp);
 	}
 	mutex_exit(&msp->ms_lock);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 void
 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
 {
 	ms->ms_unflushed_dirty = dirty;
 }
 
 static void
 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 {
 	vdev_t *vd = ms->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	metaslab_unflushed_phys_t entry = {
 		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
 	};
 	uint64_t entry_size = sizeof (entry);
 	uint64_t entry_offset = ms->ms_id * entry_size;
 
 	uint64_t object = 0;
 	int err = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &object);
 	if (err == ENOENT) {
 		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
 		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 		    &object, tx));
 	} else {
 		VERIFY0(err);
 	}
 
 	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
 	    &entry, tx);
 }
 
 void
 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
 {
 	ms->ms_unflushed_txg = txg;
 	metaslab_update_ondisk_flush_data(ms, tx);
 }
 
 boolean_t
 metaslab_unflushed_dirty(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_dirty);
 }
 
 uint64_t
 metaslab_unflushed_txg(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_txg);
 }
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
 	"Allocation granularity (a.k.a. stripe size)");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
 	"Load all metaslabs when pool is first opened");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
 	"Prevent metaslabs from being unloaded");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
 	"Preload potential metaslabs during reassessment");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
 	"Max number of metaslabs per group to preload");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
 	"Delay in txgs after metaslab was last used before unloading");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
 	"Delay in milliseconds after metaslab was last used before unloading");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be free to make it "
 	"eligible for allocation");
 
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be considered eligible "
 	"for allocations unless all metaslab groups within the metaslab class "
 	"have also crossed this threshold");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
 	ZMOD_RW,
 	"Use the fragmentation metric to prefer less fragmented metaslabs");
-/* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
 	ZMOD_RW, "Fragmentation for metaslab to allow allocation");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
 	"Prefer metaslabs with lower LBAs");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
 	"Enable metaslab group biasing");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
 	ZMOD_RW, "Enable segment-based metaslab selection");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
 	"Blocks larger than this size are sometimes forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
 	"Percentage of large blocks that will be forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
 	"When looking in size tree, use largest segment instead of exact fit");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
 	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
 	"Percentage of memory that can be used to store metaslab range trees");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 	ZMOD_RW, "Try hard to allocate before ganging");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 	"Normally only consider this many of the best metaslabs in each vdev");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
 	param_set_active_allocator, param_get_charp, ZMOD_RW,
 	"SPA active allocator");
-/* END CSTYLED */
diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c
index 71122542758d..493884cf04c4 100644
--- a/module/zfs/mmp.c
+++ b/module/zfs/mmp.c
@@ -1,749 +1,747 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
  */
 
 #include <sys/abd.h>
 #include <sys/mmp.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/time.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/zfs_context.h>
 #include <sys/callb.h>
 
 /*
  * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
  * or opening a pool on more than one host at a time.  In particular, it
  * prevents "zpool import -f" on a host from succeeding while the pool is
  * already imported on another host.  There are many other ways in which a
  * device could be used by two hosts for different purposes at the same time
  * resulting in pool damage.  This implementation does not attempt to detect
  * those cases.
  *
  * MMP operates by ensuring there are frequent visible changes on disk (a
  * "heartbeat") at all times.  And by altering the import process to check
  * for these changes and failing the import when they are detected.  This
  * functionality is enabled by setting the 'multihost' pool property to on.
  *
  * Uberblocks written by the txg_sync thread always go into the first
  * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
  * They are used to hold uberblocks which are exactly the same as the last
  * synced uberblock except that the ub_timestamp and mmp_config are frequently
  * updated.  Like all other uberblocks, the slot is written with an embedded
  * checksum, and slots with invalid checksums are ignored.  This provides the
  * "heartbeat", with no risk of overwriting good uberblocks that must be
  * preserved, e.g. previous txgs and associated block pointers.
  *
  * Three optional fields are added to uberblock structure; ub_mmp_magic,
  * ub_mmp_config, and ub_mmp_delay.  The ub_mmp_magic value allows zfs to tell
  * whether the other ub_mmp_* fields are valid.  The ub_mmp_config field tells
  * the importing host the settings of zfs_multihost_interval and
  * zfs_multihost_fail_intervals on the host which last had (or currently has)
  * the pool imported.  These determine how long a host must wait to detect
  * activity in the pool, before concluding the pool is not in use.  The
  * mmp_delay field is a decaying average of the amount of time between
  * completion of successive MMP writes, in nanoseconds.  It indicates whether
  * MMP is enabled.
  *
  * During import an activity test may now be performed to determine if
  * the pool is in use.  The activity test is typically required if the
  * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
  * POOL_STATE_ACTIVE, and the pool is not a root pool.
  *
  * The activity test finds the "best" uberblock (highest txg, timestamp, and, if
  * ub_mmp_magic is valid, sequence number from ub_mmp_config).  It then waits
  * some time, and finds the "best" uberblock again.  If any of the mentioned
  * fields have different values in the newly read uberblock, the pool is in use
  * by another host and the import fails.  In order to assure the accuracy of the
  * activity test, the default values result in an activity test duration of 20x
  * the mmp write interval.
  *
  * The duration of the "zpool import" activity test depends on the information
  * available in the "best" uberblock:
  *
  * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
  *    ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
  *
  *    In this case, a weak guarantee is provided.  Since the host which last had
  *    the pool imported will suspend the pool if no mmp writes land within
  *    fail_intervals * multihost_interval ms, the absence of writes during that
  *    time means either the pool is not imported, or it is imported but the pool
  *    is suspended and no further writes will occur.
  *
  *    Note that resuming the suspended pool on the remote host would invalidate
  *    this guarantee, and so it is not allowed.
  *
  *    The factor of 2 provides a conservative safety factor and derives from
  *    MMP_IMPORT_SAFETY_FACTOR;
  *
  * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
  *    (ub_mmp_config.multihost_interval + ub_mmp_delay) *
  *        zfs_multihost_import_intervals
  *
  *    In this case no guarantee can be provided.  However, as long as some
  *    devices are healthy and connected, it is likely that at least one write
  *    will land within (multihost_interval + mmp_delay) because
  *    multihost_interval is enough time for a write to be attempted to each
  *    leaf vdev, and mmp_delay is enough for one to land, based on past delays.
  *    Multiplying by zfs_multihost_import_intervals provides a conservative
  *    safety factor.
  *
  * 3) If uberblock was written by zfs-0.7:
  *    (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
  *
  *    The same logic as case #2 applies, but we do not know remote tunables.
  *
  *    We use the local value for zfs_multihost_interval because the original MMP
  *    did not record this value in the uberblock.
  *
  *    ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
  *    has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
  *    that.  We will have waited enough time for zfs_multihost_import_intervals
  *    writes to be issued and all but one to land.
  *
  *    single device pool example delays
  *
  *    import_delay = (1 + 1) * 20   =  40s #defaults, no I/O delay
  *    import_delay = (1 + 10) * 20  = 220s #defaults, 10s I/O delay
  *    import_delay = (10 + 10) * 20 = 400s #10s multihost_interval,
  *                                          no I/O delay
  *    100 device pool example delays
  *
  *    import_delay = (1 + .01) * 20 =  20s #defaults, no I/O delay
  *    import_delay = (1 + 10) * 20  = 220s #defaults, 10s I/O delay
  *    import_delay = (10 + .1) * 20 = 202s #10s multihost_interval,
  *                                          no I/O delay
  *
  * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
  *    zfs_multihost_import_intervals * zfs_multihost_interval
  *
  *    In this case local tunables are used.  By default this product = 10s, long
  *    enough for a pool with any activity at all to write at least one
  *    uberblock.  No guarantee can be provided.
  *
  * Additionally, the duration is then extended by a random 25% to attempt to
  * detect simultaneous imports.  For example, if both partner hosts are rebooted
  * at the same time and automatically attempt to import the pool.
  */
 
 /*
  * Used to control the frequency of mmp writes which are performed when the
  * 'multihost' pool property is on.  This is one factor used to determine the
  * length of the activity check during import.
  *
  * On average an mmp write will be issued for each leaf vdev every
  * zfs_multihost_interval milliseconds.  In practice, the observed period can
  * vary with the I/O load and this observed value is the ub_mmp_delay which is
  * stored in the uberblock.  The minimum allowed value is 100 ms.
  */
 uint64_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
 
 /*
  * Used to control the duration of the activity test on import.  Smaller values
  * of zfs_multihost_import_intervals will reduce the import time but increase
  * the risk of failing to detect an active pool.  The total activity check time
  * is never allowed to drop below one second.  A value of 0 is ignored and
  * treated as if it was set to 1.
  */
 uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
 
 /*
  * Controls the behavior of the pool when mmp write failures or delays are
  * detected.
  *
  * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
  * ignored.  The failures will still be reported to the ZED which depending on
  * its configuration may take action such as suspending the pool or taking a
  * device offline.
  *
  * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
  * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
  * without a successful mmp write.  This guarantees the activity test will see
  * mmp writes if the pool is imported.  A value of 1 is ignored and treated as
  * if it was set to 2, because a single leaf vdev pool will issue a write once
  * per multihost_interval and thus any variation in latency would cause the
  * pool to be suspended.
  */
 uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
 
 /*
  * Tag for the SCL_STATE config-lock hold taken around each MMP write; the
  * same tag is passed to spa_config_enter_mmp() and spa_config_exit().
  */
 static const void *const mmp_tag = "mmp_write_uberblock";
 /* Forward declaration: mmp_thread_start() refers to it before its body. */
 static __attribute__((noreturn)) void mmp_thread(void *arg);
 
 /*
  * Prepare the MMP state embedded in the spa: both mutexes, the thread
  * condition variable, and the initial kstat id (which starts at 1).
  */
 void
 mmp_init(spa_t *spa)
 {
 	mmp_thread_t *mts = &spa->spa_mmp;
 
 	/* Synchronization used to start, wake and stop the thread. */
 	mutex_init(&mts->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&mts->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Lock taken by the write path around the shared MMP state. */
 	mutex_init(&mts->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mts->mmp_kstat_id = 1;
 }
 
 /*
  * Tear down the synchronization objects created by mmp_init().
  */
 void
 mmp_fini(spa_t *spa)
 {
 	mmp_thread_t *mts = &spa->spa_mmp;
 
 	/* Thread lifecycle lock and its condition variable. */
 	mutex_destroy(&mts->mmp_thread_lock);
 	cv_destroy(&mts->mmp_thread_cv);
 
 	/* Lock used by the write path. */
 	mutex_destroy(&mts->mmp_io_lock);
 }
 
 /*
  * Entry bookkeeping for the MMP thread: initializes the caller's callb_cpr_t
  * via CALLB_CPR_INIT (tied to mmp_thread_lock) and then acquires
  * mmp_thread_lock, which is still held when this function returns.
  */
 static void
 mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
 {
 	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
 	mutex_enter(&mmp->mmp_thread_lock);
 }
 
 /*
  * Exit bookkeeping for the MMP thread: clears the thread pointer *mpp,
  * wakes anyone waiting on mmp_thread_cv (mmp_thread_stop() waits there for
  * the pointer to become NULL), and finishes the CPR state, which also
  * releases mmp_thread_lock.
  */
 static void
 mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
 {
 	ASSERT(*mpp != NULL);
 	*mpp = NULL;
 	cv_broadcast(&mmp->mmp_thread_cv);
 	CALLB_CPR_EXIT(cpr);		/* drops &mmp->mmp_thread_lock */
 }
 
 /*
  * Start the MMP thread for the pool.  Does nothing when the pool is not
  * writeable or when a thread has already been created.
  */
 void
 mmp_thread_start(spa_t *spa)
 {
 	mmp_thread_t *mmp = &spa->spa_mmp;
 
 	if (spa_writeable(spa)) {
 		mutex_enter(&mmp->mmp_thread_lock);
 		if (!mmp->mmp_thread) {
 			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
 			    spa, 0, &p0, TS_RUN, defclsyspri);
 			/*
 			 * Cast so the argument type matches %llu, as the
 			 * other debug messages in this file do.
 			 */
 			zfs_dbgmsg("MMP thread started pool '%s' "
 			    "gethrtime %llu", spa_name(spa),
 			    (u_longlong_t)gethrtime());
 		}
 		mutex_exit(&mmp->mmp_thread_lock);
 	}
 }
 
 /*
  * Ask the MMP thread to exit and wait until it has done so.  The thread
  * clears mmp_thread and broadcasts on mmp_thread_cv from mmp_thread_exit();
  * once that is observed the exiting flag is reset.
  */
 void
 mmp_thread_stop(spa_t *spa)
 {
 	mmp_thread_t *mmp = &spa->spa_mmp;
 
 	mutex_enter(&mmp->mmp_thread_lock);
 	mmp->mmp_thread_exiting = 1;
 	/* Wake the thread if it is sleeping between writes. */
 	cv_broadcast(&mmp->mmp_thread_cv);
 
 	while (mmp->mmp_thread) {
 		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
 	}
 	mutex_exit(&mmp->mmp_thread_lock);
 	/* Cast so the argument type matches the %llu conversion. */
 	zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
 	    spa_name(spa), (u_longlong_t)gethrtime());
 
 	ASSERT(mmp->mmp_thread == NULL);
 	mmp->mmp_thread_exiting = 0;
 }
 
 /* Bits OR-ed together into the failure mask returned by mmp_next_leaf(). */
 typedef enum mmp_vdev_state_flag {
 	/* A leaf was unwritable, offline, or detached. */
 	MMP_FAIL_NOT_WRITABLE	= (1 << 0),
 	/* A leaf still has an MMP write outstanding. */
 	MMP_FAIL_WRITE_PENDING	= (1 << 1),
 } mmp_vdev_state_flag_t;
 
 /*
  * Find a leaf vdev to write an MMP block to.  It must not have an outstanding
  * mmp write (if so a new write will also likely block).  If there is no usable
  * leaf, a nonzero error value is returned. The error value returned is a bit
  * field.
  *
  * MMP_FAIL_WRITE_PENDING   One or more leaf vdevs are writeable, but have an
  *                          outstanding MMP write.
  * MMP_FAIL_NOT_WRITABLE    One or more leaf vdevs are not writeable.
  */
 
 static int
 mmp_next_leaf(spa_t *spa)
 {
 	vdev_t *leaf, *first_leaf;
 	int fail_mask = 0;
 
 	ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
 	ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
 	ASSERT(!list_is_empty(&spa->spa_leaf_list));
 
 	/* The leaf list changed generation: restart from its head. */
 	if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
 		spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
 		spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
 	}
 
 	leaf = spa->spa_mmp.mmp_last_leaf;
 	if (leaf == NULL)
 		leaf = list_head(&spa->spa_leaf_list);
 	first_leaf = leaf;
 
 	/*
 	 * Walk the list circularly, beginning with the leaf after the one
 	 * used last time, until a usable leaf is found or we arrive back at
 	 * the starting point.
 	 */
 	do {
 		leaf = list_next(&spa->spa_leaf_list, leaf);
 		if (leaf == NULL) {
 			/* Ran off the tail; wrap around to the head. */
 			leaf = list_head(&spa->spa_leaf_list);
 			ASSERT3P(leaf, !=, NULL);
 		}
 
 		/*
 		 * Unwritable, offline, and detached devices are not legal
 		 * targets, or a write to them may fail or not be seen by
 		 * other hosts.
 		 */
 		if (!vdev_writeable(leaf) || leaf->vdev_offline ||
 		    leaf->vdev_detached) {
 			fail_mask |= MMP_FAIL_NOT_WRITABLE;
 			continue;
 		}
 
 		/*
 		 * dRAID spares can never be written, so skipping one does
 		 * not set any bit in the fail mask.
 		 */
 		if (leaf->vdev_ops == &vdev_draid_spare_ops)
 			continue;
 
 		if (leaf->vdev_mmp_pending != 0) {
 			fail_mask |= MMP_FAIL_WRITE_PENDING;
 			continue;
 		}
 
 		/* Usable: remember it as the most recently chosen leaf. */
 		spa->spa_mmp.mmp_last_leaf = leaf;
 		return (0);
 	} while (leaf != first_leaf);
 
 	ASSERT(fail_mask);
 
 	return (fail_mask);
 }
 
 /*
  * MMP writes are issued on a fixed schedule, but may complete at variable,
  * much longer, intervals.  The mmp_delay captures long periods between
  * successful writes for any reason, including disk latency, scheduling delays,
  * etc.
  *
  * The mmp_delay is usually calculated as a decaying average, but if the latest
  * delay is higher we do not average it, so that we do not hide sudden spikes
  * which the importing host must wait for.
  *
  * If writes are occurring frequently, such as due to a high rate of txg syncs,
  * the mmp_delay could become very small.  Since those short delays depend on
  * activity we cannot count on, we never allow mmp_delay to get lower than rate
  * expected if only mmp_thread writes occur.
  *
  * If an mmp write was skipped or fails, and we have already waited longer than
  * mmp_delay, we need to update it so the next write reflects the longer delay.
  *
  * Do not set mmp_delay if the multihost property is not on, so as not to
  * trigger an activity check on import.
  */
 static void
 mmp_delay_update(spa_t *spa, boolean_t write_completed)
 {
 	mmp_thread_t *mts = &spa->spa_mmp;
 	hrtime_t delay = gethrtime() - mts->mmp_last_write;
 
 	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
 
 	/* With multihost off, a zero delay is all that is recorded. */
 	if (!spa_multihost(spa)) {
 		mts->mmp_delay = 0;
 		return;
 	}
 
 	/* A delay longer than the current value is taken as-is. */
 	if (delay > mts->mmp_delay)
 		mts->mmp_delay = delay;
 
 	/* Skipped or failed writes do not move mmp_last_write. */
 	if (!write_completed)
 		return;
 
 	mts->mmp_last_write = gethrtime();
 
 	/*
 	 * Only a strictly smaller delay is averaged in; if mmp_delay was
 	 * just raised to delay above, the two are equal and we are done.
 	 */
 	if (delay >= mts->mmp_delay)
 		return;
 
 	/* Never drop below the per-leaf share of the write interval. */
 	hrtime_t min_delay =
 	    MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
 	    MAX(1, vdev_count_leaves(spa));
 
 	/* Decaying average: the new sample carries a weight of 1/128. */
 	mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
 	    min_delay);
 }
 
 /*
  * Completion callback for an MMP uberblock write.  mmp_write_uberblock()
  * passes it to vdev_label_write() with the pool's mmp_thread_t as the
  * private argument.  Updates the delay estimate, marks the leaf as having no
  * write outstanding, drops the SCL_STATE hold, records the result in the MMP
  * history, and frees the write buffer.
  */
 static void
 mmp_write_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	mmp_thread_t *mts = zio->io_private;
 
 	mutex_enter(&mts->mmp_io_lock);
 	/* Capture the per-vdev values before they are cleared below. */
 	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
 	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
 
 	/* Only an error-free write counts as completed. */
 	mmp_delay_update(spa, (zio->io_error == 0));
 
 	/* Zero is what mmp_next_leaf() treats as "no write pending". */
 	vd->vdev_mmp_pending = 0;
 	vd->vdev_mmp_kstat_id = 0;
 
 	mutex_exit(&mts->mmp_io_lock);
 	/* Release the hold taken under mmp_tag when the write was issued. */
 	spa_config_exit(spa, SCL_STATE, mmp_tag);
 
 	spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
 	    mmp_write_duration);
 
 	/* Free the buffer that carried the uberblock copy. */
 	abd_free(zio->io_abd);
 }
 
 /*
  * When the uberblock on-disk is updated by a spa_sync,
  * creating a new "best" uberblock, update the one stored
  * in the mmp thread state, used for mmp writes.
  */
 void
 mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
 {
 	mmp_thread_t *mts = &spa->spa_mmp;
 
 	mutex_enter(&mts->mmp_io_lock);
 
 	/* Restart the sequence and adopt the caller's uberblock. */
 	mts->mmp_seq = 1;
 	mts->mmp_ub = *ub;
 	mts->mmp_ub.ub_timestamp = gethrestime_sec();
 
 	/* Account for this update as a completed write. */
 	mmp_delay_update(spa, B_TRUE);
 
 	mutex_exit(&mts->mmp_io_lock);
 }
 
 /*
  * Choose the next usable leaf vdev, and a random label and MMP block
  * within it, and write over that block with a copy of the last-synced
  * uberblock, whose timestamp has been updated to show the pool is in use.
  */
 static void
 mmp_write_uberblock(spa_t *spa)
 {
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 	mmp_thread_t *mmp = &spa->spa_mmp;
 	uberblock_t *ub;
 	vdev_t *vd = NULL;
 	int label, error;
 	uint64_t offset;
 
 	/* Time the config-lock acquisition and log it when it was slow. */
 	hrtime_t lock_acquire_time = gethrtime();
 	spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER);
 	lock_acquire_time = gethrtime() - lock_acquire_time;
 	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
 		zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
 		    "gethrtime %llu", spa_name(spa),
 		    (u_longlong_t)lock_acquire_time,
 		    (u_longlong_t)gethrtime());
 
 	mutex_enter(&mmp->mmp_io_lock);
 
 	error = mmp_next_leaf(spa);
 
 	/*
 	 * spa_mmp_history has two types of entries:
 	 * Issued MMP write: records time issued, error status, etc.
 	 * Skipped MMP write: an MMP write could not be issued because no
 	 * suitable leaf vdev was available.  See comment above struct
 	 * spa_mmp_history for details.
 	 */
 
 	if (error) {
 		mmp_delay_update(spa, B_FALSE);
 		if (mmp->mmp_skip_error == error) {
 			/* Same failure as last time: update that entry. */
 			spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
 		} else {
 			mmp->mmp_skip_error = error;
 			spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
 			    gethrestime_sec(), mmp->mmp_delay, NULL, 0,
 			    mmp->mmp_kstat_id++, error);
 			zfs_dbgmsg("MMP error choosing leaf pool '%s' "
 			    "gethrtime %llu fail_mask %#x", spa_name(spa),
 			    (u_longlong_t)gethrtime(), error);
 		}
 		/* No write was issued, so drop the lock and hold here. */
 		mutex_exit(&mmp->mmp_io_lock);
 		spa_config_exit(spa, SCL_STATE, mmp_tag);
 		return;
 	}
 
 	/* On success mmp_next_leaf() stored the chosen leaf here. */
 	vd = spa->spa_mmp.mmp_last_leaf;
 	if (mmp->mmp_skip_error != 0) {
 		mmp->mmp_skip_error = 0;
 		zfs_dbgmsg("MMP write after skipping due to unavailable "
 		    "leaves, pool '%s' gethrtime %llu leaf %llu",
 		    spa_name(spa), (u_longlong_t)gethrtime(),
 		    (u_longlong_t)vd->vdev_guid);
 	}
 
 	/* The root zio is created lazily, on the first write. */
 	if (mmp->mmp_zio_root == NULL)
 		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
 		    flags | ZIO_FLAG_GODFATHER);
 
 	if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
 		/*
 		 * Want to reset mmp_seq when timestamp advances because after
 		 * an mmp_seq wrap new values will not be chosen by
 		 * uberblock_compare() as the "best".
 		 */
 		mmp->mmp_ub.ub_timestamp = gethrestime_sec();
 		mmp->mmp_seq = 1;
 	}
 
 	ub = &mmp->mmp_ub;
 	ub->ub_mmp_magic = MMP_MAGIC;
 	ub->ub_mmp_delay = mmp->mmp_delay;
 	ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
 	    MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
 	    MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
 	    zfs_multihost_fail_intervals));
 	/* Nonzero marks this leaf as having a write outstanding. */
 	vd->vdev_mmp_pending = gethrtime();
 	vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
 
 	/* Copy the uberblock into the buffer and zero the remainder. */
 	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
 	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
 	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
 	abd_zero_off(ub_abd, sizeof (uberblock_t),
 	    VDEV_UBERBLOCK_SIZE(vd) - sizeof (uberblock_t));
 
 	mmp->mmp_seq++;
 	mmp->mmp_kstat_id++;
 	mutex_exit(&mmp->mmp_io_lock);
 
 	/* Pick one of the last MMP_BLOCKS_PER_LABEL uberblock slots. */
 	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
 	    MMP_BLOCKS_PER_LABEL + random_in_range(MMP_BLOCKS_PER_LABEL));
 
 	/* mmp_write_done() drops the SCL_STATE hold when this completes. */
 	label = random_in_range(VDEV_LABELS);
 	vdev_label_write(zio, vd, label, ub_abd, offset,
 	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
 	    flags | ZIO_FLAG_DONT_PROPAGATE);
 
 	(void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
 	    ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
 
 	zio_nowait(zio);
 }
 
 /*
  * Body of the per-pool MMP thread.  Until mmp_thread_exiting is set, each
  * pass re-reads the tunables and pool state, suspends the pool if writes
  * have not succeeded for too long, issues one MMP write when multihost is
  * on and the pool is not suspended, and then sleeps on mmp_thread_cv until
  * the next write is due.  On exit it waits for outstanding writes.
  */
 static __attribute__((noreturn)) void
 mmp_thread(void *arg)
 {
 	spa_t *spa = (spa_t *)arg;
 	mmp_thread_t *mmp = &spa->spa_mmp;
 	boolean_t suspended = spa_suspended(spa);
 	boolean_t multihost = spa_multihost(spa);
 	uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
 	    zfs_multihost_interval));
 	uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
 	    zfs_multihost_fail_intervals);
 	hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
 	boolean_t last_spa_suspended;
 	boolean_t last_spa_multihost;
 	uint64_t last_mmp_interval;
 	uint32_t last_mmp_fail_intervals;
 	hrtime_t last_mmp_fail_ns;
 	callb_cpr_t cpr;
 	int skip_wait = 0;
 
 	mmp_thread_enter(mmp, &cpr);
 
 	/*
 	 * There have been no MMP writes yet.  Setting mmp_last_write here gives
 	 * us one mmp_fail_ns period, which is consistent with the activity
 	 * check duration, to try to land an MMP write before MMP suspends the
 	 * pool (if so configured).
 	 */
 
 	mutex_enter(&mmp->mmp_io_lock);
 	mmp->mmp_last_write = gethrtime();
 	mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
 	mutex_exit(&mmp->mmp_io_lock);
 
 	while (!mmp->mmp_thread_exiting) {
 		hrtime_t next_time = gethrtime() +
 		    MSEC2NSEC(MMP_DEFAULT_INTERVAL);
 		int leaves = MAX(vdev_count_leaves(spa), 1);
 
 		/* Detect changes in tunables or state */
 
 		last_spa_suspended = suspended;
 		last_spa_multihost = multihost;
 		suspended = spa_suspended(spa);
 		multihost = spa_multihost(spa);
 
 		last_mmp_interval = mmp_interval;
 		last_mmp_fail_intervals = mmp_fail_intervals;
 		last_mmp_fail_ns = mmp_fail_ns;
 		mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
 		    zfs_multihost_interval));
 		mmp_fail_intervals = MMP_FAIL_INTVS_OK(
 		    zfs_multihost_fail_intervals);
 
 		/* Smooth so pool is not suspended when reducing tunables */
 		if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
 			mmp_fail_ns = (mmp_fail_ns * 31 +
 			    mmp_fail_intervals * mmp_interval) / 32;
 		} else {
 			mmp_fail_ns = mmp_fail_intervals *
 			    mmp_interval;
 		}
 
 		if (mmp_interval != last_mmp_interval ||
 		    mmp_fail_intervals != last_mmp_fail_intervals) {
 			/*
 			 * We want other hosts to see new tunables as quickly as
 			 * possible.  Write out at higher frequency than usual.
 			 */
 			skip_wait += leaves;
 		}
 
 		if (multihost)
 			next_time = gethrtime() + mmp_interval / leaves;
 
 		if (mmp_fail_ns != last_mmp_fail_ns) {
 			zfs_dbgmsg("MMP interval change pool '%s' "
 			    "gethrtime %llu last_mmp_interval %llu "
 			    "mmp_interval %llu last_mmp_fail_intervals %u "
 			    "mmp_fail_intervals %u mmp_fail_ns %llu "
 			    "skip_wait %d leaves %d next_time %llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)last_mmp_interval,
 			    (u_longlong_t)mmp_interval, last_mmp_fail_intervals,
 			    mmp_fail_intervals, (u_longlong_t)mmp_fail_ns,
 			    skip_wait, leaves, (u_longlong_t)next_time);
 		}
 
 		/*
 		 * MMP off => on, or suspended => !suspended:
 		 * No writes occurred recently.  Update mmp_last_write to give
 		 * us some time to try.
 		 */
 		if ((!last_spa_multihost && multihost) ||
 		    (last_spa_suspended && !suspended)) {
 			zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
 			    "last_spa_multihost %u multihost %u "
 			    "last_spa_suspended %u suspended %u",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    last_spa_multihost, multihost, last_spa_suspended,
 			    suspended);
 			mutex_enter(&mmp->mmp_io_lock);
 			mmp->mmp_last_write = gethrtime();
 			mmp->mmp_delay = mmp_interval;
 			mutex_exit(&mmp->mmp_io_lock);
 		}
 
 		/*
 		 * MMP on => off:
 		 * mmp_delay == 0 tells importing node to skip activity check.
 		 */
 		if (last_spa_multihost && !multihost) {
 			mutex_enter(&mmp->mmp_io_lock);
 			mmp->mmp_delay = 0;
 			mutex_exit(&mmp->mmp_io_lock);
 		}
 
 		/*
 		 * Suspend the pool if no MMP write has succeeded in over
 		 * mmp_interval * mmp_fail_intervals nanoseconds.
 		 */
 		if (multihost && !suspended && mmp_fail_intervals &&
 		    (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
 			zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
 			    "mmp_last_write %llu mmp_interval %llu "
 			    "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
 			    spa_name(spa), (u_longlong_t)gethrtime(),
 			    (u_longlong_t)mmp->mmp_last_write,
 			    (u_longlong_t)mmp_interval,
 			    (u_longlong_t)mmp_fail_intervals,
 			    (u_longlong_t)mmp_fail_ns,
 			    (u_longlong_t)spa->spa_uberblock.ub_txg);
 			/* Cast so both arguments match their %llu. */
 			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
 			    "succeeded in over %llu ms; suspending pool. "
 			    "Hrtime %llu",
 			    spa_name(spa),
 			    (u_longlong_t)NSEC2MSEC(gethrtime() -
 			    mmp->mmp_last_write),
 			    (u_longlong_t)gethrtime());
 			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
 		}
 
 		if (multihost && !suspended)
 			mmp_write_uberblock(spa);
 
 		/* While catching up after a tunable change, wake up sooner. */
 		if (skip_wait > 0) {
 			next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
 			    leaves;
 			skip_wait--;
 		}
 
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv,
 		    &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
 		    CALLOUT_FLAG_ABSOLUTE);
 		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
 	}
 
 	/* Outstanding writes are allowed to complete. */
 	zio_wait(mmp->mmp_zio_root);
 
 	mmp->mmp_zio_root = NULL;
 	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
 
 	thread_exit();
 }
 
 /*
  * Signal the MMP thread to wake it, when it is sleeping on
  * its cv.  Used when some module parameter has changed and
  * we want the thread to know about it.
  * Only signal if the pool is active and mmp thread is
  * running, otherwise there is no thread to wake.
  */
 static void
 mmp_signal_thread(spa_t *spa)
 {
 	mmp_thread_t *mts = &spa->spa_mmp;
 
 	mutex_enter(&mts->mmp_thread_lock);
 	/* Nothing to wake unless a thread has been created. */
 	if (mts->mmp_thread != NULL)
 		cv_broadcast(&mts->mmp_thread_cv);
 	mutex_exit(&mts->mmp_thread_lock);
 }
 
 void
 mmp_signal_all_threads(void)
 {
 	spa_t *spa = NULL;
 
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa))) {
 		if (spa->spa_state == POOL_STATE_ACTIVE)
 			mmp_signal_thread(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
-/* BEGIN CSTYLED */
 /* The interval is set through a custom setter, not a plain store. */
 ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval,
 	param_set_multihost_interval, spl_param_get_u64, ZMOD_RW,
 	"Milliseconds between mmp writes to each leaf");
-/* END CSTYLED */
 
 /* Plain read-write UINT tunables; see the comments at their definitions. */
 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW,
 	"Max allowed period without a successful mmp write");
 
 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW,
 	"Number of zfs_multihost_interval periods to wait for activity");
diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c
index 718bbb34a8d5..0dd7da1aa197 100644
--- a/module/zfs/refcount.c
+++ b/module/zfs/refcount.c
@@ -1,359 +1,357 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2021 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_refcount.h>
 
 #ifdef	ZFS_DEBUG
 /*
  * Reference count tracking is disabled by default.  Its memory requirements
  * are reasonable, however as implemented it consumes a significant amount of
  * cpu time.  Until its performance is improved it should be manually enabled.
  */
 /* Initial rc_tracked value given to refcounts by zfs_refcount_create(). */
 int reference_tracking_enable = B_FALSE;
 /*
  * Cap on the released holds that zfs_refcount_remove_many() keeps on a
  * refcount's rc_removed list; zero means released holds are freed at once.
  */
 static uint_t reference_history = 3; /* tunable */
 
 /* kmem cache from which all reference_t records are allocated. */
 static kmem_cache_t *reference_cache;
 
 /*
  * Create the kmem cache that backs reference_t allocations.
  */
 void
 zfs_refcount_init(void)
 {
 	reference_cache = kmem_cache_create("reference_cache",
 	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the cache created by zfs_refcount_init().
  */
 void
 zfs_refcount_fini(void)
 {
 	kmem_cache_destroy(reference_cache);
 }
 
 /*
  * AVL comparator for reference_t records: order by holder, then by number.
  * When those tie and the first argument is not a search key (ref_search),
  * fall back to comparing the record addresses so that distinct records
  * with the same holder and number do not compare equal.
  */
 static int
 zfs_refcount_compare(const void *x1, const void *x2)
 {
 	const reference_t *r1 = x1;
 	const reference_t *r2 = x2;
 	int cmp;
 
 	cmp = TREE_CMP(r1->ref_holder, r2->ref_holder);
 	if (cmp == 0)
 		cmp = TREE_CMP(r1->ref_number, r2->ref_number);
 
 	/* A search key matches any record with equal holder and number. */
 	if (cmp != 0 || r1->ref_search)
 		return (cmp);
 
 	return (TREE_PCMP(r1, r2));
 }
 
 /*
  * Initialize a refcount: zero counters, an empty tree of live holds, an
  * empty list of removed holds, and tracking set from the global default.
  */
 void
 zfs_refcount_create(zfs_refcount_t *rc)
 {
 	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Live holds are kept in an AVL tree ... */
 	avl_create(&rc->rc_tree, zfs_refcount_compare,
 	    sizeof (reference_t), offsetof(reference_t, ref_link.a));
 	/* ... and recently removed ones on a list. */
 	list_create(&rc->rc_removed,
 	    sizeof (reference_t), offsetof(reference_t, ref_link.l));
 
 	rc->rc_removed_count = 0;
 	rc->rc_count = 0;
 	rc->rc_tracked = reference_tracking_enable;
 }
 
 /*
  * Like zfs_refcount_create(), but tracking is forced on regardless of
  * reference_tracking_enable.
  */
 void
 zfs_refcount_create_tracked(zfs_refcount_t *rc)
 {
 	zfs_refcount_create(rc);
 	rc->rc_tracked = B_TRUE;
 }
 
 /*
  * Like zfs_refcount_create(), but tracking is forced off regardless of
  * reference_tracking_enable.
  */
 void
 zfs_refcount_create_untracked(zfs_refcount_t *rc)
 {
 	zfs_refcount_create(rc);
 	rc->rc_tracked = B_FALSE;
 }
 
 /*
  * Destroy a refcount whose count is expected to equal 'number', freeing
  * every live and removed reference record back to reference_cache.
  */
 void
 zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
 {
 	reference_t *ref;
 	void *cookie = NULL;
 
 	ASSERT3U(rc->rc_count, ==, number);
 
 	/* Drain the tree of live holds, then destroy it. */
 	for (;;) {
 		ref = avl_destroy_nodes(&rc->rc_tree, &cookie);
 		if (ref == NULL)
 			break;
 		kmem_cache_free(reference_cache, ref);
 	}
 	avl_destroy(&rc->rc_tree);
 
 	/* Drain the list of removed holds, then destroy it. */
 	for (ref = list_remove_head(&rc->rc_removed); ref != NULL;
 	    ref = list_remove_head(&rc->rc_removed))
 		kmem_cache_free(reference_cache, ref);
 	list_destroy(&rc->rc_removed);
 
 	mutex_destroy(&rc->rc_mtx);
 }
 
 /*
  * Destroy a refcount that is expected to have no holds left.
  */
 void
 zfs_refcount_destroy(zfs_refcount_t *rc)
 {
 	zfs_refcount_destroy_many(rc, 0);
 }
 
 /*
  * Return nonzero when the current count is zero.
  */
 int
 zfs_refcount_is_zero(zfs_refcount_t *rc)
 {
 	return (zfs_refcount_count(rc) == 0);
 }
 
 /*
  * Return the current count, read with an atomic 64-bit load (rc_mtx is
  * not taken).
  */
 int64_t
 zfs_refcount_count(zfs_refcount_t *rc)
 {
 	return (atomic_load_64(&rc->rc_count));
 }
 
 /*
  * Add 'number' holds on behalf of 'holder' and return the new count.
  * Untracked refcounts only bump the counter atomically; tracked ones also
  * record a reference_t in the tree under rc_mtx.
  */
 int64_t
 zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
 	int64_t count;
 
 	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, number);
 		ASSERT3U(count, >=, number);
 		return (count);
 	}
 
 	/* Build the record before taking the lock. */
 	reference_t *ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
 	ref->ref_search = B_FALSE;
 	ref->ref_number = number;
 	ref->ref_holder = holder;
 
 	mutex_enter(&rc->rc_mtx);
 	avl_add(&rc->rc_tree, ref);
 	count = (rc->rc_count += number);
 	mutex_exit(&rc->rc_mtx);
 
 	return (count);
 }
 
 /*
  * Add a single hold for 'holder' and return the new count.
  */
 int64_t
 zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
 {
 	return (zfs_refcount_add_many(rc, 1, holder));
 }
 
 /*
  * Add 'number' holds for 'holder'.  An untracked refcount takes them in a
  * single step; a tracked one takes them one at a time, so that each hold
  * gets its own record.
  */
 void
 zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
 	if (likely(!rc->rc_tracked)) {
 		(void) zfs_refcount_add_many(rc, number, holder);
 		return;
 	}
 
 	while (number > 0) {
 		(void) zfs_refcount_add(rc, holder);
 		number--;
 	}
 }
 
 /*
  * Drop 'number' holds held by 'holder' and return the new count.
  * Untracked refcounts only decrement the counter atomically.  Tracked ones
  * look up the matching record (panicking if there is none), remove it from
  * the tree, and either free it or keep it on the bounded rc_removed list.
  */
 int64_t
 zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
     const void *holder)
 {
 	reference_t *ref, s;
 	int64_t count;
 
 	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, -number);
 		ASSERT3S(count, >=, 0);
 		return (count);
 	}
 
 	/* Search key: matches any record with this holder and number. */
 	s.ref_search = B_TRUE;
 	s.ref_number = number;
 	s.ref_holder = holder;
 
 	mutex_enter(&rc->rc_mtx);
 	ASSERT3U(rc->rc_count, >=, number);
 	ref = avl_find(&rc->rc_tree, &s, NULL);
 	if (unlikely(ref == NULL)) {
 		panic("No such hold %p on refcount %llx", holder,
 		    (u_longlong_t)(uintptr_t)rc);
 		return (-1);
 	}
 	avl_remove(&rc->rc_tree, ref);
 
 	if (reference_history == 0) {
 		/* No history is kept: release the record right away. */
 		kmem_cache_free(reference_cache, ref);
 	} else {
 		list_insert_head(&rc->rc_removed, ref);
 		if (rc->rc_removed_count < reference_history) {
 			rc->rc_removed_count++;
 		} else {
 			/* At the cap: evict the entry at the tail. */
 			ref = list_remove_tail(&rc->rc_removed);
 			kmem_cache_free(reference_cache, ref);
 		}
 	}
 
 	count = (rc->rc_count -= number);
 	mutex_exit(&rc->rc_mtx);
 	return (count);
 }
 
 /*
  * Drop a single reference held by "holder"; returns the remaining count.
  * Thin wrapper around zfs_refcount_remove_many() with number == 1.
  */
 int64_t
 zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
 {
 	const uint64_t one = 1;
 
 	return (zfs_refcount_remove_many(rc, one, holder));
 }
 
 /*
  * Drop "number" references for "holder", discarding the resulting count.
  *
  * Mirrors zfs_refcount_add_few(): one bulk removal when untracked, and
  * "number" single-reference removals when tracked.
  */
 void
 zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
 	if (likely(!rc->rc_tracked)) {
 		(void) zfs_refcount_remove_many(rc, number, holder);
 		return;
 	}
 
 	while (number > 0) {
 		(void) zfs_refcount_remove(rc, holder);
 		number--;
 	}
 }
 
 /*
  * Move everything accounted in "src" into "dst": the reference count, the
  * tree of tracking records and the removed-reference history list.  On
  * return "src" has a count of zero and empty tracking structures.
  *
  * The contents are staged in local containers so that src->rc_mtx is
  * released before dst->rc_mtx is taken; the two locks are never held
  * together.
  */
 void
 zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
 {
 	avl_tree_t tree;
 	list_t removed;
 	reference_t *ref;
 	void *cookie = NULL;
 	uint64_t count;
 	uint_t removed_count;
 
 	/* Temporary containers with the same layout as rc_tree/rc_removed. */
 	avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
 	    offsetof(reference_t, ref_link.a));
 	list_create(&removed, sizeof (reference_t),
 	    offsetof(reference_t, ref_link.l));
 
 	/* Step 1: strip src under its own lock. */
 	mutex_enter(&src->rc_mtx);
 	count = src->rc_count;
 	removed_count = src->rc_removed_count;
 	src->rc_count = 0;
 	src->rc_removed_count = 0;
 	avl_swap(&tree, &src->rc_tree);
 	list_move_tail(&removed, &src->rc_removed);
 	mutex_exit(&src->rc_mtx);
 
 	/* Step 2: merge the staged state into dst under dst's lock. */
 	mutex_enter(&dst->rc_mtx);
 	dst->rc_count += count;
 	dst->rc_removed_count += removed_count;
 	/*
 	 * An empty destination tree can take the staged tree wholesale;
 	 * otherwise the staged nodes are re-inserted one by one.
 	 */
 	if (avl_is_empty(&dst->rc_tree))
 		avl_swap(&dst->rc_tree, &tree);
 	else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
 		avl_add(&dst->rc_tree, ref);
 	list_move_tail(&dst->rc_removed, &removed);
 	mutex_exit(&dst->rc_mtx);
 
 	avl_destroy(&tree);
 	list_destroy(&removed);
 }
 
 /*
  * Re-label an existing tracked hold of "number" references so that it is
  * attributed to new_holder instead of current_holder.  The count itself
  * does not change.  Nothing is done when the refcount is untracked, since
  * no holder records exist in that case.
  */
 void
 zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
     const void *current_holder, const void *new_holder)
 {
 	reference_t key, *ref;
 
 	if (likely(!rc->rc_tracked))
 		return;
 
 	/* Search key matching the hold as it is currently recorded. */
 	key.ref_search = B_TRUE;
 	key.ref_number = number;
 	key.ref_holder = current_holder;
 
 	mutex_enter(&rc->rc_mtx);
 	ref = avl_find(&rc->rc_tree, &key, NULL);
 	ASSERT(ref);
 	ref->ref_holder = new_holder;
 	/* The holder changed, so let the tree update the node's position. */
 	avl_update(&rc->rc_tree, ref);
 	mutex_exit(&rc->rc_mtx);
 }
 
 /*
  * Re-label a single tracked reference from current_holder to new_holder.
  * Convenience wrapper around zfs_refcount_transfer_ownership_many() with
  * number == 1.
  */
 void
 zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
     const void *new_holder)
 {
 	/*
 	 * Plain call rather than "return (expr)": ISO C does not allow a
 	 * return statement with an expression in a function returning void.
 	 */
 	zfs_refcount_transfer_ownership_many(rc, 1, current_holder,
 	    new_holder);
 }
 
 /*
  * With tracking enabled, return true when some recorded reference carries
  * the "holder" tag.  With tracking disabled there are no tags to inspect,
  * so return true whenever the count is non-zero (a reference might be
  * held).
  */
 boolean_t
 zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
 {
 	reference_t key, *hit;
 	avl_index_t where;
 	boolean_t found;
 
 	if (likely(!rc->rc_tracked))
 		return (zfs_refcount_count(rc) > 0);
 
 	/*
 	 * Search with ref_number 0; when there is no exact match, the
 	 * neighbour just after the insertion point is examined instead.
 	 * NOTE(review): this relies on zfs_refcount_compare() ordering by
 	 * holder first -- confirm against the comparator.
 	 */
 	key.ref_search = B_TRUE;
 	key.ref_number = 0;
 	key.ref_holder = holder;
 
 	mutex_enter(&rc->rc_mtx);
 	hit = avl_find(&rc->rc_tree, &key, &where);
 	if (likely(hit == NULL))
 		hit = avl_nearest(&rc->rc_tree, where, AVL_AFTER);
 	found = (hit != NULL && hit->ref_holder == holder);
 	mutex_exit(&rc->rc_mtx);
 
 	return (found);
 }
 
 /*
  * With tracking enabled, return true when no recorded reference carries
  * the "holder" tag.  With tracking disabled always return true, because
  * it cannot be shown that the holder has a reference.
  */
 boolean_t
 zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
 {
 	reference_t key, *hit;
 	avl_index_t where;
 	boolean_t absent;
 
 	if (likely(!rc->rc_tracked))
 		return (B_TRUE);
 
 	/* Same lookup as zfs_refcount_held(), with the answer inverted. */
 	key.ref_search = B_TRUE;
 	key.ref_number = 0;
 	key.ref_holder = holder;
 
 	mutex_enter(&rc->rc_mtx);
 	hit = avl_find(&rc->rc_tree, &key, &where);
 	if (likely(hit == NULL))
 		hit = avl_nearest(&rc->rc_tree, where, AVL_AFTER);
 	absent = !(hit != NULL && hit->ref_holder == holder);
 	mutex_exit(&rc->rc_mtx);
 
 	return (absent);
 }
 
 /*
  * Refcount entry points published with EXPORT_SYMBOL.  Only the functions
  * listed here are exported; the *_many, *_few and transfer variants
  * defined above do not appear in this list.
  */
 EXPORT_SYMBOL(zfs_refcount_create);
 EXPORT_SYMBOL(zfs_refcount_destroy);
 EXPORT_SYMBOL(zfs_refcount_is_zero);
 EXPORT_SYMBOL(zfs_refcount_count);
 EXPORT_SYMBOL(zfs_refcount_add);
 EXPORT_SYMBOL(zfs_refcount_remove);
 EXPORT_SYMBOL(zfs_refcount_held);
 
-/* BEGIN CSTYLED */
 /*
  * Tunables for reference tracking, both registered read-write (ZMOD_RW).
  * reference_history is the cap applied to the rc_removed history list in
  * zfs_refcount_remove_many().
  */
 ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW,
 	"Track reference holders to refcount_t objects");
 
 ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW,
 	"Maximum reference holders being tracked");
-/* END CSTYLED */
 #endif	/* ZFS_DEBUG */
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 5a616adb41a2..b83c982c13fd 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1,11072 +1,11066 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright 2018 Joyent, Inc.
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 /*
  * SPA: Storage Pool Allocator
  *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_removal.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_draid.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/mmp.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/systeminfo.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_destroy.h>
 #include <sys/zvol.h>
 
 #ifdef	_KERNEL
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
 #include <sys/callb.h>
 #include <sys/zone.h>
 #include <sys/vmsystm.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 #include <cityhash.h>
 
 /*
  * spa_thread() existed on Illumos as a parent thread for the various worker
  * threads that actually run the pool, as a way to both reference the entire
  * pool work as a single object, and to share properties like scheduling
  * options. It has not yet been adapted to Linux or FreeBSD. This define is
  * used to mark related parts of the code to make things easier for the reader,
  * and to compile this code out. It can be removed when someone implements it,
  * moves it to some Illumos-specific place, or removes it entirely.
  */
 #undef HAVE_SPA_THREAD
 
 /*
  * The "System Duty Cycle" scheduling class is an Illumos feature to help
  * prevent CPU-intensive kernel threads from affecting latency on interactive
  * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
  * gated behind a define. On Illumos SDC depends on spa_thread(), but
  * spa_thread() also has other uses, so this is a separate define.
  */
 #undef HAVE_SYSDC
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
  */
 int zfs_ccw_retry_interval = 300;
 
 /*
  * How a zio taskq is created and sized.  Stored in
  * zio_taskq_info_t.zti_mode and normally set through the ZTI_* initializer
  * macros.  ZTI_NMODES is not a mode; it is the count of modes.
  */
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
 	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
 	ZTI_MODE_SYNC,			/* sync thread assigned */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 /*
  * Brace initializers for zio_taskq_info_t, in field order
  * { zti_mode, zti_value, zti_count }.
  *
  * NOTE(review): ZTI_PCT expands to ZTI_MODE_ONLINE_PERCENT, which is not a
  * member of zti_modes_t as declared in this file; the macro would not
  * compile if it were used.
  */
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
 #define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
 #define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 /* Shorthands: n threads in a single taskq, and one thread in one taskq. */
 #define	ZTI_N(n)	ZTI_P(n, 1)
 #define	ZTI_ONE		ZTI_N(1)
 
 /*
  * One cell of the zio_taskqs[][] table: how to build the taskq(s) for a
  * given (zio type, taskq type) pair.
  */
 typedef struct zio_taskq_info {
 	zti_modes_t zti_mode;	/* creation/sizing mode */
 	uint_t zti_value;	/* for ZTI_MODE_FIXED: threads per taskq */
 	uint_t zti_count;	/* number of taskqs */
 } zio_taskq_info_t;
 
 /*
  * Short name for each taskq type, indexed by taskq type.  The order
  * presumably matches the ISSUE, ISSUE_HIGH, INTR, INTR_HIGH columns of
  * zio_taskqs[][] -- confirm against the ZIO_TASKQ_* enum.
  */
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"iss", "iss_h", "int", "int_h"
 };
 
 /*
  * This table defines the taskq settings for each ZFS I/O type. When
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_SCALE
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
  * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
  * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for high priority I/Os that
  * need to be handled with minimum delay.  Illumos taskq has unfair TQ_FRONT
  * implementation, so separate high priority threads are used there.
  */
 /*
  * One row per zio type (named in the trailing comment of each row) and one
  * column per taskq type.  The WRITE row is the only one that differs
  * between illumos and other platforms.
  */
 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
 #ifdef illumos
 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
 #else
 	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* WRITE */
 #endif
 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
 };
 
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, spa_import_type_t type,
     const char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 /*
  * Percentage of all CPUs that can be used by the metaslab preload taskq.
  */
 static uint_t metaslab_preload_pct = 50;
 
 static uint_t	zio_taskq_batch_pct = 80;	  /* 1 thread per cpu in pset */
 static uint_t	zio_taskq_batch_tpq;		  /* threads per taskq */
 
 #ifdef HAVE_SYSDC
 static const boolean_t	zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
 static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
 #endif
 
 #ifdef HAVE_SPA_THREAD
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
 static uint_t	zio_taskq_write_tpq = 16;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
  * This is used by zdb to analyze non-idle pools.
  */
 boolean_t	spa_load_verify_dryrun = B_FALSE;
 
 /*
  * Allow reading spacemaps when the pool is imported read-only
  * (spa_mode == SPA_MODE_READ). This is used by zdb for spacemap verification.
  */
 boolean_t	spa_mode_readable_spacemaps = B_FALSE;
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * For debugging purposes: print out vdev tree during pool import.
  */
 static int		spa_load_print_vdev_tree = B_FALSE;
 
 /*
  * A non-zero value for zfs_max_missing_tvds means that we allow importing
  * pools with missing top-level vdevs. This is strictly intended for advanced
  * pool recovery cases since missing data is almost inevitable. Pools with
  * missing devices can only be imported read-only for safety reasons, and their
  * fail-mode will be automatically set to "continue".
  *
  * With 1 missing vdev we should be able to import the pool and mount all
  * datasets. User data that was not modified after the missing device has been
  * added should be recoverable. This means that snapshots created prior to the
  * addition of that device should be completely intact.
  *
  * With 2 missing vdevs, some datasets may fail to mount since there are
  * dataset statistics that are stored as regular metadata. Some data might be
  * recoverable if those vdevs were added recently.
  *
  * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
  * may be missing entirely. Chances of data recovery are very low. Note that
  * there are also risks of performing an inadvertent rewind as we might be
  * missing all the vdevs with the latest uberblocks.
  */
 uint64_t	zfs_max_missing_tvds = 0;
 
 /*
  * The parameters below are similar to zfs_max_missing_tvds but are only
  * intended for a preliminary open of the pool with an untrusted config which
  * might be incomplete or out-dated.
  *
  * We are more tolerant for pools opened from a cachefile since we could have
  * an out-dated cachefile where a device removal was not registered.
  * We could have set the limit arbitrarily high but in the case where devices
  * are really missing we would want to return the proper error codes; we chose
  * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
  * and we get a chance to retrieve the trusted config.
  */
 uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
 
 /*
  * In the case where config was assembled by scanning device paths (/dev/dsk
  * by default) we are less tolerant since all the existing devices should have
  * been detected and we want spa_load to return the right error codes.
  */
 uint64_t	zfs_max_missing_tvds_scan = 0;
 
 /*
  * Debugging aid that pauses spa_sync() towards the end.
  */
 static const boolean_t	zfs_pause_spa_sync = B_FALSE;
 
 /*
  * Variables to indicate the livelist condense zthr func should wait at certain
  * points for the livelist to be removed - used to test condense/destroy races
  */
 static int zfs_livelist_condense_zthr_pause = 0;
 static int zfs_livelist_condense_sync_pause = 0;
 
 /*
  * Variables to track whether or not condense cancellation has been
  * triggered in testing.
  */
 static int zfs_livelist_condense_sync_cancel = 0;
 static int zfs_livelist_condense_zthr_cancel = 0;
 
 /*
  * Variable to track whether or not extra ALLOC blkptrs were added to a
  * livelist entry while it was being condensed (caused by the way we track
  * remapped blkptrs in dbuf_remap_impl)
  */
 static int zfs_livelist_condense_new_alloc = 0;
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Record one pool property in "nvl".
  *
  * The property is stored under its canonical name as a nested nvlist with
  * two entries: ZPROP_SOURCE (src) and ZPROP_VALUE.  The value is strval
  * when that is non-NULL, otherwise the integer intval.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	nvlist_t *entry = fnvlist_alloc();
 
 	fnvlist_add_uint64(entry, ZPROP_SOURCE, src);
 	if (strval == NULL)
 		fnvlist_add_uint64(entry, ZPROP_VALUE, intval);
 	else
 		fnvlist_add_string(entry, ZPROP_VALUE, strval);
 
 	fnvlist_add_nvlist(nvl, zpool_prop_to_name(prop), entry);
 
 	/* The local list is no longer needed once it has been added. */
 	nvlist_free(entry);
 }
 
 /*
  * Look up the single named property "propname" and append it to outnvl.
  * Only ZPOOL_PROP_DEDUPCACHED is supported; any other name yields EINVAL.
  * Returns 0 on success or an errno value.
  *
  * NB: property lookups made through this API do not all need the spa
  * props lock, so a lookup that does need it must take the lock itself.
  */
 static int
 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl)
 {
 	const zpool_prop_t prop = zpool_name_to_prop(propname);
 	uint64_t intval;
 	int err;
 
 	if (prop != ZPOOL_PROP_DEDUPCACHED)
 		return (SET_ERROR(EINVAL));
 
 	err = ddt_get_pool_dedup_cached(spa, &intval);
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	spa_prop_add_list(outnvl, prop, NULL, intval, ZPROP_SRC_NONE);
 
 	return (0);
 }
 
 /*
  * Fetch each of the n_props property names in "props" into outnvl via
  * spa_prop_add(), stopping at the first failure.  A NULL "props" array is
  * treated as an empty request.  Returns 0 or the first error encountered.
  */
 int
 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
     nvlist_t *outnvl)
 {
 	if (props == NULL)
 		return (0);
 
 	for (unsigned int i = 0; i < n_props; i++) {
 		int err = spa_prop_add(spa, props[i], outnvl);
 
 		if (err != 0)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Add a user property (source=src, propname=propval) to an nvlist.
  *
  * Same nested layout as spa_prop_add_list() -- ZPROP_SOURCE plus a string
  * ZPROP_VALUE -- but keyed by the caller-supplied name.  Every nvlist call
  * is wrapped in VERIFY, so a failure here is fatal rather than returned.
  */
 static void
 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
     zprop_source_t src)
 {
 	nvlist_t *propval;
 
 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 	VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 	/* The temporary list is released once it has been added to nvl. */
 	nvlist_free(propval);
 }
 
 /*
  * Get property values from the spa configuration.
  *
  * Adds to "nv" the pool properties that are computed from in-core state:
  * space accounting, health, version, guid, comment, altroot, block and
  * dnode size limits, and the cachefile setting.  The caller must hold
  * spa_props_lock (asserted below).
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t *nv)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size, alloc, cap, version;
 	const zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	/* Everything in this section is only reported with a root vdev. */
 	if (rvd != NULL) {
 		/*
 		 * Allocated and total space are summed over the normal,
 		 * special, dedup and embedded-log metaslab classes.
 		 */
 		alloc = metaslab_class_get_alloc(mc);
 		alloc += metaslab_class_get_alloc(spa_special_class(spa));
 		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
 		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
 
 		size = metaslab_class_get_space(mc);
 		size += metaslab_class_get_space(spa_special_class(spa));
 		size += metaslab_class_get_space(spa_dedup_class(spa));
 		size += metaslab_class_get_space(spa_embedded_log_class(spa));
 
 		spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL,
 		    spa->spa_checkpoint_info.sci_dspace, src);
 
 		/* Fragmentation and expandsz use the normal class only. */
 		spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(mc), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(mc), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == SPA_MODE_READ), src);
 
 		/* Whole-percent capacity; a zero size avoids the division. */
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
 		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL,
 		    brt_get_used(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL,
 		    brt_get_saved(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL,
 		    brt_get_ratio(spa), src);
 
 		spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
 		    ddt_get_ddt_dsize(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL,
 		    spa_get_last_scrubbed_txg(spa), src);
 
 		/*
 		 * The version is reported with source DEFAULT only when it
 		 * equals the property's default value, otherwise LOCAL.
 		 */
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
 			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_DEFAULT);
 		} else {
 			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
 			    version, ZPROP_SRC_LOCAL);
 		}
 		spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID,
 		    NULL, spa_load_guid(spa), src);
 	}
 
 	if (pool != NULL) {
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
 		 * when opening pools older than that version, freedir will
 		 * be NULL.
 		 */
 		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL,
 			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
 			    src);
 		} else {
 			spa_prop_add_list(nv, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 
 		/* Likewise, a missing leak directory is reported as 0. */
 		if (pool->dp_leak_dir != NULL) {
 			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL,
 			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
 			    src);
 		} else {
 			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED,
 			    NULL, 0, src);
 		}
 	}
 
 	spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	/* Optional string properties are added only when they are set. */
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_compatibility != NULL) {
 		spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY,
 		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	/* Size limits depend on whether the matching feature is enabled. */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 	} else {
 		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 	}
 
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
 		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
 		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
 	} else {
 		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
 		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
 	}
 
 	/*
 	 * Cachefile is taken from the head of spa_config_list: a NULL path
 	 * is reported as "none", and a path equal to the default
 	 * spa_config_path is not reported at all.
 	 */
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
 }
 
 /*
  * Get zpool property values.
  *
  * Fills "nv" first with the config-derived properties (see
  * spa_prop_get_config()) and then with every entry of the MOS pool
  * property object that is either a known pool property or a user
  * property.  Runs with the dsl pool config lock and spa_props_lock held.
  *
  * Returns 0 on success.  ENOENT is mapped to 0; any other error left in
  * "err" when the walk ends is returned.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t *nv)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 	dsl_pool_t *dp;
 	int err = 0;
 
 	dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(dp, FTAG);
 	za = zap_attribute_alloc();
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nv);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0)
 		goto out;
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		/* Skip names that are neither pool nor user properties. */
 		if ((prop = zpool_name_to_prop(za->za_name)) ==
 		    ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name))
 			continue;
 
 		/*
 		 * NOTE: a "break" inside this switch leaves only the switch.
 		 * The loop then moves on to the next attribute and "err" is
 		 * overwritten by the next zap_cursor_retrieve(), so a
 		 * per-property failure skips that property rather than
 		 * aborting the walk.
 		 */
 		switch (za->za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za->za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_dataset_t *ds = NULL;
 
 				/*
 				 * bootfs is stored as a dataset object
 				 * number but reported as the dataset name.
 				 */
 				err = dsl_dataset_hold_obj(dp,
 				    za->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
 
 				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 			} else {
 				strval = NULL;
 				intval = za->za_first_integer;
 			}
 
 			spa_prop_add_list(nv, prop, strval, intval, src);
 
 			if (strval != NULL)
 				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za->za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za->za_name, 1, za->za_num_integers, strval);
 			if (err) {
 				kmem_free(strval, za->za_num_integers);
 				break;
 			}
 			if (prop != ZPOOL_PROP_INVAL) {
 				spa_prop_add_list(nv, prop, strval, 0, src);
 			} else {
 				/* User properties are always source LOCAL. */
 				src = ZPROP_SRC_LOCAL;
 				spa_prop_add_user(nv, za->za_name, strval,
 				    src);
 			}
 			kmem_free(strval, za->za_num_integers);
 			break;
 
 		default:
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 out:
 	/* Reached directly (cursor never initialized) when there is no MOS. */
 	mutex_exit(&spa->spa_props_lock);
 	dsl_pool_config_exit(dp, FTAG);
 	zap_attribute_free(za);
 
 	/* Presumably ENOENT is the normal end-of-walk result -- confirm. */
 	if (err && err != ENOENT)
 		return (err);
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  *
  * Every pair in "props" is checked in order and the walk stops at the first
  * failure.  Besides validation, the function has these effects:
  *   - MULTIHOST: a non-zero host id is stored in spa->spa_hostid.
  *   - FAILUREMODE on a suspended pool: spa->spa_failmode is updated in
  *     core and EIO is returned (see the comment in that case).
  *   - BOOTFS: on success the string entry is replaced in "props" by the
  *     dataset's object number.
  *   - Any ZPOOL_PROP_DEDUPDITTO entry is removed from "props".
  *
  * Returns 0 when all properties are acceptable, otherwise an errno value.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		const char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPOOL_PROP_INVAL:
 			/*
 			 * Sanitize the input.  A name that is not a native
 			 * pool property must be either a user property or a
 			 * feature@ property; anything else is rejected.
 			 */
 			if (zfs_prop_user(propname)) {
 				if (strlen(propname) >= ZAP_MAXNAMELEN) {
 					error = SET_ERROR(ENAMETOOLONG);
 					break;
 				}
 
 				if (strlen(fnvpair_value_string(elem)) >=
 				    ZAP_MAXVALUELEN) {
 					error = SET_ERROR(E2BIG);
 					break;
 				}
 			} else if (zpool_prop_feature(propname)) {
 				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				if (nvpair_value_uint64(elem, &intval) != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				/* Only the value 0 is accepted for a feature. */
 				if (intval != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				/* The feature name follows the '@'. */
 				fname = strchr(propname, '@') + 1;
 				if (zfeature_lookup_name(fname, NULL) != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				has_feature = B_TRUE;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			/*
 			 * The version may not go backwards, may not exceed
 			 * the last pre-feature-flags version, and may not
 			 * follow a feature property already seen in this
 			 * list.
 			 */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
 			error = nvpair_value_uint64(elem, &intval);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 		case ZPOOL_PROP_AUTOTRIM:
 			/* Boolean properties: only 0 and 1 are valid. */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_MULTIHOST:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 
 			/* A zero host id is refused with ENOTSUP. */
 			if (!error) {
 				uint32_t hostid = zone_get_hostid(NULL);
 				if (hostid)
 					spa->spa_hostid = hostid;
 				else
 					error = SET_ERROR(ENOTSUP);
 			}
 
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 
 				/* Empty name: reset to the default. */
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				error = dmu_objset_hold(strval, FTAG, &os);
 				if (error != 0)
 					break;
 
 				/* Must be ZPL. */
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = SET_ERROR(EIO);
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			/* "" and "none" are accepted as they are. */
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			/* Anything else must be an absolute path ... */
 			if (strval[0] != '/') {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			/* ... whose last component is a real file name. */
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			for (check = strval; *check != '\0'; check++) {
 				/*
 				 * Cast before calling isprint(): passing a
 				 * negative plain char to a <ctype.h>
 				 * function is undefined behaviour.
 				 */
 				if (!isprint((unsigned char)*check)) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = SET_ERROR(E2BIG);
 			break;
 
 		default:
 			break;
 		}
 
 		if (error)
 			break;
 	}
 
 	/* dedupditto is dropped from the list, whatever the outcome. */
 	(void) nvlist_remove_all(props,
 	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
 
 	/* Swap the bootfs dataset name for its object number. */
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Apply a cachefile property, if "nvp" contains one, by pushing a new
  * entry onto the head of spa->spa_config_list:
  *   ""      -> the default spa_config_path
  *   "none"  -> a NULL path
  *   other   -> a copy of the given path
  * When need_sync is set, an asynchronous config update is requested too.
  * Does nothing if the property is absent from "nvp".
  */
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	const char *propname = zpool_prop_to_name(ZPOOL_PROP_CACHEFILE);
 	const char *cachefile;
 	spa_config_dirent_t *dp;
 
 	if (nvlist_lookup_string(nvp, propname, &cachefile) != 0)
 		return;
 
 	dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 
 	if (strcmp(cachefile, "none") == 0)
 		dp->scd_path = NULL;
 	else if (cachefile[0] != '\0')
 		dp->scd_path = spa_strdup(cachefile);
 	else
 		dp->scd_path = spa_strdup(spa_config_path);
 
 	list_insert_head(&spa->spa_config_list, dp);
 
 	if (need_sync)
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 /*
  * Set the pool properties listed in "nvp".
  *
  * The list is validated first (which may also rewrite it, see
  * spa_prop_validate()).  It is then scanned to decide what work is needed:
  * version changes, including the implicit bump to SPA_VERSION_FEATURES for
  * feature properties, are applied immediately through their own sync task,
  * and a single spa_sync_props sync task is issued afterwards if any
  * property needs it.  Returns 0 or an errno value.
  */
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	boolean_t need_sync = B_FALSE;
 	nvpair_t *elem;
 	int error;
 
 	error = spa_prop_validate(spa, nvp);
 	if (error != 0)
 		return (error);
 
 	for (elem = nvlist_next_nvpair(nvp, NULL); elem != NULL;
 	    elem = nvlist_next_nvpair(nvp, elem)) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 		uint64_t ver = 0;
 
 		/* These three never trigger a sync task from this loop. */
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		/*
 		 * Any other native property means the props sync task is
 		 * needed, so there is no point in scanning further.
 		 */
 		if (prop != ZPOOL_PROP_VERSION && prop != ZPOOL_PROP_INVAL) {
 			need_sync = B_TRUE;
 			break;
 		}
 
 		/* The same holds for user properties. */
 		if (prop == ZPOOL_PROP_INVAL &&
 		    zfs_prop_user(nvpair_name(elem))) {
 			need_sync = B_TRUE;
 			break;
 		}
 
 		/* What remains is the version or a feature property. */
 		if (prop == ZPOOL_PROP_VERSION) {
 			VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 		} else {
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 			ver = SPA_VERSION_FEATURES;
 			need_sync = B_TRUE;
 		}
 
 		/* Save time if the version is already set. */
 		if (ver == spa_version(spa))
 			continue;
 
 		/*
 		 * In addition to the pool directory object, we might
 		 * create the pool properties object, the features for
 		 * read object, the features for write object, or the
 		 * feature descriptions object.
 		 */
 		error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version,
 		    &ver, 6, ZFS_SPACE_CHECK_RESERVED);
 		if (error)
 			return (error);
 	}
 
 	if (!need_sync)
 		return (0);
 
 	return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6,
 	    ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Clear the pool's bootfs property, but only if it currently names dsobj
  * and the pool properties object exists.  The ZAP entry is removed in tx and
  * the cached spa_bootfs value is reset to 0.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
 		return;
 
 	VERIFY(zap_remove(spa->spa_meta_objset,
 	    spa->spa_pool_props_object,
 	    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 	spa->spa_bootfs = 0;
 }
 
 /*
  * Check half of the guid-change sync task.  Fails while the pool checkpoint
  * feature is active (distinguishing an existing checkpoint from one being
  * discarded) and when the root vdev is not healthy.  arg points at the
  * proposed guid, which is only inspected by an assertion.
  */
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
 	uint64_t *newguid __maybe_unused = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t root_state;
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		if (spa_has_checkpoint(spa))
 			return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
 		return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
 	}
 
 	/* Sample the root vdev state under the SCL_STATE config lock. */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root_state = root->vdev_state;
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (root_state != VDEV_STATE_HEALTHY)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3U(spa_guid(spa), !=, *newguid);
 
 	return (0);
 }
 
 /*
  * Sync half of the guid-change sync task.  Installs the guid pointed to by
  * arg on the root vdev, adjusts the root's guid sum by the difference, marks
  * the root vdev's config dirty and logs the change to the pool history.
  */
 static void
 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	const uint64_t new_guid = *(uint64_t *)arg;
 	const uint64_t prev_guid = spa_guid(spa);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root->vdev_guid = new_guid;
 	root->vdev_guid_sum += (new_guid - prev_guid);
 	vdev_config_dirty(root);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 	    (u_longlong_t)prev_guid, (u_longlong_t)new_guid);
 }
 
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
  * the root vdev's guid, our own pool guid, and then mark all of our
  * vdevs dirty.  Note that we must make sure that all our vdevs are
  * online when we do this, or else any vdevs that weren't present
  * would be orphaned from our pool.  We are also going to issue a
  * sysevent to update any watchers.
  *
  * The GUID of the pool will be changed to the value pointed to by guidp.
  * The GUID may not be set to the reserved value of 0 (EINVAL), nor to a
  * GUID that already exists (EEXIST).
  * A new GUID will be generated if guidp is NULL.
  */
 int
 spa_change_guid(spa_t *spa, const uint64_t *guidp)
 {
 	uint64_t guid;
 	int error;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	if (guidp == NULL) {
 		guid = spa_generate_guid(NULL);
 	} else if (*guidp == 0) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	} else if (spa_guid_exists(*guidp, 0)) {
 		error = SET_ERROR(EEXIST);
 		goto out;
 	} else {
 		guid = *guidp;
 	}
 
 	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Clear the kobj flag from all the vdevs to allow
 	 * vdev_cache_process_kobj_evt() to post events to all the
 	 * vdevs since GUID is updated.
 	 */
 	vdev_clear_kobj_evt(spa->spa_root_vdev);
 	for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 		vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
 
 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
 
 out:
 	/* Locks are dropped in the reverse of the order they were taken. */
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_error_entry_t nodes: orders entries by a bytewise
  * comparison of their bookmarks and normalises the result with TREE_ISIGN().
  */
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *lhs = a;
 	const spa_error_entry_t *rhs = b;
 	int diff = memcmp(&lhs->se_bookmark, &rhs->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
 	return (TREE_ISIGN(diff));
 }
 
 /*
  * Utility function which hands the caller copies of the current "last" and
  * "scrub" error trees and re-initializes the pool's own trees as empty.
  * The caller must hold spa_errlist_lock.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	avl_tree_t *cur_last = &spa->spa_errlist_last;
 	avl_tree_t *cur_scrub = &spa->spa_errlist_scrub;
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	/* Copy out the tree headers before they are re-created below. */
 	memcpy(last, cur_last, sizeof (avl_tree_t));
 	memcpy(scrub, cur_scrub, sizeof (avl_tree_t));
 
 	avl_create(cur_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(cur_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Create the set of taskqs for zio type t and taskq type q, as described by
  * zio_taskqs[t][q], and record them in spa->spa_zio_taskq[t][q].
  *
  * The configured mode decides how many taskqs are created (count) and the
  * thread value passed to taskq creation (value); for the SYNC and SCALE
  * modes TASKQ_THREADS_CPU_PCT is set alongside it.  ZTI_MODE_NULL creates
  * nothing and leaves the slot empty.  An unrecognized mode panics.
  */
 static void
 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 	enum zti_modes mode = ztip->zti_mode;
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t cpus, flags = TASKQ_DYNAMIC;
 
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		/* count and value are used exactly as configured. */
 		ASSERT3U(value, >, 0);
 		break;
 
 	case ZTI_MODE_SYNC:
 
 		/*
 		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
 		 * not to exceed the number of spa allocators, and align to it.
 		 */
 		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
 		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
 		while (spa->spa_alloc_count % count != 0 &&
 		    spa->spa_alloc_count < count * 2)
 			count--;
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
 		 * single taskq may have more threads than 100% of online cpus.
 		 */
 		value = (zio_taskq_batch_pct + count / 2) / count;
 		value = MIN(value, 100);
 		flags |= TASKQ_THREADS_CPU_PCT;
 		break;
 
 	case ZTI_MODE_SCALE:
 		flags |= TASKQ_THREADS_CPU_PCT;
 		/*
 		 * We want more taskqs to reduce lock contention, but we want
 		 * less for better request ordering and CPU utilization.
 		 */
 		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
 		if (zio_taskq_batch_tpq > 0) {
 			/* Round to the nearest whole number of taskqs. */
 			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
 			    zio_taskq_batch_tpq);
 		} else {
 			/*
 			 * Prefer 6 threads per taskq, but no more taskqs
 			 * than threads in them on large systems. For 80%:
 			 *
 			 *                 taskq   taskq   total
 			 * cpus    taskqs  percent threads threads
 			 * ------- ------- ------- ------- -------
 			 * 1       1       80%     1       1
 			 * 2       1       80%     1       1
 			 * 4       1       80%     3       3
 			 * 8       2       40%     3       6
 			 * 16      3       27%     4       12
 			 * 32      5       16%     5       25
 			 * 64      7       11%     7       49
 			 * 128     10      8%      10      100
 			 * 256     14      6%      15      210
 			 */
 			count = 1 + cpus / 6;
 			while (count * count > cpus)
 				count--;
 		}
 		/* Limit each taskq within 100% to not trigger assertion. */
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		value = (zio_taskq_batch_pct + count / 2) / count;
 		break;
 
 	case ZTI_MODE_NULL:
 		/* No taskqs for this slot; spa_taskqs_fini() checks for NULL. */
 		tqs->stqs_count = 0;
 		tqs->stqs_taskq = NULL;
 		return;
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_taskqs_init()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
 	ASSERT3U(count, >, 0);
 	tqs->stqs_count = count;
 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 		char name[32];
 
 		/* Only number the taskq names when there is more than one. */
 		if (count > 1)
 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
 			    zio_type_name[t], zio_taskq_types[q], i);
 		else
 			(void) snprintf(name, sizeof (name), "%s_%s",
 			    zio_type_name[t], zio_taskq_types[q]);
 
 #ifdef HAVE_SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			(void) zio_taskq_basedc;
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
 			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly less important
 			 * priority than the other taskqs.
 			 *
 			 * Under Linux and FreeBSD this means incrementing
 			 * the priority value as opposed to platforms like
 			 * illumos where it should be decremented.
 			 *
 			 * On FreeBSD, if priorities divided by four (RQ_PPQ)
 			 * are equal then a difference between them is
 			 * insignificant.
 			 */
 			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
 #if defined(__linux__)
 				pri++;
 #elif defined(__FreeBSD__)
 				pri += 4;
 #else
 #error "unknown OS"
 #endif
 			}
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef HAVE_SYSDC
 		}
 #endif
 
 		tqs->stqs_taskq[i] = tq;
 	}
 }
 
 /*
  * Destroy every taskq that spa_taskqs_init() created for zio type t and
  * taskq type q, then free the array that held them.  A slot with no taskq
  * array (stqs_taskq == NULL) is left alone.
  */
 static void
 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t i = 0;
 
 	if (tqs->stqs_taskq == NULL) {
 		ASSERT3U(tqs->stqs_count, ==, 0);
 		return;
 	}
 
 	while (i < tqs->stqs_count) {
 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 		taskq_destroy(tqs->stqs_taskq[i]);
 		i++;
 	}
 
 	/* The free size must match the kmem_alloc() size used at init. */
 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 	tqs->stqs_taskq = NULL;
 }
 
 #ifdef _KERNEL
 /*
  * The READ and WRITE rows of zio_taskqs are configurable at module load time
  * by setting zio_taskq_read or zio_taskq_write.
  *
  * Example (the defaults for READ and WRITE)
  *   zio_taskq_read='fixed,1,8 null scale null'
  *   zio_taskq_write='sync null scale null'
  *
  * Each sets the entire row at a time.
  *
  * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
  * of threads per taskq.
  *
  * 'null' can only be set on the high-priority queues (queue selection for
  * high-priority queues will fall back to the regular queue if the high-pri
  * is NULL).
  */
 /*
  * Mode names as they appear in the parameter string.  The table is indexed
  * by mode value: the parser switches on the index of the matching name
  * (case ZTI_MODE_FIXED, ...) and the formatter looks names up with
  * modes[zti_mode], so the entries must stay in the same order as the
  * ZTI_MODE_* constants.
  */
 static const char *const modes[ZTI_NMODES] = {
 	"fixed", "scale", "sync", "null"
 };
 
 /*
  * Parse the incoming config string and, if it describes one complete and
  * valid row, install it as zio_taskqs[t].  Modifies cfg: separators are
  * overwritten with NULs while tokenising.
  *
  * The string must contain exactly ZIO_TASKQ_TYPES entries, each one of
  * "fixed,Q,T", "scale", "sync" or "null".  Entries may be separated, and the
  * string may be surrounded, by any whitespace characters, so a value that
  * arrives with a trailing newline (e.g. from "echo ... > param") or that
  * uses tabs between entries is accepted.
  *
  * Returns 0 on success, or EINVAL if the string is malformed.  The row is
  * built in a local array and only copied out on success, so a failed parse
  * leaves the live configuration untouched.
  */
 static int
 spa_taskq_param_set(zio_type_t t, char *cfg)
 {
 	int err = 0;
 
 	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
 
 	char *next = cfg, *tok, *c;
 
 	/*
 	 * Parse out each element from the string and fill `row`. The entire
 	 * row has to be set at once, so any errors are flagged by just
 	 * breaking out of this loop early.
 	 */
 	uint_t q;
 	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
 		/* `next` is the start of the config */
 		if (next == NULL)
 			break;
 
 		/* Eat up leading space */
 		while (isspace(*next))
 			next++;
 		if (*next == '\0')
 			break;
 
 		/*
 		 * Mode ends at whitespace or end of string.  Any whitespace
 		 * character terminates the token, matching what is skipped
 		 * as leading/trailing space; `next` becomes NULL once the
 		 * end of the string has been reached.
 		 */
 		tok = next;
 		while (*next != '\0' && !isspace(*next))
 			next++;
 		if (*next != '\0')
 			*next++ = '\0';
 		else
 			next = NULL;
 
 		/* Parameters start after a comma */
 		c = strchr(tok, ',');
 		if (c != NULL)
 			*c++ = '\0';
 
 		/* Match mode string */
 		uint_t mode;
 		for (mode = 0; mode < ZTI_NMODES; mode++)
 			if (strcmp(tok, modes[mode]) == 0)
 				break;
 		if (mode == ZTI_NMODES)
 			break;
 
 		/* Invalid canary */
 		row[q].zti_mode = ZTI_NMODES;
 
 		/* Per-mode setup */
 		switch (mode) {
 
 		/*
 		 * FIXED is parameterised: number of queues, and number of
 		 * threads per queue.
 		 */
 		case ZTI_MODE_FIXED: {
 			/* No parameters? */
 			if (c == NULL || *c == '\0')
 				break;
 
 			/* Find next parameter */
 			tok = c;
 			c = strchr(tok, ',');
 			if (c == NULL)
 				break;
 
 			/* Take digits and convert */
 			unsigned long long nq;
 			if (!(isdigit(*tok)))
 				break;
 			err = ddi_strtoull(tok, &tok, 10, &nq);
 			/* Must succeed and also end at the next param sep */
 			if (err != 0 || tok != c)
 				break;
 
 			/* Move past the comma */
 			tok++;
 			/* Need another number */
 			if (!(isdigit(*tok)))
 				break;
 			/* Remember start to make sure we moved */
 			c = tok;
 
 			/* Take digits */
 			unsigned long long ntpq;
 			err = ddi_strtoull(tok, &tok, 10, &ntpq);
 			/* Must succeed, and moved forward */
 			if (err != 0 || tok == c || *tok != '\0')
 				break;
 
 			/*
 			 * sanity; zero queues/threads make no sense, and
 			 * 16K is almost certainly more than anyone will ever
 			 * need and avoids silly numbers like UINT32_MAX
 			 */
 			if (nq == 0 || nq >= 16384 ||
 			    ntpq == 0 || ntpq >= 16384)
 				break;
 
 			const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_SCALE: {
 			const zio_taskq_info_t zti = ZTI_SCALE;
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_SYNC: {
 			const zio_taskq_info_t zti = ZTI_SYNC;
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_NULL: {
 			/*
 			 * Can only null the high-priority queues; the general-
 			 * purpose ones have to exist.
 			 */
 			if (q != ZIO_TASKQ_ISSUE_HIGH &&
 			    q != ZIO_TASKQ_INTERRUPT_HIGH)
 				break;
 
 			const zio_taskq_info_t zti = ZTI_NULL;
 			row[q] = zti;
 			break;
 		}
 
 		default:
 			break;
 		}
 
 		/*
 		 * Ensure we set a mode: any early `break` above leaves the
 		 * canary in place and aborts the whole parse.
 		 */
 		if (row[q].zti_mode == ZTI_NMODES)
 			break;
 	}
 
 	/* Didn't get a full row, fail */
 	if (q < ZIO_TASKQ_TYPES)
 		return (SET_ERROR(EINVAL));
 
 	/* Eat trailing space */
 	if (next != NULL)
 		while (isspace(*next))
 			next++;
 
 	/* If there's anything left over then fail */
 	if (next != NULL && *next != '\0')
 		return (SET_ERROR(EINVAL));
 
 	/* Success! Copy it into the real config */
 	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
 		zio_taskqs[t][q] = row[q];
 
 	return (0);
 }
 
 /*
  * Format row t of the live zio_taskqs configuration into buf as a
  * space-separated list of modes, in the same syntax the setter parses
  * ("fixed" entries carry ",count,value").  A newline is appended when
  * add_newline is set.  Returns the number of characters written, not
  * counting the terminating NUL.  The caller must supply a buffer large
  * enough for the result; no bound is checked here.
  */
 static int
 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
 {
 	int len = 0;
 
 	/* Build parameter string from live config */
 	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
 		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
 		const char *sep = (q == 0) ? "" : " ";
 		char *out = &buf[len];
 
 		if (zti->zti_mode != ZTI_MODE_FIXED) {
 			len += sprintf(out, "%s%s", sep,
 			    modes[zti->zti_mode]);
 			continue;
 		}
 
 		len += sprintf(out, "%s%s,%u,%u", sep,
 		    modes[zti->zti_mode], zti->zti_count,
 		    zti->zti_value);
 	}
 
 	if (add_newline)
 		buf[len++] = '\n';
 	buf[len] = '\0';
 
 	return (len);
 }
 
 #ifdef __linux__
 /*
  * Linux module-parameter setter for the READ row.  The parser writes into
  * its input, so it is given a private copy of val.  Returns the negated
  * result of spa_taskq_param_set().
  */
 static int
 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	/* Size of the copy; measured on val because the parser edits cfg. */
 	const size_t cfgsz = strlen(val)+1;
 	char *cfg = kmem_strdup(val);
 	int err;
 
 	err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
 	kmem_free(cfg, cfgsz);
 
 	return (-err);
 }
 /*
  * Linux module-parameter getter for the READ row: formats the live
  * configuration into buf, newline-terminated, and returns its length.
  */
 static int
 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	int len = spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE);
 
 	return (len);
 }
 
 /*
  * Linux module-parameter setter for the WRITE row.  The parser writes into
  * its input, so it is given a private copy of val.  Returns the negated
  * result of spa_taskq_param_set().
  */
 static int
 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	/* Size of the copy; measured on val because the parser edits cfg. */
 	const size_t cfgsz = strlen(val)+1;
 	char *cfg = kmem_strdup(val);
 	int err;
 
 	err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
 	kmem_free(cfg, cfgsz);
 
 	return (-err);
 }
 /*
  * Linux module-parameter getter for the WRITE row: formats the live
  * configuration into buf, newline-terminated, and returns its length.
  */
 static int
 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE);
 
 	return (len);
 }
 #else
 /*
  * On FreeBSD load-time parameters can be set up before malloc() is available,
  * so we have to do all the parsing work on the stack.
  */
 #define	SPA_TASKQ_PARAM_MAX	(128)
 
 /*
  * sysctl handler for the READ row.  The current configuration is formatted
  * into a stack buffer and passed to sysctl_handle_string(); if the request
  * supplied a new value, the buffer is then parsed and installed.
  */
 static int
 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char buf[SPA_TASKQ_PARAM_MAX];
 	int err;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
 
 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (err != 0)
 		return (err);
 
 	/* No new value supplied: nothing to set. */
 	if (req->newptr == NULL)
 		return (0);
 
 	return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
 }
 
 /*
  * sysctl handler for the WRITE row.  The current configuration is formatted
  * into a stack buffer and passed to sysctl_handle_string(); if the request
  * supplied a new value, the buffer is then parsed and installed.
  */
 static int
 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char buf[SPA_TASKQ_PARAM_MAX];
 	int err;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
 
 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (err != 0)
 		return (err);
 
 	/* No new value supplied: nothing to set. */
 	if (req->newptr == NULL)
 		return (0);
 
 	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
 }
 #endif
 #endif /* _KERNEL */
 
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * Note that a type may have multiple discrete taskqs to avoid lock contention
  * on the taskq itself.
  *
  * When there are several taskqs, write-issue zios that have an allocator are
  * mapped by allocator number; everything else is spread using the current
  * high-resolution time.  cutinline requests dispatch with TQ_FRONT.
  */
 void
 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, zio_t *zio, boolean_t cutinline)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint64_t which;
 
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
 	/*
 	 * NB: We are assuming that the zio can only be dispatched
 	 * to a single taskq at a time.  It would be a grievous error
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	ASSERT(zio);
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 
 	if (tqs->stqs_count == 1)
 		which = 0;
 	else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
 	    ZIO_HAS_ALLOCATOR(zio))
 		which = zio->io_allocator % tqs->stqs_count;
 	else
 		which = ((uint64_t)gethrtime()) % tqs->stqs_count;
 
 	taskq_dispatch_ent(tqs->stqs_taskq[which], func, zio,
 	    cutinline ? TQ_FRONT : 0, &zio->io_tqent);
 }
 
 /*
  * Create the taskqs for every (zio type, taskq type) combination.
  */
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	int type, tq;
 
 	for (type = 0; type < ZIO_TYPES; type++)
 		for (tq = 0; tq < ZIO_TASKQ_TYPES; tq++)
 			spa_taskqs_init(spa, type, tq);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
 /*
  * Entry point of the per-pool process that spa_activate() starts with
  * newproc().  It labels the process "zpool-<poolname>", creates the zio
  * taskqs, moves spa_proc_state from SPA_PROC_CREATED to SPA_PROC_ACTIVE and
  * then sleeps on spa_proc_cv until the state leaves SPA_PROC_ACTIVE
  * (spa_deactivate() sets SPA_PROC_DEACTIVATE).  It finally marks itself
  * SPA_PROC_GONE, points spa_proc back at p0 and exits.
  */
 static void
 spa_thread(void *arg)
 {
 	/*
 	 * NOTE: this is a local initialized to PS_NONE, so the pset binding
 	 * branch below is never taken as the code stands.
 	 */
 	psetid_t zio_taskq_psrset_bind = PS_NONE;
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		/* Release in the reverse of the acquisition order. */
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 
 #ifdef HAVE_SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	/*
 	 * Publish the process and thread id before creating the taskqs;
 	 * spa_taskqs_init() passes spa->spa_proc to the taskq constructors.
 	 */
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	/* Wake spa_activate(), which waits for the state to leave CREATED. */
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	/* Acknowledge the deactivation request to spa_deactivate(). */
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif
 
 extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 /*
  * Activate an uninitialized pool.
  *
  * Moves the pool from POOL_STATE_UNINITIALIZED to POOL_STATE_ACTIVE with the
  * given open mode and creates the in-core infrastructure: the metaslab
  * classes, the zio taskqs (inside a dedicated process when HAVE_SPA_THREAD
  * is defined and one can be created, otherwise directly), the per-txg root
  * zios, the dirty/evicting lists, the error-log AVL trees, the keystore and
  * the per-pool zvol, metaslab, prefetch and upgrade taskqs.
  * spa_deactivate() undoes this.
  */
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
 	metaslab_ops_t *msp = metaslab_allocator(spa);
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
 	/* All metaslab classes are created with the same allocator ops. */
 	spa->spa_normal_class = metaslab_class_create(spa, msp);
 	spa->spa_log_class = metaslab_class_create(spa, msp);
 	spa->spa_embedded_log_class = metaslab_class_create(spa, msp);
 	spa->spa_special_class = metaslab_class_create(spa, msp);
 	spa->spa_dedup_class = metaslab_class_create(spa, msp);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef HAVE_SPA_THREAD
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) == 0) {
 			/*
 			 * Wait for spa_thread() to set SPA_PROC_ACTIVE; by
 			 * then it has set spa_proc and spa_did and created
 			 * the zio taskqs.
 			 */
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		} else {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		}
 	}
 #endif /* HAVE_SPA_THREAD */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
 	/* One root zio per txg slot; spa_deactivate() waits on each. */
 	for (size_t i = 0; i < TXG_SIZE; i++) {
 		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
 	    offsetof(objset_t, os_evicting_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list, spa,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	/* The three error-log trees share one comparator and node layout. */
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_healed,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 
 	spa_activate_os(spa);
 
 	spa_keystore_init(&spa->spa_keystore);
 
 	/*
 	 * This taskq is used to perform zvol-minor-related tasks
 	 * asynchronously. This has several advantages, including easy
 	 * resolution of various deadlocks.
 	 *
 	 * The taskq must be single threaded to ensure tasks are always
 	 * processed in the order in which they were dispatched.
 	 *
 	 * A taskq per pool allows one to keep the pools independent.
 	 * This way if one pool is suspended, it will not impact another.
 	 *
 	 * The preferred location to dispatch a zvol minor task is a sync
 	 * task. In this context, there is easy access to the spa_t and minimal
 	 * error handling is required because the sync task must succeed.
 	 */
 	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
 	    1, INT_MAX, 0);
 
 	/*
 	 * The taskq to preload metaslabs.
 	 */
 	spa->spa_metaslab_taskq = taskq_create("z_metaslab",
 	    metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
 	    TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * Taskq dedicated to prefetcher threads: this is used to prevent the
 	 * pool traverse code from monopolizing the global (and limited)
 	 * system_taskq by inappropriately scheduling long running tasks on it.
 	 */
 	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * The taskq to upgrade datasets in this pool. Currently used by
 	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
 	 */
 	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 }
 
 /*
  * Opposite of spa_activate().
  *
  * Tears down everything spa_activate() created and returns the pool to
  * POOL_STATE_UNINITIALIZED.  The assertions at the top require that syncing
  * is already off and that the dsl pool, root vdev and async zio roots are
  * already gone.  If a covering process was created, it is asked to exit and
  * joined before returning.
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	spa_evicting_os_wait(spa);
 
 	/* Destroy the per-pool auxiliary taskqs created by spa_activate(). */
 	if (spa->spa_zvol_taskq) {
 		taskq_destroy(spa->spa_zvol_taskq);
 		spa->spa_zvol_taskq = NULL;
 	}
 
 	if (spa->spa_metaslab_taskq) {
 		taskq_destroy(spa->spa_metaslab_taskq);
 		spa->spa_metaslab_taskq = NULL;
 	}
 
 	if (spa->spa_prefetch_taskq) {
 		taskq_destroy(spa->spa_prefetch_taskq);
 		spa->spa_prefetch_taskq = NULL;
 	}
 
 	if (spa->spa_upgrade_taskq) {
 		taskq_destroy(spa->spa_upgrade_taskq);
 		spa->spa_upgrade_taskq = NULL;
 	}
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_evicting_os_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	/* Cancel the delayed task recorded in spa_deadman_tqid. */
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
 	/* Reap the per-txg root zios; each must complete without error. */
 	for (size_t i = 0; i < TXG_SIZE; i++) {
 		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
 		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
 		spa->spa_txg_zio[i] = NULL;
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_embedded_log_class);
 	spa->spa_embedded_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_special_class);
 	spa->spa_special_class = NULL;
 
 	metaslab_class_destroy(spa->spa_dedup_class);
 	spa->spa_dedup_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 	avl_destroy(&spa->spa_errlist_healed);
 
 	spa_keystore_fini(&spa->spa_keystore);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	/*
 	 * If a covering process exists, ask it to deactivate and wait until
 	 * spa_thread() reports SPA_PROC_GONE.
 	 */
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 
 	spa_deactivate_os(spa);
 
 }
 
 /*
  * Verify a pool configuration, and construct the vdev tree appropriately.  This
  * will create all the necessary vdevs in the appropriate layout, with each vdev
  * in the CLOSED state.  This will prep the pool before open/creation/import.
  * All vdev validation is done by the vdev_alloc() routine.
  *
  * On success *vdp is the vdev built from nv, with any children attached
  * below it, and 0 is returned.  If the children cannot be read or parsed,
  * the vdev allocated here is freed, *vdp is set to NULL and an error is
  * returned.
  */
 int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **kids;
 	uint_t nkids;
 	int err;
 
 	err = vdev_alloc(spa, vdp, nv, parent, id, atype);
 	if (err != 0)
 		return (err);
 
 	/* Leaf vdevs have no children to descend into. */
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	err = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids);
 
 	/* An interior vdev without a children array is accepted as is. */
 	if (err == ENOENT)
 		return (0);
 
 	if (err != 0) {
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	for (int c = 0; c < nkids; c++) {
 		vdev_t *vd;
 
 		err = spa_config_parse(spa, &vd, kids[c], *vdp, c, atype);
 		if (err == 0)
 			continue;
 
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (err);
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 }
 
 /*
  * Decide whether spa_unload() should flush all metaslabs first.  That is the
  * case only when the log spacemap feature is active, the pool is writeable,
  * syncing is on, the pool state is POOL_STATE_EXPORTED, and the
  * zfs_keep_log_spacemaps_at_export tunable is not set.
  */
 static boolean_t
 spa_should_flush_logs_on_unload(spa_t *spa)
 {
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) &&
 	    spa_writeable(spa) &&
 	    spa->spa_sync_on &&
 	    spa_state(spa) == POOL_STATE_EXPORTED &&
 	    !zfs_keep_log_spacemaps_at_export)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Opens a transaction that will set the flag that will instruct
  * spa_sync to attempt to flush all the metaslabs for that txg, then
  * waits for that txg to be synced.
  */
 static void
 spa_unload_log_sm_flush_all(spa_t *spa)
 {
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 	/* The flush-all txg must not already be set. */
 	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
 	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
 	dmu_tx_commit(tx);
 
 	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
 }
 
 /*
  * Free the in-core log spacemap bookkeeping: every node of
  * spa_sm_logs_by_txg and every entry of spa_log_summary (each of which must
  * have a zero metaslab count), then zero the unflushed statistics.
  */
 static void
 spa_unload_log_sm_metadata(spa_t *spa)
 {
 	spa_log_sm_t *sls;
 	log_summary_entry_t *e;
 	void *cookie = NULL;
 
 	for (sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie);
 	    sls != NULL;
 	    sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) {
 		VERIFY0(sls->sls_mscount);
 		kmem_free(sls, sizeof (spa_log_sm_t));
 	}
 
 	for (e = list_remove_head(&spa->spa_log_summary); e != NULL;
 	    e = list_remove_head(&spa->spa_log_summary)) {
 		VERIFY0(e->lse_mscount);
 		kmem_free(e, sizeof (log_summary_entry_t));
 	}
 
 	spa->spa_unflushed_stats.sus_blocklimit = 0;
 	spa->spa_unflushed_stats.sus_memused = 0;
 	spa->spa_unflushed_stats.sus_nblocks = 0;
 }
 
 /*
  * Destroy whichever of the pool's auxiliary zthrs currently exist (condense,
  * checkpoint discard, livelist delete, livelist condense, raidz expand) and
  * clear the corresponding pointers.
  */
 static void
 spa_destroy_aux_threads(spa_t *spa)
 {
 	/* Visited in this order. */
 	zthr_t **const aux[] = {
 		&spa->spa_condense_zthr,
 		&spa->spa_checkpoint_discard_zthr,
 		&spa->spa_livelist_delete_zthr,
 		&spa->spa_livelist_condense_zthr,
 		&spa->spa_raidz_expand_zthr,
 	};
 
 	for (size_t i = 0; i < sizeof (aux) / sizeof (aux[0]); i++) {
 		if (*aux[i] == NULL)
 			continue;
 		zthr_destroy(*aux[i]);
 		*aux[i] = NULL;
 	}
 }
 
 /*
  * Opposite of spa_load().
  *
  * Quiesces and tears down the in-core state of a pool, in this order:
  * stop background work (async tasks and vdev initialize/trim/autotrim/
  * rebuild activity), stop txg syncing, wait for metaslab prefetch tasks,
  * stop MMP, wait for outstanding async zios, destroy the auxiliary threads,
  * and then, with all config locks held as writer, free the vdev tree,
  * close the DSL pool, unload DDT/BRT/log space map state, free the spare
  * and l2cache vdevs, and free the cached comment/compatibility strings.
  *
  * The caller must hold spa_namespace_lock or be the thread recorded in
  * spa_export_thread, and the pool must not be POOL_STATE_UNINITIALIZED.
  */
 static void
 spa_unload(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
 
 	spa_import_progress_remove(spa_guid(spa));
 	spa_load_note(spa, "UNLOADING");
 
 	spa_wake_waiters(spa);
 
 	/*
 	 * If we have set the spa_final_txg, we have already performed the
 	 * tasks below in spa_export_common(). We should not redo it here since
 	 * we delay the final TXGs beyond what spa_final_txg is set at.
 	 */
 	if (spa->spa_final_txg == UINT64_MAX) {
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
 		 * time flushing as many metaslabs as we can in an attempt to
 		 * destroy log space maps and save import time.
 		 */
 		if (spa_should_flush_logs_on_unload(spa))
 			spa_unload_log_sm_flush_all(spa);
 
 		/*
 		 * Stop async tasks.
 		 */
 		spa_async_suspend(spa);
 
 		/*
 		 * Stop per-vdev background activity; only meaningful when a
 		 * vdev tree is present.
 		 */
 		if (spa->spa_root_vdev) {
 			vdev_t *root_vdev = spa->spa_root_vdev;
 			vdev_initialize_stop_all(root_vdev,
 			    VDEV_INITIALIZE_ACTIVE);
 			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
 			vdev_autotrim_stop_all(spa);
 			vdev_rebuild_stop_all(spa);
 			l2arc_spa_rebuild_stop(spa);
 		}
 	}
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * This ensures that there is no async metaslab prefetching
 	 * while we attempt to unload the spa.
 	 */
 	taskq_wait(spa->spa_metaslab_taskq);
 
 	if (spa->spa_mmp.mmp_thread)
 		mmp_thread_stop(spa);
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 * There is one root zio per CPU (max_ncpus entries); each is waited
 	 * on before the array itself is freed.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		for (int i = 0; i < max_ncpus; i++)
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
 	if (spa->spa_vdev_removal != NULL) {
 		spa_vdev_removal_destroy(spa->spa_vdev_removal);
 		spa->spa_vdev_removal = NULL;
 	}
 
 	spa_destroy_aux_threads(spa);
 
 	spa_condense_fini(spa);
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	/*
 	 * Everything from here to the end of the function runs with all
 	 * config locks held as writer; they are dropped by the
 	 * spa_config_exit() call at the bottom.
 	 */
 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	ASSERT(spa->spa_root_vdev == NULL);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 	brt_unload(spa);
 	spa_unload_log_sm_metadata(spa);
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	/*
 	 * Free the spare vdevs, the array that holds them, and the cached
 	 * spares config, then reset the count.
 	 */
 	if (spa->spa_spares.sav_vdevs) {
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			vdev_free(spa->spa_spares.sav_vdevs[i]);
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	/*
 	 * Same for the l2cache vdevs; their stats are cleared before each
 	 * vdev is freed.
 	 */
 	if (spa->spa_l2cache.sav_vdevs) {
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 		}
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	/* Reset the async suspend count taken by spa_async_suspend() above. */
 	spa->spa_async_suspended = 0;
 
 	spa->spa_indirect_vdevs_loaded = B_FALSE;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 	if (spa->spa_compatibility != NULL) {
 		spa_strfree(spa->spa_compatibility);
 		spa->spa_compatibility = NULL;
 	}
 
 	/* Only the pointer is cleared here; nothing is freed through it. */
 	spa->spa_raidz_expand = NULL;
 
 	spa_config_exit(spa, SCL_ALL, spa);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  *
  * The caller must hold all config locks (SCL_ALL) as writer.  On return
  * spa_spares.sav_vdevs/sav_count describe the newly parsed spares (NULL/0
  * when there are none) and the ZPOOL_CONFIG_SPARES array inside
  * spa_spares.sav_config has been regenerated from those vdevs.
  */
 void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 #ifndef _KERNEL
 	/*
 	 * zdb opens both the current state of the pool and the
 	 * checkpointed state (if present), with a different spa_t.
 	 *
 	 * As spare vdevs are shared among open pools, we skip loading
 	 * them when we load the checkpointed state of the pool.
 	 */
 	if (!spa_writeable(spa))
 		return;
 #endif
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	if (spa->spa_spares.sav_vdevs) {
 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
 			vd = spa->spa_spares.sav_vdevs[i];
 
 			/*
 			 * Undo the call to spa_activate() below.
 			 * NOTE(review): the loop below calls spa_spare_add()
 			 * and spa_spare_activate() on the in-config vdev, not
 			 * spa_activate(); presumably that is what is being
 			 * undone here -- confirm.
 			 */
 			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 			    B_FALSE)) != NULL && tvd->vdev_isspare)
 				spa_spare_remove(tvd);
 			vdev_close(vd);
 			vdev_free(vd);
 		}
 
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 	}
 
 	/* No cached config means no spares; otherwise the array must exist. */
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares));
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.   For each spare, there are potentially two different vdev_t
 	 * structures associated with it: one in the list of spares (used only
 	 * for basic validation purposes) and one in the active vdev
 	 * configuration (if it's spared in).  During this phase we open and
 	 * validate each vdev on the spare list.  If the vdev also exists in the
 	 * active configuration, then we also mark this vdev as an active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pools would think the
 			 * spare is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		/* A spare is its own top-level vdev and belongs to sav. */
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		/* A spare that fails to open stays in the array, unregistered. */
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
 
 	/*
 	 * 'spares' is reused as a temporary array of freshly generated
 	 * nvlists.  Each element and the array are freed below, after
 	 * fnvlist_add_nvlist_array() has been given the array.
 	 */
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 	    spa->spa_spares.sav_count);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  *
  * The caller must hold all config locks (SCL_ALL) as writer.  Previously
  * loaded vdevs whose guid is no longer present in the config are removed
  * from the L2ARC (when present there) and freed.
  */
 void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache = NULL;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 #ifndef _KERNEL
 	/*
 	 * zdb opens both the current state of the pool and the
 	 * checkpointed state (if present), with a different spa_t.
 	 *
 	 * As L2 caches are part of the ARC which is shared among open
 	 * pools, we skip loading them when we load the checkpointed
 	 * state of the pool.
 	 */
 	if (!spa_writeable(spa))
 		return;
 #endif
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * Detach the current array from sav.  Entries that are carried over
 	 * are moved into newvdevs (and NULLed in oldvdevs); whatever is
 	 * left in oldvdevs is purged at 'out'.
 	 */
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	/* Without a config every old vdev is dropped. */
 	if (sav->sav_config == NULL) {
 		nl2cache = 0;
 		newvdevs = NULL;
 		goto out;
 	}
 
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
 
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 */
 				newvdevs[i] = vd;
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE) == 0);
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			if (vdev_open(vd) != 0)
 				continue;
 
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 
 			/*
 			 * Upon cache device addition to a pool or pool
 			 * creation with a cache device or if the header
 			 * of the device is invalid we issue an async
 			 * TRIM command for the whole device which will
 			 * execute if l2arc_trim_ahead > 0.
 			 */
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	}
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
 
 	/*
 	 * 'l2cache' is reused as a temporary array of generated nvlists; it
 	 * is only (re)allocated when there is at least one device, and is
 	 * freed at the very end of the function under the same condition.
 	 */
 	if (sav->sav_count > 0)
 		l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
 		    KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    (const nvlist_t * const *)l2cache, sav->sav_count);
 
 out:
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	if (oldvdevs) {
 		for (i = 0; i < oldnvdevs; i++) {
 			uint64_t pool;
 
 			vd = oldvdevs[i];
 			if (vd != NULL) {
 				ASSERT(vd->vdev_isl2cache);
 
 				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 				    pool != 0ULL && l2arc_vdev_present(vd))
 					l2arc_remove_vdev(vd);
 				vdev_clear_stats(vd);
 				vdev_free(vd);
 			}
 		}
 
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 	}
 
 	/*
 	 * Release the temporary nvlists.  When we arrived here through the
 	 * 'goto out' above, sav_count is 0 and both steps are skipped.
 	 */
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 /*
  * Read the packed nvlist stored in MOS object 'obj' and unpack it into
  * '*value'.  The packed length is taken from the first 64-bit word of the
  * object's bonus buffer.
  *
  * '*value' is set to NULL up front, so it is NULL whenever the bonus hold
  * or the read fails.  Returns 0 on success or the error from
  * dmu_bonus_hold(), dmu_read() or nvlist_unpack().
  */
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	dmu_buf_t *bonus;
 	int err;
 
 	*value = NULL;
 
 	err = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus);
 	if (err != 0)
 		return (err);
 
 	/* The hold is only needed long enough to fetch the length. */
 	size_t len = *(uint64_t *)bonus->db_data;
 	dmu_buf_rele(bonus, FTAG);
 
 	char *buf = vmem_alloc(len, KM_SLEEP);
 
 	err = dmu_read(spa->spa_meta_objset, obj, 0, len, buf,
 	    DMU_READ_PREFETCH);
 	if (err == 0)
 		err = nvlist_unpack(buf, len, value, 0);
 
 	vmem_free(buf, len);
 	return (err);
 }
 
 /*
  * Count the "core" top-level vdevs: children of the root vdev that are not
  * logs, are concrete, and are not dead.  At every spa_sync we write new
  * uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
  */
 static uint64_t
 spa_healthy_core_tvds(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t healthy = 0;
 
 	for (uint64_t c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (!tvd->vdev_islog && vdev_is_concrete(tvd) &&
 		    !vdev_is_dead(tvd))
 			healthy++;
 	}
 
 	return (healthy);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd' (children first) and, for every leaf
  * that is dead and concrete, i.e. could not be opened, post a sysevent to
  * notify the autoreplace code that the device has been removed, followed
  * by an ESC_ZFS_VDEV_CHECK event.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		spa_check_removed(vd->vdev_child[c]);
 
 	/* Only leaves are reported. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (!vdev_is_dead(vd) || !vdev_is_concrete(vd))
 		return;
 
 	zfs_post_autoreplace(vd->vdev_spa, vd);
 	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
 }
 
 /*
  * Look for log top-level vdevs that failed to open
  * (VDEV_STATE_CANT_OPEN).
  *
  * Without ZFS_IMPORT_MISSING_LOG in spa_import_flags, a config nvlist is
  * generated for each such vdev, the list is published in spa_load_info
  * under ZPOOL_CONFIG_MISSING_DEVICES, and the load is failed with ENXIO.
  *
  * With ZFS_IMPORT_MISSING_LOG, the first such vdev causes the log state to
  * be set to SPA_LOG_CLEAR and a note to be logged; the load continues.
  *
  * Returns 0 if no log device is missing or missing logs are tolerated,
  * ENXIO otherwise.
  */
 static int
 spa_check_for_missing_logs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing log devices.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *nv;
 		uint64_t idx = 0;
 		/*
 		 * Worst case every child is a missing log.  Compute the size
 		 * once so the allocation and the free below always agree.
 		 */
 		size_t child_size = rvd->vdev_children * sizeof (nvlist_t *);
 
 		child = kmem_alloc(child_size, KM_SLEEP);
 		nv = fnvlist_alloc();
 
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			/*
 			 * We consider a device as missing only if it failed
 			 * to open (i.e. offline or faulted is not considered
 			 * as missing).
 			 */
 			if (tvd->vdev_islog &&
 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				child[idx++] = vdev_config_generate(spa, tvd,
 				    B_FALSE, VDEV_CONFIG_MISSING);
 			}
 		}
 
 		if (idx > 0) {
 			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 			    (const nvlist_t * const *)child, idx);
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
 
 			/* The generated nvlists are released once added. */
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(nv);
 		kmem_free(child, child_size);
 
 		if (idx > 0) {
 			spa_load_failed(spa, "some log devices are missing");
 			vdev_dbgmsg_print_tree(rvd, 2);
 			return (SET_ERROR(ENXIO));
 		}
 	} else {
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			if (tvd->vdev_islog &&
 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 				spa_load_note(spa, "some log devices are "
 				    "missing, ZIL is dropped.");
 				vdev_dbgmsg_print_tree(rvd, 2);
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Check for missing log devices.
  *
  * Only acts when the log state is SPA_LOG_MISSING or SPA_LOG_UNKNOWN: every
  * dataset under the root directory is visited with zil_check_log_chain().
  * If that traversal reports an error the log state is set to
  * SPA_LOG_MISSING and B_TRUE is returned; in every other case the result
  * is B_FALSE.
  */
 static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	switch (spa->spa_log_state) {
 	case SPA_LOG_MISSING:
 		/* need to recheck in case slog has been restored */
 	case SPA_LOG_UNKNOWN:
 		break;
 	default:
 		return (B_FALSE);
 	}
 
 	boolean_t missing = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
 
 	if (missing)
 		spa_set_log_state(spa, SPA_LOG_MISSING);
 
 	return (missing);
 }
 
 /*
  * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
  *
  * Calls metaslab_group_passivate() on the metaslab group of every log
  * top-level vdev.  The caller must hold SCL_ALLOC as writer.
  *
  * Returns B_TRUE if at least one log vdev was found, B_FALSE otherwise.
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	boolean_t slog_found = B_FALSE;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * The index is uint64_t like the other vdev_children loops in this
 	 * file, avoiding a signed/unsigned comparison.
 	 */
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_islog) {
 			/* A log vdev has no embedded log metaslab group. */
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(tvd->vdev_mg);
 			slog_found = B_TRUE;
 		}
 	}
 
 	return (slog_found);
 }
 
 /*
  * Activate any log vdevs (note, does not apply to embedded log metaslabs).
  *
  * Calls metaslab_group_activate() on the metaslab group of every log
  * top-level vdev.  The caller must hold SCL_ALLOC as writer.
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * The index is uint64_t like the other vdev_children loops in this
 	 * file, avoiding a signed/unsigned comparison.
 	 */
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_islog) {
 			/* A log vdev has no embedded log metaslab group. */
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_activate(tvd->vdev_mg);
 		}
 	}
 }
 
 /*
  * Run zil_reset() over every dataset in the pool and, if that succeeds,
  * wait for the current txg to sync.
  *
  * Returns 0 on success or the error reported by dmu_objset_find().
  */
 int
 spa_reset_logs(spa_t *spa)
 {
 	int error = dmu_objset_find(spa_name(spa), zil_reset, NULL,
 	    DS_FIND_CHILDREN);
 
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We successfully offlined the log device, sync out the
 	 * current txg so that the "stubby" block can be removed
 	 * by zil_sync().
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	return (0);
 }
 
 /*
  * Run spa_check_removed() on each vdev of an auxiliary vdev list.
  */
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	int idx = 0;
 
 	while (idx < sav->sav_count) {
 		spa_check_removed(sav->sav_vdevs[idx]);
 		idx++;
 	}
 }
 
 /*
  * zio completion hook: on success, raise spa_claim_max_txg to the logical
  * birth txg of the zio's block pointer if that is larger.  Failed zios
  * are ignored.
  */
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	if (zio->io_error == 0) {
 		mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
 		uint64_t birth = BP_GET_LOGICAL_BIRTH(zio->io_bp);
 		if (birth > spa->spa_claim_max_txg)
 			spa->spa_claim_max_txg = birth;
 		mutex_exit(&spa->spa_props_lock);
 	}
 }
 
 /*
  * Error accounting shared between spa_load_verify() and its traversal and
  * zio-completion callbacks.
  */
 typedef struct spa_load_error {
 	/* when false, spa_load_verify_cb() skips non-metadata blocks */
 	boolean_t	sle_verify_data;
 	/* failed reads of metadata blocks, plus damaged block pointers */
 	uint64_t	sle_meta_count;
 	/* failed reads of all other blocks (including intent log blocks) */
 	uint64_t	sle_data_count;
 } spa_load_error_t;
 
 /*
  * Completion callback for the reads issued by spa_load_verify_cb().
  *
  * Frees the read buffer, counts a failed read as a metadata or data error
  * in the shared spa_load_error_t, then subtracts the block's physical size
  * from spa_load_verify_bytes and wakes anyone throttled on
  * spa_scrub_io_cv.
  */
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	spa_load_error_t *sle = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 	int error = zio->io_error;
 
 	abd_free(zio->io_abd);
 
 	if (error != 0) {
 		/*
 		 * Indirect blocks and metadata object types count as
 		 * metadata, except for intent log blocks, which count as
 		 * data.
 		 */
 		uint64_t *counter = &sle->sle_data_count;
 
 		if (type != DMU_OT_INTENT_LOG &&
 		    (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)))
 			counter = &sle->sle_meta_count;
 
 		atomic_inc_64(counter);
 	}
 
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Maximum number of inflight bytes is the log2 fraction of the arc size.
  * By default, we set it to 1/16th of the arc.
  */
 static uint_t spa_load_verify_shift = 4;
 /*
  * When zero, spa_load_verify() skips the pool traversal and
  * spa_load_verify_cb() returns without checking anything.
  */
 static int spa_load_verify_metadata = B_TRUE;
 /*
  * When zero, spa_load_verify_cb() does not read non-metadata blocks.
  */
 static int spa_load_verify_data = B_TRUE;
 
 /*
  * traverse_pool() callback used by spa_load_verify().
  *
  * 'arg' is the root zio; its io_private is the shared spa_load_error_t.
  * For each block pointer that should be verified, an asynchronous scrub
  * read is issued as a child of the root zio, throttled so that the bytes
  * in flight stay below arc_target_bytes() >> spa_load_verify_shift.
  * Always returns 0, so the traversal continues.
  */
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zio_t *root = arg;
 	spa_load_error_t *errs = root->io_private;
 
 	(void) zilog;
 	(void) dnp;
 
 	/*
 	 * Note: normally this routine will not be called if
 	 * spa_load_verify_metadata is not set.  However, it may be useful
 	 * to manually set the flag after the traversal has begun.
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
 
 	/*
 	 * Sanity check the block pointer in order to detect obvious damage
 	 * before using the contents in subsequent checks or in zio_read().
 	 * When damaged consider it to be a metadata error since we cannot
 	 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
 	 */
 	if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		atomic_inc_64(&errs->sle_meta_count);
 		return (0);
 	}
 
 	/* Nothing to read for these. */
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	/* Data blocks are read only when both switches allow it. */
 	if (!BP_IS_METADATA(bp)) {
 		if (!spa_load_verify_data || !errs->sle_verify_data)
 			return (0);
 	}
 
 	uint64_t inflight_limit =
 	    arc_target_bytes() >> spa_load_verify_shift;
 	size_t psize = BP_GET_PSIZE(bp);
 
 	/* Wait until the completion callback has drained enough bytes. */
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_load_verify_bytes >= inflight_limit)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes += psize;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	zio_nowait(zio_read(root, spa, bp,
 	    abd_alloc_for_io(psize, B_FALSE), psize,
 	    spa_load_verify_done, root->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 
 	return (0);
 }
 
 /*
  * dmu_objset_find_dp() callback: fail with ENAMETOOLONG if the dataset's
  * name length is ZFS_MAX_DATASET_NAME_LEN or more, otherwise return 0.
  */
 static int
 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	(void) dp;
 	(void) arg;
 
 	if (dsl_dataset_namelen(ds) < ZFS_MAX_DATASET_NAME_LEN)
 		return (0);
 
 	return (SET_ERROR(ENAMETOOLONG));
 }
 
 /*
  * Verify the pool as loaded at the current uberblock.
  *
  * Nothing is checked (0 is returned) when the load policy asks never to
  * rewind or places no limit on metadata errors.  Otherwise every dataset
  * name length is checked and, if spa_load_verify_metadata is set, the pool
  * is traversed from spa_verify_min_txg with spa_load_verify_cb() issuing
  * reads under a root zio.  The metadata/data error counts are stored in
  * the spa and compared against the policy limits.
  *
  * When verification passes (or spa_load_verify_dryrun is set), the load
  * txg/timestamp and error counts are recorded in spa_load_info; otherwise
  * spa_load_max_txg is set to the current uberblock's txg.
  *
  * Returns 0 on success or dry run, the dataset-name check error, ENXIO or
  * EIO from a failed traversal (any other traversal error is mapped to
  * EIO), or EIO when the error counts exceed the policy limits.
  */
 static int
 spa_load_verify(spa_t *spa)
 {
 	zio_t *rio;
 	spa_load_error_t sle = { 0 };
 	zpool_load_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
 	int error = 0;
 
 	zpool_get_load_policy(spa->spa_config, &policy);
 
 	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
 	    policy.zlp_maxmeta == UINT64_MAX)
 		return (0);
 
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	error = dmu_objset_find_dp(spa->spa_dsl_pool,
 	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
 	    DS_FIND_CHILDREN);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Verify data only if we are rewinding or error limit was set.
 	 * Otherwise nothing except dbgmsg care about it to waste time.
 	 */
 	sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
 	    (policy.zlp_maxdata < UINT64_MAX);
 
 	/* All verification reads are children of this root zio. */
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	if (spa_load_verify_metadata) {
 		if (spa->spa_extreme_rewind) {
 			spa_load_note(spa, "performing a complete scan of the "
 			    "pool since extreme rewind is on. This may take "
 			    "a very long time.\n  (spa_load_verify_data=%u, "
 			    "spa_load_verify_metadata=%u)",
 			    spa_load_verify_data, spa_load_verify_metadata);
 		}
 
 		error = traverse_pool(spa, spa->spa_verify_min_txg,
 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
 	}
 
 	/*
 	 * Individual read failures are tallied in sle by the completion
 	 * callback, so the root zio's own result is not needed.
 	 */
 	(void) zio_wait(rio);
 	ASSERT0(spa->spa_load_verify_bytes);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
 		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
 		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
 		    (u_longlong_t)sle.sle_data_count);
 	}
 
 	if (spa_load_verify_dryrun ||
 	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
 	    sle.sle_data_count <= policy.zlp_maxdata)) {
 		int64_t loss = 0;
 
 		verify_ok = B_TRUE;
 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 		/* Time between the last synced uberblock and the loaded one. */
 		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
 		    spa->spa_load_txg_ts);
 		fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
 		    loss);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
 	} else {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 	}
 
 	if (spa_load_verify_dryrun)
 		return (0);
 
 	if (error) {
 		if (error != ENXIO && error != EIO)
 			error = SET_ERROR(EIO);
 		return (error);
 	}
 
 	/*
 	 * The error originates here, so route it through SET_ERROR like the
 	 * other newly generated errors in this function.
 	 */
 	return (verify_ok ? 0 : SET_ERROR(EIO));
 }
 
 /*
  * Find a value in the pool props object.
  *
  * Looks up the single uint64_t stored under the property's name.  The
  * zap_lookup() result is discarded, so the caller cannot tell from this
  * function whether '*val' was filled in.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	const char *propname = zpool_prop_to_name(prop);
 
 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
 	    propname, sizeof (*val), 1, val);
 }
 
 /*
  * Find a value in the pool directory object.
  *
  * Looks up the single uint64_t stored under 'name' in
  * DMU_POOL_DIRECTORY_OBJECT.  A failure is logged with spa_load_failed(),
  * except that ENOENT is only logged when 'log_enoent' is set.
  *
  * Returns the zap_lookup() result.
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
 {
 	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	if (error == 0)
 		return (0);
 
 	if (error != ENOENT || log_enoent) {
 		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
 		    "[error=%d]", name, error);
 	}
 
 	return (error);
 }
 
 /*
  * Mark 'vdev' as VDEV_STATE_CANT_OPEN with the given aux reason and hand
  * 'err' back (through SET_ERROR) so callers can fail and return in one
  * statement.
  */
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 
 	int error = SET_ERROR(err);
 	return (error);
 }
 
 /*
  * Report whether the pool has livelists queued for deletion, i.e. whether
  * spa_livelists_to_delete is non-zero.
  */
 boolean_t
 spa_livelist_delete_check(spa_t *spa)
 {
 	uint64_t pending = spa->spa_livelists_to_delete;
 
 	return (pending != 0);
 }
 
 /*
  * zthr check function for the livelist deletion thread: 'arg' is the
  * spa_t; the zthr itself is not consulted.
  */
 static boolean_t
 spa_livelist_delete_cb_check(void *arg, zthr_t *z)
 {
 	(void) z;
 
 	return (spa_livelist_delete_check(arg));
 }
 
 /*
  * bplist_iterate() callback used by sublist_delete_sync(): free one block
  * in the tx's txg and subtract its sizes from the pool's free dir
  * accounting.  'arg' is the spa_t.  Always returns 0.
  */
 static int
 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	zio_free(spa, tx->tx_txg, bp);
 	/* The three deltas are negated: this space is no longer in use. */
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    -bp_get_dsize_sync(spa, bp),
 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
 	return (0);
 }
 
 /*
  * Fetch the first entry of the ZAP object 'zap_obj' and, on success, store
  * its first integer value in '*llp'.
  *
  * Returns the zap_cursor_retrieve() result; '*llp' is left untouched on
  * failure.
  */
 static int
 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t *attr = zap_attribute_alloc();
 	int error;
 
 	zap_cursor_init(&cursor, os, zap_obj);
 	error = zap_cursor_retrieve(&cursor, attr);
 	zap_cursor_fini(&cursor);
 
 	if (error != 0) {
 		zap_attribute_free(attr);
 		return (error);
 	}
 
 	*llp = attr->za_first_integer;
 	zap_attribute_free(attr);
 
 	return (0);
 }
 
 /*
  * Components of livelist deletion that must be performed in syncing
  * context: freeing block pointers and updating the pool-wide data
  * structures to indicate how much work is left to do
  */
 typedef struct sublist_delete_arg {
 	spa_t *spa;		/* pool the livelist belongs to */
 	dsl_deadlist_t *ll;	/* open livelist the sublist is removed from */
 	uint64_t key;		/* dle_mintxg of the sublist entry to remove */
 	bplist_t *to_free;	/* block pointers to free in syncing context */
 } sublist_delete_arg_t;
 
 /*
  * Sync task for deleting one sublist: free every block pointer collected
  * in 'to_free', then remove the sublist's entry (identified by 'key') from
  * the livelist.  'arg' is a sublist_delete_arg_t.
  */
 static void
 sublist_delete_sync(void *arg, dmu_tx_t *tx)
 {
 	sublist_delete_arg_t *sda = arg;
 
 	bplist_iterate(sda->to_free, delete_blkptr_cb, sda->spa, tx);
 	dsl_deadlist_remove_entry(sda->ll, sda->key, tx);
 }
 
 /*
  * Argument for livelist_delete_sync().
  */
 typedef struct livelist_delete_arg {
 	spa_t *spa;		/* pool being operated on */
 	uint64_t ll_obj;	/* object number of the livelist to free */
 	uint64_t zap_obj;	/* ZAP object listing livelists to delete */
 } livelist_delete_arg_t;
 
 /*
  * Sync task that finishes the deletion of one livelist: remove it from the
  * ZAP of livelists to delete, free the livelist and drop the
  * SPA_FEATURE_LIVELIST refcount.  If that ZAP is now empty, it is
  * destroyed along with its DMU_POOL_DELETED_CLONES directory entry,
  * spa_livelists_to_delete is cleared and waiters are notified.
  */
 static void
 livelist_delete_sync(void *arg, dmu_tx_t *tx)
 {
 	livelist_delete_arg_t *lda = arg;
 	spa_t *spa = lda->spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t remaining;
 
 	/* free the livelist and decrement the feature count */
 	VERIFY0(zap_remove_int(mos, lda->zap_obj, lda->ll_obj, tx));
 	dsl_deadlist_free(mos, lda->ll_obj, tx);
 	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
 
 	VERIFY0(zap_count(mos, lda->zap_obj, &remaining));
 	if (remaining != 0)
 		return;
 
 	/* no more livelists to delete */
 	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, tx));
 	VERIFY0(zap_destroy(mos, lda->zap_obj, tx));
 	spa->spa_livelists_to_delete = 0;
 	spa_notify_waiters(spa);
 }
 
 /*
  * zthr work function for livelist deletion.
  *
  * Picks the next livelist recorded in spa_livelists_to_delete.  If it
  * still has sublists, the first one is processed to determine which block
  * pointers should actually be freed, and a sync task performs the frees
  * and removes that sublist.  If it has none left, a sync task frees the
  * livelist itself and updates the pool-wide livelist data.
  */
 static void
 spa_livelist_delete_cb(void *arg, zthr_t *z)
 {
 	spa_t *spa = arg;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zap_obj = spa->spa_livelists_to_delete;
 	uint64_t ll_obj = 0;
 	uint64_t nsublists;
 
 	/*
 	 * Determine the next livelist to delete. This function should only
 	 * be called if there is at least one deleted clone.
 	 */
 	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
 	VERIFY0(zap_count(mos, ll_obj, &nsublists));
 
 	if (nsublists == 0) {
 		/* Every sublist is gone: retire the livelist itself. */
 		livelist_delete_arg_t sync_arg = {
 		    .spa = spa,
 		    .ll_obj = ll_obj,
 		    .zap_obj = zap_obj
 		};
 
 		zfs_dbgmsg("deletion of livelist %llu completed",
 		    (u_longlong_t)ll_obj);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
 		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
 		return;
 	}
 
 	dsl_deadlist_t *ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
 	VERIFY0(dsl_deadlist_open(ll, mos, ll_obj));
 
 	dsl_deadlist_entry_t *dle = dsl_deadlist_first(ll);
 	ASSERT3P(dle, !=, NULL);
 
 	bplist_t to_free;
 	bplist_create(&to_free);
 
 	int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
 	    z, NULL);
 	if (err != 0) {
 		/* The only tolerated failure is an interruption. */
 		VERIFY3U(err, ==, EINTR);
 	} else {
 		sublist_delete_arg_t sync_arg = {
 		    .spa = spa,
 		    .ll = ll,
 		    .key = dle->dle_mintxg,
 		    .to_free = &to_free
 		};
 
 		zfs_dbgmsg("deleting sublist (id %llu) from"
 		    " livelist %llu, %lld remaining",
 		    (u_longlong_t)dle->dle_bpobj.bpo_object,
 		    (u_longlong_t)ll_obj, (longlong_t)nsublists - 1);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 		    sublist_delete_sync, &sync_arg, 0,
 		    ZFS_SPACE_CHECK_DESTROY));
 	}
 
 	bplist_clear(&to_free);
 	bplist_destroy(&to_free);
 	dsl_deadlist_close(ll);
 	kmem_free(ll, sizeof (dsl_deadlist_t));
 }
 
 /*
  * Create the "z_livelist_destroy" zthr, which runs
  * spa_livelist_delete_cb() whenever spa_livelist_delete_cb_check() says
  * there is work.  The zthr must not already exist.
  */
 static void
 spa_start_livelist_destroy_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
 
 	spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy",
 	    spa_livelist_delete_cb_check, spa_livelist_delete_cb,
 	    spa, minclsyspri);
 }
 
 /*
  * Destination lists for livelist_track_new_cb().
  */
 typedef struct livelist_new_arg {
 	bplist_t *allocs;	/* receives block pointers that are ALLOCs */
 	bplist_t *frees;	/* receives block pointers that are FREEs */
 } livelist_new_arg_t;
 
 /*
  * bpobj iteration callback: sort a block pointer into the 'frees' or
  * 'allocs' bplist of the livelist_new_arg_t passed as 'arg'.  Each ALLOC
  * also bumps zfs_livelist_condense_new_alloc.  Always returns 0.
  */
 static int
 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(tx == NULL);
 	livelist_new_arg_t *lists = arg;
 
 	if (!bp_freed) {
 		bplist_append(lists->allocs, bp);
 		zfs_livelist_condense_new_alloc++;
 		return (0);
 	}
 
 	bplist_append(lists->frees, bp);
 	return (0);
 }
 
 /*
  * State handed from spa_livelist_condense_cb() (open context) to
  * spa_livelist_condense_sync() (syncing context).  The sync task frees it.
  */
 typedef struct livelist_condense_arg {
 	spa_t *spa;		/* pool being condensed */
 	bplist_t to_keep;	/* block pointers to re-insert as ALLOCs */
 	uint64_t first_size;	/* size of the first bpobj when processed */
 	uint64_t next_size;	/* size of the next bpobj when processed */
 } livelist_condense_arg_t;
 
 /*
  * Sync task scheduled by spa_livelist_condense_cb() to finish condensing
  * two adjacent livelist entries ('first' and 'next' in spa_to_condense)
  * into one.  'arg' is a livelist_condense_arg_t, which is freed here.
  *
  * Unless the condense was cancelled, block pointers appended to either
  * entry since open context processed them are collected, 'first' is
  * cleared, 'next' is removed, and the surviving ALLOCs and new FREEs are
  * re-inserted into the livelist.  In all cases the dataset's dbuf hold is
  * released and the spa_to_condense state is reset.
  */
 static void
 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
 {
 	livelist_condense_arg_t *lca = arg;
 	spa_t *spa = lca->spa;
 	bplist_t new_frees;
 	dsl_dataset_t *ds = spa->spa_to_condense.ds;
 
 	/* Have we been cancelled? */
 	if (spa->spa_to_condense.cancelled) {
 		zfs_livelist_condense_sync_cancel++;
 		goto out;
 	}
 
 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
 
 	/*
 	 * It's possible that the livelist was changed while the zthr was
 	 * running. Therefore, we need to check for new blkptrs in the two
 	 * entries being condensed and continue to track them in the livelist.
 	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
 	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
 	 * we need to sort them into two different bplists.
 	 */
 	uint64_t first_obj = first->dle_bpobj.bpo_object;
 	uint64_t next_obj = next->dle_bpobj.bpo_object;
 	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 
 	/* New ALLOCs join lca->to_keep; new FREEs go to a local list. */
 	bplist_create(&new_frees);
 	livelist_new_arg_t new_bps = {
 	    .allocs = &lca->to_keep,
 	    .frees = &new_frees,
 	};
 
 	/* Only iterate entries that grew past the size recorded earlier. */
 	if (cur_first_size > lca->first_size) {
 		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
 		    livelist_track_new_cb, &new_bps, lca->first_size));
 	}
 	if (cur_next_size > lca->next_size) {
 		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
 		    livelist_track_new_cb, &new_bps, lca->next_size));
 	}
 
 	dsl_deadlist_clear_entry(first, ll, tx);
 	ASSERT(bpobj_is_empty(&first->dle_bpobj));
 	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
 
 	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
 	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
 	bplist_destroy(&new_frees);
 
 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
 	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
 	    "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
 	    (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
 	    (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
 	    (u_longlong_t)cur_next_size,
 	    (u_longlong_t)first->dle_bpobj.bpo_object,
 	    (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
 out:
 	/*
 	 * Common cleanup, also reached when cancelled: drop the hold on the
 	 * dataset's dbuf (tagged with the spa), discard the argument, and
 	 * only then clear 'syncing'.
 	 */
 	dmu_buf_rele(ds->ds_dbuf, spa);
 	spa->spa_to_condense.ds = NULL;
 	bplist_clear(&lca->to_keep);
 	bplist_destroy(&lca->to_keep);
 	kmem_free(lca, sizeof (livelist_condense_arg_t));
 	spa->spa_to_condense.syncing = B_FALSE;
 }
 
 /*
  * zthr work function for livelist condensing.  'arg' is the spa_t.
  *
  * Processes the two entries recorded in spa_to_condense in open context,
  * then tries to assign a tx and schedule spa_livelist_condense_sync() to
  * finish the job.  On success, ownership of the livelist_condense_arg_t
  * passes to the sync task; on any failure it is freed here and the
  * dataset's dbuf hold is released.
  */
 static void
 spa_livelist_condense_cb(void *arg, zthr_t *t)
 {
 	/*
 	 * Pause hook: spin while zfs_livelist_condense_zthr_pause is set,
 	 * unless the zthr has waiters or was cancelled.
 	 */
 	while (zfs_livelist_condense_zthr_pause &&
 	    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 		delay(1);
 
 	spa_t *spa = arg;
 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 	uint64_t first_size, next_size;
 
 	livelist_condense_arg_t *lca =
 	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
 	bplist_create(&lca->to_keep);
 
 	/*
 	 * Process the livelists (matching FREEs and ALLOCs) in open context
 	 * so we have minimal work in syncing context to condense.
 	 *
 	 * We save bpobj sizes (first_size and next_size) to use later in
 	 * syncing context to determine if entries were added to these sublists
 	 * while in open context. This is possible because the clone is still
 	 * active and open for normal writes and we want to make sure the new,
 	 * unprocessed blockpointers are inserted into the livelist normally.
 	 *
 	 * Note that dsl_process_sub_livelist() both stores the size number of
 	 * blockpointers and iterates over them while the bpobj's lock is held,
 	 * so the sizes returned to us are consistent with what was actually
 	 * processed.
 	 */
 	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
 	    &first_size);
 	if (err == 0)
 		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
 		    t, &next_size);
 
 	if (err == 0) {
 		/* Second pause hook, just before the sync task is queued. */
 		while (zfs_livelist_condense_sync_pause &&
 		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 			delay(1);
 
 		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 		dmu_tx_mark_netfree(tx);
 		dmu_tx_hold_space(tx, 1);
 		err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
 		if (err == 0) {
 			/*
 			 * Prevent the condense zthr restarting before
 			 * the synctask completes.
 			 */
 			spa->spa_to_condense.syncing = B_TRUE;
 			lca->spa = spa;
 			lca->first_size = first_size;
 			lca->next_size = next_size;
 			dsl_sync_task_nowait(spa_get_dsl(spa),
 			    spa_livelist_condense_sync, lca, tx);
 			dmu_tx_commit(tx);
 			/* lca now belongs to the sync task. */
 			return;
 		}
 	}
 	/*
 	 * Condensing can not continue: either it was externally stopped or
 	 * we were unable to assign to a tx because the pool has run out of
 	 * space. In the second case, we'll just end up trying to condense
 	 * again in a later txg.
 	 */
 	ASSERT(err != 0);
 	bplist_clear(&lca->to_keep);
 	bplist_destroy(&lca->to_keep);
 	kmem_free(lca, sizeof (livelist_condense_arg_t));
 	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
 	spa->spa_to_condense.ds = NULL;
 	if (err == EINTR)
 		zfs_livelist_condense_zthr_cancel++;
 }
 
 /*
  * zthr check function for the livelist condense thread.  Reports B_TRUE
  * only when a dataset is queued for condensing, no condense synctask is
  * outstanding, and condensing has not been cancelled.
  */
 static boolean_t
 spa_livelist_condense_cb_check(void *arg, zthr_t *z)
 {
 	(void) z;
 	spa_t *spa = arg;
 
 	/* Nothing queued. */
 	if (spa->spa_to_condense.ds == NULL)
 		return (B_FALSE);
 
 	/* A previous condense is still being synced out. */
 	if (spa->spa_to_condense.syncing != B_FALSE)
 		return (B_FALSE);
 
 	/* Condensing was cancelled. */
 	if (spa->spa_to_condense.cancelled != B_FALSE)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Reset the spa_to_condense bookkeeping and create the
  * "z_livelist_condense" zthr.  The zthr must not exist yet.
  */
 static void
 spa_start_livelist_condensing_thread(spa_t *spa)
 {
 	/* Nothing is queued for condensing. */
 	spa->spa_to_condense.ds = NULL;
 	spa->spa_to_condense.first = NULL;
 	spa->spa_to_condense.next = NULL;
 
 	/* No synctask outstanding and no cancellation requested. */
 	spa->spa_to_condense.syncing = B_FALSE;
 	spa->spa_to_condense.cancelled = B_FALSE;
 
 	ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
 
 	zthr_t *condense_zthr = zthr_create("z_livelist_condense",
 	    spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa,
 	    minclsyspri);
 	spa->spa_livelist_condense_zthr = condense_zthr;
 }
 
 /*
  * Start the pool's auxiliary threads.  Only valid on a writeable pool.
  * The start helpers are invoked first, then the "z_checkpoint_discard"
  * zthr is created here directly.
  */
 static void
 spa_spawn_aux_threads(spa_t *spa)
 {
 	ASSERT(spa_writeable(spa));
 
 	/* Threads that have a dedicated start helper. */
 	spa_start_raidz_expansion_thread(spa);
 	spa_start_indirect_condensing_thread(spa);
 	spa_start_livelist_destroy_thread(spa);
 	spa_start_livelist_condensing_thread(spa);
 
 	/* The checkpoint discard zthr must not exist yet. */
 	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
 
 	zthr_t *discard_zthr = zthr_create("z_checkpoint_discard",
 	    spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread,
 	    spa, minclsyspri);
 	spa->spa_checkpoint_discard_zthr = discard_zthr;
 }
 
 /*
  * Fix up config after a partly-completed split.  The state of the split is
  * carried in the ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the
  * split-off pool have that entry in their config, but only the splitting
  * one contains the list of guids of the vdevs being split off.
  *
  * Given that list there are two outcomes: rejoin all the disks to the
  * pool, or finish the split.  To attempt the rejoin, every listed disk is
  * marked online again and the vdev tree is reopened.  If afterwards every
  * such disk reports VDEV_AUX_SPLIT_POOL (it was successfully split off),
  * vdev_split() is called on each of them and the split is completed.
  *
  * Otherwise the config is left alone, with all the vdevs in place in the
  * original pool.  If the split entry or its guid list is absent, nothing
  * is done at all.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *split_nvl;
 	uint64_t *guids;
 	uint_t nguids;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &split_nvl) != 0)
 		return;
 
 	/* without the guid list the split config is incomplete */
 	if (nvlist_lookup_uint64_array(split_nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    &guids, &nguids) != 0)
 		return;
 
 	size_t vds_size = nguids * sizeof (vdev_t *);
 	vdev_t **vds = kmem_zalloc(vds_size, KM_SLEEP);
 
 	/*
 	 * Resolve each guid to its vdev and bring it back online.  Holes
 	 * (guid 0) keep their zeroed slot.  A single failed lookup means the
 	 * reopen is pointless and we go straight to splitting.
 	 */
 	boolean_t all_found = B_TRUE;
 	for (uint_t g = 0; g < nguids; g++) {
 		if (guids[g] == 0)
 			continue;
 
 		vdev_t *found = spa_lookup_by_guid(spa, guids[g], B_FALSE);
 		vds[g] = found;
 		if (found != NULL)
 			found->vdev_offline = B_FALSE;
 		else
 			all_found = B_FALSE;
 	}
 
 	/*
 	 * Split for good when we never looked at the disks, or when every
 	 * one of them turned out to have moved to the new pool.
 	 */
 	boolean_t do_split;
 	if (all_found) {
 		vdev_reopen(spa->spa_root_vdev);
 
 		/* advance past holes and disks that were split off */
 		uint_t split_off = 0;
 		while (split_off < nguids && (vds[split_off] == NULL ||
 		    vds[split_off]->vdev_stat.vs_aux == VDEV_AUX_SPLIT_POOL))
 			split_off++;
 
 		do_split = (split_off == nguids) ? B_TRUE : B_FALSE;
 	} else {
 		do_split = B_TRUE;
 	}
 
 	if (do_split) {
 		for (uint_t g = 0; g < nguids; g++) {
 			if (vds[g] != NULL)
 				vdev_split(vds[g]);
 		}
 		vdev_reopen(spa->spa_root_vdev);
 	}
 
 	kmem_free(vds, vds_size);
 }
 
 /*
  * Run spa_load_impl() for the pool in the given load state and record the
  * outcome: import progress is updated before and after, spa_minref is
  * re-sampled, and on failure an ereport is posted (except for EBADF) and
  * the load timestamp is cleared (except for EEXIST).
  *
  * Returns the error from spa_load_impl(), 0 on success.
  */
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
 {
 	const char *ereport = FM_EREPORT_ZFS_POOL;
 
 	spa->spa_load_state = state;
 	(void) spa_import_progress_set_state(spa_guid(spa),
 	    spa_load_state(spa));
 	spa_import_progress_set_notes(spa, "spa_load()");
 
 	gethrestime(&spa->spa_loaded_ts);
 	int error = spa_load_impl(spa, type, &ereport);
 
 	/*
 	 * Objsets that are already closed but still draining through
 	 * eviction must not be counted in the reference baseline.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 
 	if (error == 0) {
 		spa->spa_load_state = SPA_LOAD_NONE;
 	} else {
 		if (error != EEXIST) {
 			spa->spa_loaded_ts.tv_sec = 0;
 			spa->spa_loaded_ts.tv_nsec = 0;
 		}
 		if (error != EBADF) {
 			(void) zfs_ereport_post(ereport, spa,
 			    NULL, NULL, NULL, 0);
 		}
 		spa->spa_load_state = SPA_LOAD_ERROR;
 	}
 	spa->spa_ena = 0;
 
 	(void) spa_import_progress_set_state(spa_guid(spa),
 	    spa_load_state(spa));
 
 	return (error);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Walk the vdev tree rooted at vd and return how many per-vdev ZAPs it
  * references.  Each ZAP found is asserted to be present in the spa's
  * per-vdev ZAP list.  The root ZAP is only considered when the AVZ_V2
  * feature is active.
  */
 static uint64_t
 vdev_count_verify_zaps(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t count = 0;
 
 	/* Candidate ZAP objects of this vdev; 0 means "not present". */
 	const uint64_t zaps[] = {
 		spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2) ?
 		    vd->vdev_root_zap : 0,
 		vd->vdev_top_zap,
 		vd->vdev_leaf_zap,
 	};
 
 	for (size_t z = 0; z < sizeof (zaps) / sizeof (zaps[0]); z++) {
 		if (zaps[z] == 0)
 			continue;
 		count++;
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, zaps[z]));
 	}
 
 	/* Add the ZAPs of every subtree. */
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		count += vdev_count_verify_zaps(vd->vdev_child[c]);
 
 	return (count);
 }
 #else
 #define	vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
 #endif
 
 /*
  * Decide whether the MMP activity check has to be run for this import.
  *
  *	ub	the uberblock just selected
  *	label	label config the hostid is read from
  *	config	pool config; may carry the results of an earlier tryimport
  *
  * Returns B_FALSE as soon as any reason to skip the check is found,
  * B_TRUE otherwise.
  */
 static boolean_t
 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
     nvlist_t *config)
 {
 	uint64_t pool_state = 0;
 	uint64_t label_hostid = 0;
 	uint64_t try_txg = 0;
 	uint64_t try_timestamp = 0;
 	uint16_t try_mmp_seq = 0;
 
 	/* Pick up whatever an earlier tryimport left in the config. */
 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 		nvlist_t *load_info =
 		    fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
 
 		(void) nvlist_lookup_uint64(load_info, ZPOOL_CONFIG_MMP_TXG,
 		    &try_txg);
 		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    &try_timestamp);
 		(void) nvlist_lookup_uint16(load_info, ZPOOL_CONFIG_MMP_SEQ,
 		    &try_mmp_seq);
 	}
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &pool_state);
 
 	/*
 	 * The caller asked for no MMP check at all.  zdb does this since it
 	 * is meant to be usable on potentially active pools.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
 		return (B_FALSE);
 
 	/* The MMP feature is disabled on this pool. */
 	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
 		return (B_FALSE);
 
 	/*
 	 * Nonzero tryimport values that all still match the uberblock we
 	 * just found mean the pool has not changed since it was checked, so
 	 * there is no point in testing a second time.
 	 */
 	uint64_t ub_mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
 
 	if (try_txg != 0 && try_timestamp != 0 && try_mmp_seq != 0 &&
 	    try_txg == ub->ub_txg &&
 	    try_timestamp == ub->ub_timestamp &&
 	    try_mmp_seq == ub_mmp_seq)
 		return (B_FALSE);
 
 	/*
 	 * Importing on the host that last imported the pool needs no check.
 	 * The hostid in the configuration may be stale, so the one from the
 	 * label is used.
 	 */
 	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) {
 		label_hostid =
 		    fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
 	}
 	if (label_hostid == spa_get_hostid(spa))
 		return (B_FALSE);
 
 	/* Only a pool that was not cleanly exported needs the test. */
 	return (pool_state == POOL_STATE_ACTIVE ? B_TRUE : B_FALSE);
 }
 
 /*
  * Compute how long, in nanoseconds, the activity check has to watch the
  * on-disk uberblocks for changes.
  */
 static uint64_t
 spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 {
 	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
 	uint64_t multihost_interval = MSEC2NSEC(
 	    MMP_INTERVAL_OK(zfs_multihost_interval));
 	uint64_t import_delay = MAX(NANOSEC, import_intervals *
 	    multihost_interval);
 
 	/*
 	 * The local tunables above give the minimum duration.  The one
 	 * exception is when the uberblock tells us when the remote host
 	 * will suspend the pool if its MMP writes do not land.
 	 *
 	 * See Big Theory comment at the top of mmp.c for the reasoning behind
 	 * these cases and times.
 	 */
 
 	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
 
 	boolean_t ub_has_fail_int =
 	    (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub)) ?
 	    B_TRUE : B_FALSE;
 
 	if (ub_has_fail_int && MMP_FAIL_INT(ub) > 0) {
 
 		/* MMP on remote host will suspend pool after failed writes */
 		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
 		    MMP_IMPORT_SAFETY_FACTOR / 100;
 
 		zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
 		    "mmp_fails=%llu ub_mmp mmp_interval=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)MMP_FAIL_INT(ub),
 		    (u_longlong_t)MMP_INTERVAL(ub),
 		    (u_longlong_t)import_intervals);
 
 	} else if (ub_has_fail_int && MMP_FAIL_INT(ub) == 0) {
 
 		/* MMP on remote host will never suspend pool */
 		uint64_t remote_delay = (MSEC2NSEC(MMP_INTERVAL(ub)) +
 		    ub->ub_mmp_delay) * import_intervals;
 		if (import_delay < remote_delay)
 			import_delay = remote_delay;
 
 		zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
 		    "mmp_interval=%llu ub_mmp_delay=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)MMP_INTERVAL(ub),
 		    (u_longlong_t)ub->ub_mmp_delay,
 		    (u_longlong_t)import_intervals);
 
 	} else if (MMP_VALID(ub)) {
 
 		/* zfs-0.7 compatibility case */
 		uint64_t compat_delay = (multihost_interval +
 		    ub->ub_mmp_delay) * import_intervals;
 		if (import_delay < compat_delay)
 			import_delay = compat_delay;
 
 		zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
 		    "import_intervals=%llu leaves=%u",
 		    (u_longlong_t)import_delay,
 		    (u_longlong_t)ub->ub_mmp_delay,
 		    (u_longlong_t)import_intervals,
 		    vdev_count_leaves(spa));
 
 	} else {
 
 		/* Using local tunings is the only reasonable option */
 		zfs_dbgmsg("pool last imported on non-MMP aware "
 		    "host using import_delay=%llu multihost_interval=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)multihost_interval,
 		    (u_longlong_t)import_intervals);
 	}
 
 	return (import_delay);
 }
 
 /*
  * Remote host activity check.
  *
  * Repeatedly reloads the best uberblock into *ub and compares its txg,
  * timestamp and MMP sequence against the values *ub held on entry.  Any
  * difference is treated as activity from another host.  Watching continues
  * until the duration from spa_activity_check_duration(), plus a random
  * 0-25%, has elapsed.
  *
  *	ub		uberblock to watch; overwritten by each reload
  *	config		pool config, consulted for earlier tryimport results
  *	importing	when B_TRUE, import progress notes are published
  *
  * error results:
  *          0 - no activity detected
  *  EREMOTEIO - remote activity detected
  *      EINTR - user canceled the operation
  *
  * On EREMOTEIO the MMP state (and the remote hostname/hostid when the label
  * has them) is stored in spa->spa_load_info and the value of
  * spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO) is returned.
  */
 static int
 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
     boolean_t importing)
 {
 	/* Snapshot of the uberblock as it was handed to us. */
 	uint64_t txg = ub->ub_txg;
 	uint64_t timestamp = ub->ub_timestamp;
 	uint64_t mmp_config = ub->ub_mmp_config;
 	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
 	uint64_t import_delay;
 	hrtime_t import_expire, now;
 	nvlist_t *mmp_label = NULL;
 	vdev_t *rvd = spa->spa_root_vdev;
 	kcondvar_t cv;
 	kmutex_t mtx;
 	int error = 0;
 
 	/*
 	 * The cv/mutex pair is private to this call; it is only used for
 	 * the timed, signal-interruptible wait between polls below.
 	 */
 	cv_init(&cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_enter(&mtx);
 
 	/*
 	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
 	 * during the earlier tryimport.  If the txg recorded there is 0 then
 	 * the pool is known to be active on another host.
 	 *
 	 * Otherwise, the pool might be in use on another host.  Check for
 	 * changes in the uberblocks on disk if necessary.
 	 */
 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
 		    ZPOOL_CONFIG_LOAD_INFO);
 
 		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
 		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
 			/* Load the label so the remote host can be reported. */
 			vdev_uberblock_load(rvd, ub, &mmp_label);
 			error = SET_ERROR(EREMOTEIO);
 			goto out;
 		}
 	}
 
 	import_delay = spa_activity_check_duration(spa, ub);
 
 	/* Add a small random factor in case of simultaneous imports (0-25%) */
 	import_delay += import_delay * random_in_range(250) / 1000;
 
 	import_expire = gethrtime() + import_delay;
 
 	if (importing) {
 		spa_import_progress_set_notes(spa, "Checking MMP activity, "
 		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
 	}
 
 	int iterations = 0;
 	while ((now = gethrtime()) < import_expire) {
 		/* Refresh the progress note on every 30th pass only. */
 		if (importing && iterations++ % 30 == 0) {
 			spa_import_progress_set_notes(spa, "Checking MMP "
 			    "activity, %llu ms remaining",
 			    (u_longlong_t)NSEC2MSEC(import_expire - now));
 		}
 
 		if (importing) {
 			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
 			    NSEC2SEC(import_expire - gethrtime()));
 		}
 
 		vdev_uberblock_load(rvd, ub, &mmp_label);
 
 		/* Any change relative to the entry snapshot is activity. */
 		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
 		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
 			zfs_dbgmsg("multihost activity detected "
 			    "txg %llu ub_txg  %llu "
 			    "timestamp %llu ub_timestamp  %llu "
 			    "mmp_config %#llx ub_mmp_config %#llx",
 			    (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
 			    (u_longlong_t)timestamp,
 			    (u_longlong_t)ub->ub_timestamp,
 			    (u_longlong_t)mmp_config,
 			    (u_longlong_t)ub->ub_mmp_config);
 
 			/* mmp_label is kept; it is consumed after "out". */
 			error = SET_ERROR(EREMOTEIO);
 			break;
 		}
 
 		/* No change: drop this pass's label before the next load. */
 		if (mmp_label) {
 			nvlist_free(mmp_label);
 			mmp_label = NULL;
 		}
 
 		/*
 		 * Wait up to hz ticks before polling again.  A return of -1
 		 * is taken to mean the timeout expired; anything else is
 		 * reported as an interruption.
 		 */
 		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
 		if (error != -1) {
 			error = SET_ERROR(EINTR);
 			break;
 		}
 		error = 0;
 	}
 
 out:
 	mutex_exit(&mtx);
 	mutex_destroy(&mtx);
 	cv_destroy(&cv);
 
 	/*
 	 * If the pool is determined to be active store the status in the
 	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
 	 * available from configuration read from disk store them as well.
 	 * This allows 'zpool import' to generate a more useful message.
 	 *
 	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
 	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
 	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
 	 */
 	if (error == EREMOTEIO) {
 		const char *hostname = "<unknown>";
 		uint64_t hostid = 0;
 
 		if (mmp_label) {
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
 				hostname = fnvlist_lookup_string(mmp_label,
 				    ZPOOL_CONFIG_HOSTNAME);
 				fnvlist_add_string(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
 			}
 
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
 				hostid = fnvlist_lookup_uint64(mmp_label,
 				    ZPOOL_CONFIG_HOSTID);
 				fnvlist_add_uint64(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
 			}
 		}
 
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
 		/* A txg of 0 is what marks the pool as known-active above. */
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_TXG, 0);
 
 		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
 	}
 
 	if (mmp_label)
 		nvlist_free(mmp_label);
 
 	return (error);
 }
 
 /*
  * Called from zfs_ioc_clear for a pool that was suspended
  * after failing mmp write checks.
  *
  * Returns B_TRUE when another host may have written to the pool (or when
  * that cannot be ruled out), B_FALSE when no remote activity was seen.
  */
 boolean_t
 spa_mmp_remote_host_activity(spa_t *spa)
 {
 	ASSERT(spa_multihost(spa) && spa_suspended(spa));
 
 	nvlist_t *disk_label;
 	uberblock_t disk_ub;
 	uberblock_t *core_ub = &spa->spa_uberblock;
 
 	/* Fetch the best uberblock currently on disk, with its label. */
 	vdev_uberblock_load(spa->spa_root_vdev, &disk_ub, &disk_label);
 	if (disk_label == NULL)
 		return (B_TRUE);
 
 	/* A hostid in the label that is not ours means a remote writer. */
 	boolean_t foreign_hostid = B_FALSE;
 	if (nvlist_exists(disk_label, ZPOOL_CONFIG_HOSTID) &&
 	    spa_get_hostid(spa) !=
 	    fnvlist_lookup_uint64(disk_label, ZPOOL_CONFIG_HOSTID))
 		foreign_hostid = B_TRUE;
 	nvlist_free(disk_label);
 	if (foreign_hostid)
 		return (B_TRUE);
 
 	/* Without valid MMP data and a nonzero fail interval, assume yes. */
 	if (!MMP_VALID(&disk_ub))
 		return (B_TRUE);
 	if (!MMP_FAIL_INT_VALID(&disk_ub))
 		return (B_TRUE);
 	if (MMP_FAIL_INT(&disk_ub) == 0)
 		return (B_TRUE);
 
 	/* The on-disk uberblock must be the one we have in core. */
 	if (disk_ub.ub_txg != core_ub->ub_txg ||
 	    disk_ub.ub_timestamp != core_ub->ub_timestamp) {
 		zfs_dbgmsg("txg mismatch detected during pool clear "
 		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
 		    (u_longlong_t)core_ub->ub_txg,
 		    (u_longlong_t)disk_ub.ub_txg,
 		    (u_longlong_t)core_ub->ub_timestamp,
 		    (u_longlong_t)disk_ub.ub_timestamp);
 		return (B_TRUE);
 	}
 
 	/* Finally, watch the disk for any remote writer. */
 	if (spa_activity_check(spa, core_ub, spa->spa_config, B_FALSE) != 0)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Check the hostid stored in the MOS config against the local hostid.
  *
  * Returns EBADF (after logging a warning) when both hostids are nonzero
  * and differ, 0 otherwise.  Root pools and configs without a hostid are
  * never rejected.
  */
 static int
 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
 {
 	uint64_t pool_hostid;
 
 	if (spa_is_root(spa))
 		return (0);
 
 	if (nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID,
 	    &pool_hostid) != 0)
 		return (0);
 
 	const char *pool_hostname = fnvlist_lookup_string(mos_config,
 	    ZPOOL_CONFIG_HOSTNAME);
 	uint64_t local_hostid = zone_get_hostid(NULL);
 
 	/* An unset hostid on either side cannot prove a mismatch. */
 	if (pool_hostid == 0 || local_hostid == 0 ||
 	    pool_hostid == local_hostid)
 		return (0);
 
 	cmn_err(CE_WARN, "pool '%s' could not be "
 	    "loaded as it was last accessed by "
 	    "another system (host: %s hostid: 0x%llx). "
 	    "See: https://openzfs.github.io/openzfs-docs/msg/"
 	    "ZFS-8000-EY",
 	    spa_name(spa), pool_hostname, (u_longlong_t)pool_hostid);
 	spa_load_failed(spa, "hostid verification failed: pool "
 	    "last accessed by host: %s (hostid: 0x%llx)",
 	    pool_hostname, (u_longlong_t)pool_hostid);
 
 	return (SET_ERROR(EBADF));
 }
 
 /*
  * Load step: pull the basic pool identity out of spa->spa_config and parse
  * its vdev tree.
  *
  * Reads the version and pool guid, rejects a guid that is already present
  * in the namespace on import, resets spa_load_info, copies the comment,
  * compatibility, config txg and split information, creates the per-CPU
  * root zios and finally parses ZPOOL_CONFIG_VDEV_TREE into a vdev tree.
  *
  * Returns 0 on success, EINVAL if the guid or vdev tree is missing from the
  * config, EEXIST if the pool is already open, or the error from
  * spa_config_parse().
  */
 static int
 spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
 {
 	int error = 0;
 	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
 	int parse;
 	vdev_t *rvd;
 	uint64_t pool_guid;
 	const char *comment;
 	const char *compatibility;
 
 	/*
 	 * Versioning wasn't explicitly added to the label until later, so if
 	 * it's not present treat it as the initial version.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If we are doing an import, ensure that the pool is not already
 	 * imported by checking if its pool guid already exists in the
 	 * spa namespace.
 	 *
 	 * The only case that we allow an already imported pool to be
 	 * imported again, is when the pool is checkpointed and we want to
 	 * look at its checkpointed state from userland tools like zdb.
 	 *
 	 * Note that the two preprocessor branches below each open the same
 	 * "if" block; only the userland build gets the checkpoint exception.
 	 */
 #ifdef _KERNEL
 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0)) {
 #else
 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0) &&
 	    !spa_importing_readonly_checkpoint(spa)) {
 #endif
 		spa_load_failed(spa, "a pool with guid %llu is already open",
 		    (u_longlong_t)pool_guid);
 		return (SET_ERROR(EEXIST));
 	}
 
 	spa->spa_config_guid = pool_guid;
 
 	/* Start this load with an empty load_info nvlist. */
 	nvlist_free(spa->spa_load_info);
 	spa->spa_load_info = fnvlist_alloc();
 
 	/* The optional strings below are duplicated into the spa. */
 	ASSERT(spa->spa_comment == NULL);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 		spa->spa_comment = spa_strdup(comment);
 
 	ASSERT(spa->spa_compatibility == NULL);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
 	    &compatibility) == 0)
 		spa->spa_compatibility = spa_strdup(compatibility);
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
 	/* Keep a private copy of any in-progress split description. */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
 		spa->spa_config_splitting = fnvlist_dup(nvl);
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
 		    ZPOOL_CONFIG_VDEV_TREE);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 * (one root zio per CPU, max_ncpus in total).
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/* Anything other than an existing-pool import parses as a split. */
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to parse config [error=%d]",
 		    error);
 		return (error);
 	}
 
 	ASSERT(spa->spa_root_vdev == rvd);
 	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}
 
 	return (0);
 }
 
 /*
  * Recursively open all vdevs in the vdev tree. This function is called twice:
  * first with the untrusted config, then with the trusted config.
  *
  * Returns the error from vdev_open(), or ENXIO when top-level vdevs are
  * missing while a trusted config is being opened for writing; 0 otherwise.
  */
 static int
 spa_ld_open_vdevs(spa_t *spa)
 {
 	int error = 0;
 
 	/*
 	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
 	 * missing/unopenable for the root vdev to be still considered openable.
 	 */
 	if (spa->spa_trust_config) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
 	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
 	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
 	} else {
 		spa->spa_missing_tvds_allowed = 0;
 	}
 
 	/* Never allow fewer missing vdevs than the global tunable permits. */
 	spa->spa_missing_tvds_allowed =
 	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (spa->spa_missing_tvds != 0) {
 		/*
 		 * The arguments are cast to u_longlong_t, so they are printed
 		 * with %llu to keep specifier and argument type in agreement.
 		 */
 		spa_load_note(spa, "vdev tree has %llu missing top-level "
 		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
 		if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
 			/*
 			 * Although theoretically we could allow users to open
 			 * incomplete pools in RW mode, we'd need to add a lot
 			 * of extra logic (e.g. adjust pool space to account
 			 * for missing vdevs).
 			 * This limitation also prevents users from accidentally
 			 * opening the pool in RW mode during data recovery and
 			 * damaging it further.
 			 */
 			spa_load_note(spa, "pools with missing top-level "
 			    "vdevs can only be opened in read-only mode.");
 			error = SET_ERROR(ENXIO);
 		} else {
 			spa_load_note(spa, "current settings allow for maximum "
 			    "%llu missing top-level vdevs at this stage.",
 			    (u_longlong_t)spa->spa_missing_tvds_allowed);
 		}
 	}
 	if (error != 0) {
 		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
 		    error);
 	}
 	/* Dump the tree whenever something is missing or the open failed. */
 	if (spa->spa_missing_tvds != 0 || error != 0)
 		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
 
 	return (error);
 }
 
 /*
  * Validate the vdev labels against the configuration we hold.  Like the
  * open step this runs twice: once with the untrusted config and once with
  * the trusted one, where validation is stricter.
  *
  * Returns the vdev_validate() error, ENXIO if the root vdev can no longer
  * be opened afterwards, or 0.
  */
 static int
 spa_ld_validate_vdevs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	int error = vdev_validate(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
 		return (error);
 	}
 
 	/* Validation succeeded and the root vdev is still usable. */
 	if (rvd->vdev_state > VDEV_STATE_CANT_OPEN)
 		return (0);
 
 	spa_load_failed(spa, "cannot open vdev tree after invalidating "
 	    "some vdevs");
 	vdev_dbgmsg_print_tree(rvd, 2);
 
 	return (SET_ERROR(ENXIO));
 }
 
 /*
  * Finish uberblock selection: mark the pool active and derive the txg
  * bookkeeping fields from the chosen uberblock.
  */
 static void
 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
 {
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	/* An extreme rewind verifies from the very first txg. */
 	if (spa->spa_extreme_rewind) {
 		spa->spa_verify_min_txg = TXG_INITIAL - 1;
 	} else {
 		spa->spa_verify_min_txg =
 		    spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	}
 
 	/* Prefer a recorded last-ubsync txg over last synced + 1. */
 	if (spa->spa_last_ubsync_txg) {
 		spa->spa_first_txg = spa->spa_last_ubsync_txg;
 	} else {
 		spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
 	}
 
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 }
 
 /*
  * Load step: pick the uberblock to load the pool from and vet it.
  *
  * Loads the best uberblock and its label, runs the MMP activity check when
  * required, rejects unsupported versions and unsupported read features,
  * attempts to repair a partly-completed split, and finally initializes the
  * txg bookkeeping via spa_ld_select_uberblock_done().
  *
  * The label is owned by this function and is freed on the exit paths that
  * hold one.
  *
  * Returns 0 on success; otherwise the value of spa_vdev_err() for the
  * failing check, or the error from spa_activity_check().
  */
 static int
 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *label;
 	uberblock_t *ub = &spa->spa_uberblock;
 	boolean_t activity_check = B_FALSE;
 
 	/*
 	 * If we are opening the checkpointed state of the pool by
 	 * rewinding to it, at this point we will have written the
 	 * checkpointed uberblock to the vdev labels, so searching
 	 * the labels will find the right uberblock.  However, if
 	 * we are opening the checkpointed state read-only, we have
 	 * not modified the labels. Therefore, we must ignore the
 	 * labels and continue using the spa_uberblock that was set
 	 * by spa_ld_checkpoint_rewind.
 	 *
 	 * Note that it would be fine to ignore the labels when
 	 * rewinding (opening writeable) as well. However, if we
 	 * crash just after writing the labels, we will end up
 	 * searching the labels. Doing so in the common case means
 	 * that this code path gets exercised normally, rather than
 	 * just in the edge case.
 	 */
 	if (ub->ub_checkpoint_txg != 0 &&
 	    spa_importing_readonly_checkpoint(spa)) {
 		spa_ld_select_uberblock_done(spa, ub);
 		return (0);
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		spa_load_failed(spa, "no valid uberblock found");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}
 
 	/* UINT64_MAX is treated as "no max txg requested". */
 	if (spa->spa_load_max_txg != UINT64_MAX) {
 		(void) spa_import_progress_set_max_txg(spa_guid(spa),
 		    (u_longlong_t)spa->spa_load_max_txg);
 	}
 	spa_load_note(spa, "using uberblock with txg=%llu",
 	    (u_longlong_t)ub->ub_txg);
 	if (ub->ub_raidz_reflow_info != 0) {
 		spa_load_note(spa, "uberblock raidz_reflow_info: "
 		    "state=%u offset=%llu",
 		    (int)RRSS_GET_STATE(ub),
 		    (u_longlong_t)RRSS_GET_OFFSET(ub));
 	}
 
 
 	/*
 	 * For pools which have the multihost property on determine if the
 	 * pool is truly inactive and can be safely imported.  Prevent
 	 * hosts which don't have a hostid set from importing the pool.
 	 */
 	activity_check = spa_activity_check_required(spa, ub, label,
 	    spa->spa_config);
 	if (activity_check) {
 		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
 		    spa_get_hostid(spa) == 0) {
 			nvlist_free(label);
 			fnvlist_add_uint64(spa->spa_load_info,
 			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 		}
 
 		int error =
 		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
 		if (error) {
 			nvlist_free(label);
 			return (error);
 		}
 
 		/*
 		 * Record the result of the check in load_info: the state
 		 * plus the txg and MMP sequence it was performed against.
 		 */
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
 		fnvlist_add_uint16(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_SEQ,
 		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
 	}
 
 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		spa_load_failed(spa, "version %llu is not supported",
 		    (u_longlong_t)ub->ub_version);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}
 
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;
 
 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL) {
 			spa_load_failed(spa, "label config unavailable");
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
 		    &features) != 0) {
 			nvlist_free(label);
 			spa_load_failed(spa, "invalid label: '%s' missing",
 			    ZPOOL_CONFIG_FEATURES_FOR_READ);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		spa->spa_label_features = fnvlist_dup(features);
 	}
 
 	/* The label is not used past this point. */
 	nvlist_free(label);
 
 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;
 
 		unsup_feat = fnvlist_alloc();
 
 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				fnvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "");
 			}
 		}
 
 		if (!nvlist_empty(unsup_feat)) {
 			/* Report the offending features through load_info. */
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 			nvlist_free(unsup_feat);
 			spa_load_failed(spa, "some features are unsupported");
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		nvlist_free(unsup_feat);
 	}
 
 	/*
 	 * A pending split recorded in the config is resolved now (except
 	 * when assembling), after which the saved split nvlist is dropped.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, spa->spa_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}
 
 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa_ld_select_uberblock_done(spa, ub);
 
 	return (0);
 }
 
 /*
  * Initialize the DSL pool for spa_first_txg and, on success, cache the
  * pool's meta objset in spa_meta_objset.
  *
  * Returns 0 on success. If dsl_pool_init() fails, the failure is logged and
  * the result of spa_vdev_err() (called with VDEV_AUX_CORRUPT_DATA and EIO)
  * is returned.
  */
 static int
 spa_ld_open_rootbp(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	err = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (err == 0) {
 		spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 		return (0);
 	}
 
 	spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
 	    "[error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Replace the vdev tree that was built from the provided (untrusted) config
  * with one built from the config stored in the MOS, regenerate spa_config
  * from it, then reopen and validate the new tree.
  *
  * For SPA_IMPORT_ASSEMBLE only the MOS config object is looked up; the
  * function then returns 0 without touching the vdev tree.
  *
  * 'reloading' tells the function whether this is already the second pass:
  * when the provided config is found to miss too many top-level vdevs, the
  * first pass (B_FALSE) returns EAGAIN, while the second pass (B_TRUE)
  * reports a failure instead.
  *
  * Returns 0 on success, EAGAIN as described above, or the value produced by
  * spa_vdev_err() / the failing helper otherwise.
  */
 static int
 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t reloading)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv, *mos_config, *policy;
 	int error = 0, copy_error;
 	uint64_t healthy_tvds, healthy_tvds_mos;
 	uint64_t mos_config_txg;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
 	    != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * If we're assembling a pool from a split, the config provided is
 	 * already trusted so there is nothing to do.
 	 */
 	if (type == SPA_IMPORT_ASSEMBLE)
 		return (0);
 
 	/*
 	 * Remember the count for the tree built from the provided config; it
 	 * is compared against the count for the MOS tree further below.
 	 */
 	healthy_tvds = spa_healthy_core_tvds(spa);
 
 	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
 	    != 0) {
 		spa_load_failed(spa, "unable to retrieve MOS config");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * If we are doing an open, pool owner wasn't verified yet, thus do
 	 * the verification here.
 	 */
 	if (spa->spa_load_state == SPA_LOAD_OPEN) {
 		error = spa_verify_host(spa, mos_config);
 		if (error != 0) {
 			nvlist_free(mos_config);
 			return (error);
 		}
 	}
 
 	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Build a new vdev tree from the trusted config
 	 */
 	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
 	if (error != 0) {
 		nvlist_free(mos_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
 		    error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 	}
 
 	/*
 	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
 	 * obtained by scanning /dev/dsk, then it will have the right vdev
 	 * paths. We update the trusted MOS config with this information.
 	 * We first try to copy the paths with vdev_copy_path_strict, which
 	 * succeeds only when both configs have exactly the same vdev tree.
 	 * If that fails, we fall back to a more flexible method that has a
 	 * best effort policy.
 	 */
 	copy_error = vdev_copy_path_strict(rvd, mrvd);
 	if (copy_error != 0 || spa_load_print_vdev_tree) {
 		spa_load_note(spa, "provided vdev tree:");
 		vdev_dbgmsg_print_tree(rvd, 2);
 		spa_load_note(spa, "MOS vdev tree:");
 		vdev_dbgmsg_print_tree(mrvd, 2);
 	}
 	if (copy_error != 0) {
 		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
 		    "back to vdev_copy_path_relaxed");
 		vdev_copy_path_relaxed(rvd, mrvd);
 	}
 
 	/*
 	 * Swap in the MOS-derived tree: the tree built from the provided
 	 * config is closed and freed, and from here on rvd refers to the new
 	 * root. This is done while SCL_ALL is still held as writer.
 	 */
 	vdev_close(rvd);
 	vdev_free(rvd);
 	spa->spa_root_vdev = mrvd;
 	rvd = mrvd;
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * If 'zpool import' used a cached config, then the on-disk hostid and
 	 * hostname may be different to the cached config in ways that should
 	 * prevent import.  Userspace can't discover this without a scan, but
 	 * we know, so we add these values to LOAD_INFO so the caller can know
 	 * the difference.
 	 *
 	 * Note that we have to do this before the config is regenerated,
 	 * because the new config will have the hostid and hostname for this
 	 * host, in readiness for import.
 	 */
 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
 		    fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
 		fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
 		    fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
 
 	/*
 	 * We will use spa_config if we decide to reload the spa or if spa_load
 	 * fails and we rewind. We must thus regenerate the config using the
 	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
 	 * pass settings on how to load the pool and is not stored in the MOS.
 	 * We copy it over to our new, trusted config.
 	 */
 	mos_config_txg = fnvlist_lookup_uint64(mos_config,
 	    ZPOOL_CONFIG_POOL_TXG);
 	/*
 	 * The nvlist read from the MOS is no longer needed; mos_config is
 	 * reused below for the freshly generated config, which is handed to
 	 * spa_config_set() and not freed in this function.
 	 */
 	nvlist_free(mos_config);
 	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
 	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
 	    &policy) == 0)
 		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
 	spa_config_set(spa, mos_config);
 	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
 
 	/*
 	 * Now that we got the config from the MOS, we should be more strict
 	 * in checking blkptrs and can make assumptions about the consistency
 	 * of the vdev tree. spa_trust_config must be set to true before opening
 	 * vdevs in order for them to be writeable.
 	 */
 	spa->spa_trust_config = B_TRUE;
 
 	/*
 	 * Open and validate the new vdev tree
 	 */
 	error = spa_ld_open_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	error = spa_ld_validate_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	if (copy_error != 0 || spa_load_print_vdev_tree) {
 		spa_load_note(spa, "final vdev tree:");
 		vdev_dbgmsg_print_tree(rvd, 2);
 	}
 
 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
 	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
 		/*
 		 * Sanity check to make sure that we are indeed loading the
 		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
 		 * in the config provided and they happened to be the only ones
 		 * to have the latest uberblock, we could involuntarily perform
 		 * an extreme rewind.
 		 */
 		healthy_tvds_mos = spa_healthy_core_tvds(spa);
 		/*
 		 * NOTE(review): both counters are uint64_t, so this
 		 * subtraction wraps to a very large value if the MOS count
 		 * were ever smaller than the provided-config count, which
 		 * would also satisfy the test. Confirm whether that ordering
 		 * can occur.
 		 */
 		if (healthy_tvds_mos - healthy_tvds >=
 		    SPA_SYNC_MIN_VDEVS) {
 			spa_load_note(spa, "config provided misses too many "
 			    "top-level vdevs compared to MOS (%lld vs %lld). ",
 			    (u_longlong_t)healthy_tvds,
 			    (u_longlong_t)healthy_tvds_mos);
 			spa_load_note(spa, "vdev tree:");
 			vdev_dbgmsg_print_tree(rvd, 2);
 			if (reloading) {
 				spa_load_failed(spa, "config was already "
 				    "provided from MOS. Aborting.");
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 			spa_load_note(spa, "spa must be reloaded using MOS "
 			    "config");
 			return (SET_ERROR(EAGAIN));
 		}
 	}
 
 	/*
 	 * Any failure from the missing-logs check is reported as a bad guid
 	 * sum with ENXIO; the helper's own error value is not propagated.
 	 */
 	error = spa_check_for_missing_logs(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
 	/* The new tree's guid sum must match the selected uberblock's. */
 	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
 		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
 		    "guid sum (%llu != %llu)",
 		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
 		    (u_longlong_t)rvd->vdev_guid_sum);
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 		    ENXIO));
 	}
 
 	return (0);
 }
 
 /*
  * Run spa_remove_init() followed by spa_condense_init().
  *
  * Returns 0 when both succeed. A failure of either step is logged and
  * reported through spa_vdev_err() with VDEV_AUX_CORRUPT_DATA; the first
  * step reports EIO, the second reports the error it returned.
  */
 static int
 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	/*
 	 * Anything read before spa_remove_init() has to live on concrete
 	 * vdevs, which is why this step runs as early as possible.
 	 */
 	err = spa_remove_init(spa);
 	if (err != 0) {
 		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
 		    err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/* Fetch what is needed to condense indirect vdev mappings. */
 	err = spa_condense_init(spa);
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "spa_condense_init failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 }
 
 /*
  * Check the pool's feature flags against what this code supports and load
  * feature bookkeeping into the spa.
  *
  * For pools at or above SPA_VERSION_FEATURES this resolves the
  * features-for-read, features-for-write and feature-descriptions objects via
  * spa_dir_prop(), runs spa_features_check(), publishes the enabled and
  * unsupported feature lists in spa_load_info, and fills
  * spa_feat_refcount_cache. Regardless of version it then resolves the
  * enabled-txg object when SPA_FEATURE_ENABLED_TXG is active and sets the
  * encryption errata when encryption is enabled without bookmark_v2.
  *
  * missing_feat_writep is only ever set to B_TRUE here (when the
  * spa_features_check() call made with B_TRUE fails); it is never cleared,
  * so the caller must initialize it.
  *
  * Returns 0 on success, otherwise the value produced by spa_vdev_err().
  */
 static int
 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		/*
 		 * Both lists are temporary: they are copied into
 		 * spa_load_info below and then freed.
 		 */
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
 		/* The B_FALSE call feeds the "missing for read" result. */
 		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		/*
 		 * The B_TRUE call feeds *missing_feat_writep. It is only made
 		 * when the pool is writeable or this is a tryimport.
 		 */
 		if (spa_writeable(spa) ||
 		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
 			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				*missing_feat_writep = B_TRUE;
 			}
 		}
 
 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}
 
 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);
 
 		/* No feature is missing for read: advertise read-only use. */
 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}
 
 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (*missing_feat_writep &&
 		    spa_writeable(spa))) {
 			spa_load_failed(spa, "pool uses unsupported features");
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		/*
 		 * Load refcounts for ZFS features from disk into an in-memory
 		 * cache during SPA initialization.
 		 */
 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			uint64_t refcount;
 
 			error = feature_get_refcount_from_disk(spa,
 			    &spa_feature_table[i], &refcount);
 			if (error == 0) {
 				spa->spa_feat_refcount_cache[i] = refcount;
 			} else if (error == ENOTSUP) {
 				/* ENOTSUP is recorded as "disabled". */
 				spa->spa_feat_refcount_cache[i] =
 				    SPA_FEATURE_DISABLED;
 			} else {
 				spa_load_failed(spa, "error getting refcount "
 				    "for feature %s [error=%d]",
 				    spa_feature_table[i].fi_guid, error);
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 		}
 	}
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Encryption was added before bookmark_v2, even though bookmark_v2
 	 * is now a dependency. If this pool has encryption enabled without
 	 * bookmark_v2, trigger an errata message.
 	 */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
 		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
 	}
 
 	return (0);
 }
 
 /*
  * Open the DSL pool. spa_is_initializing is raised for exactly the duration
  * of the dsl_pool_open() call.
  *
  * Returns 0 on success; otherwise logs the failure and returns the result
  * of spa_vdev_err() called with VDEV_AUX_CORRUPT_DATA and EIO.
  */
 static int
 spa_ld_load_special_directories(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	spa->spa_is_initializing = B_TRUE;
 	err = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "dsl_pool_open failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Load pool-wide state from the MOS into the spa: the checksum salt, the
  * deferred-frees bpobj, a series of optional pool directory entries
  * (deflate flag, creation version, error logs, last scrubbed txg, deleted
  * clones, history), the per-vdev ZAP map state, and the pool properties.
  *
  * Lookups made with B_FALSE tolerate ENOENT here; any other lookup failure
  * is reported through spa_vdev_err() with VDEV_AUX_CORRUPT_DATA and EIO.
  *
  * Returns 0 on success, otherwise the value produced by spa_vdev_err().
  */
 static int
 spa_ld_get_props(spa_t *spa)
 {
 	int error = 0;
 	uint64_t obj;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/* Grab the checksum salt from the MOS. */
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT, 1,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes),
 	    spa->spa_cksum_salt.zcs_bytes);
 	if (error == ENOENT) {
 		/* Generate a new salt for subsequent use */
 		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes));
 	} else if (error != 0) {
 		spa_load_failed(spa, "unable to retrieve checksum salt from "
 		    "MOS [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/* The deferred-frees bpobj is mandatory: look it up, then open it. */
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0) {
 		spa_load_failed(spa, "error opening deferred-frees bpobj "
 		    "[error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/* Load the creation version; it may be absent. */
 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
 	    B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/* Load the last scrubbed txg. */
 	error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG,
 	    &spa->spa_scrubbed_last_txg, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the livelist deletion field. If a livelist is queued for
 	 * deletion, indicate that in the spa
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
 	    &spa->spa_livelists_to_delete, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the per-vdev ZAP map. If we have an older pool, this will not
 	 * be present; in this case, defer its creation to a later time to
 	 * avoid dirtying the MOS this early / out of sync context. See
 	 * spa_sync_config_object.
 	 */
 
 	/* The sentinel is only available in the MOS config. */
 	nvlist_t *mos_config;
 	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
 		spa_load_failed(spa, "unable to retrieve MOS config");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
 	    &spa->spa_all_vdev_zaps, B_FALSE);
 
 	/*
 	 * Three outcomes are handled explicitly: no map (initialize later),
 	 * lookup failure (bail out), and a map without the sentinel in the
 	 * MOS config (destroy later). A map with the sentinel present needs
 	 * no action here. mos_config is freed on every path.
 	 */
 	if (error == ENOENT) {
 		VERIFY(!nvlist_exists(mos_config,
 		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 	} else if (error != 0) {
 		nvlist_free(mos_config);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
 		/*
 		 * An older version of ZFS overwrote the sentinel value, so
 		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
 		 * destruction to later; see spa_sync_config_object.
 		 */
 		spa->spa_avz_action = AVZ_ACTION_DESTROY;
 		/*
 		 * We're assuming that no vdevs have had their ZAPs created
 		 * before this. Better be sure of it.
 		 */
 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 	}
 	nvlist_free(mos_config);
 
 	/*
 	 * Start from the default delegation value; it is overridden below
 	 * only when the pool props object exists.
 	 */
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
 	    B_FALSE);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (error == 0) {
 		/*
 		 * autoreplace is fetched into a local uint64_t and then
 		 * reduced to a truth value for spa_autoreplace.
 		 */
 		uint64_t autoreplace = 0;
 
 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
 		    &spa->spa_dedup_table_quota);
 		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
 	/*
 	 * If we are importing a pool with missing top-level vdevs,
 	 * we enforce that the pool doesn't panic or get suspended on
 	 * error since the likelihood of missing data is extremely high.
 	 */
 	if (spa->spa_missing_tvds > 0 &&
 	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
 	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		spa_load_note(spa, "forcing failmode to 'continue' "
 		    "as some top level vdevs are missing");
 		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
 	}
 
 	return (0);
 }
 
 /*
  * Look up the hot spare and level 2 ARC objects in the pool directory and,
  * unless the pool is being assembled from a split, load their nvlists and
  * attach the devices. When assembling from split-off vdevs the spares and
  * cache devices are not attached; instead the matching sav_sync flag is set.
  *
  * A missing directory entry (ENOENT) is not an error. Returns 0 on success,
  * otherwise the result of spa_vdev_err() called with VDEV_AUX_CORRUPT_DATA
  * and EIO.
  */
 static int
 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	/* Hot spares. */
 	err = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
 	    B_FALSE);
 	if (err == 0) {
 		if (type == SPA_IMPORT_ASSEMBLE) {
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
 			ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 			if (load_nvlist(spa, spa->spa_spares.sav_object,
 			    &spa->spa_spares.sav_config) != 0) {
 				spa_load_failed(spa,
 				    "error loading spares nvlist");
 				return (spa_vdev_err(root,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa_load_spares(spa);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	} else if (err != ENOENT) {
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/* Level 2 ARC devices. */
 	err = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object, B_FALSE);
 	if (err == 0) {
 		if (type == SPA_IMPORT_ASSEMBLE) {
 			spa->spa_l2cache.sav_sync = B_TRUE;
 		} else {
 			ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 			if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 			    &spa->spa_l2cache.sav_config) != 0) {
 				spa_load_failed(spa,
 				    "error loading l2cache nvlist");
 				return (spa_vdev_err(root,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa_load_l2cache(spa);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	} else if (err != ENOENT) {
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	return (0);
 }
 
 /*
  * Load per-vdev metadata for the pool: enforce the multihost/hostid rule,
  * post removal checks when autoreplace is on, run vdev_load() and
  * spa_ld_log_spacemaps(), then reassess DTLs from the root vdev.
  *
  * Returns 0 on success, otherwise the result of spa_vdev_err().
  */
 static int
 spa_ld_load_vdev_metadata(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	/*
 	 * With the 'multihost' property set, a pool must never be imported
 	 * while the system hostid is zero. zdb is the exception and is
 	 * always allowed to access pools.
 	 */
 	if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
 	    !(spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)) {
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 		return (spa_vdev_err(root, VDEV_AUX_ACTIVE, EREMOTEIO));
 	}
 
 	/*
 	 * With the 'autoreplace' property set, post a resource telling the
 	 * ZFS DE not to issue faults for unopenable devices, and walk the
 	 * vdevs posting a sysevent for each unopenable one so the normal
 	 * autoreplace handler can take over.
 	 */
 	if (spa->spa_autoreplace &&
 	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(root);
 
 		/*
 		 * On import this part is left to spa_import(): at this point
 		 * the spare definitions come from the MOS config, not
 		 * necessarily from the userland config.
 		 */
 		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}
 
 	/* Metaslabs, DTLs, spacemap objects and other vdev metadata. */
 	err = vdev_load(root);
 	if (err != 0) {
 		spa_load_failed(spa, "vdev_load failed [error=%d]", err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 	}
 
 	err = spa_ld_log_spacemaps(spa);
 	if (err != 0) {
 		spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
 		    err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 	}
 
 	/* Push the freshly loaded leaf DTLs up through the vdev tree. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(root, 0, 0, B_FALSE, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	return (0);
 }
 
 /*
  * Load the dedup tables via ddt_load().
  *
  * Returns 0 on success; otherwise logs the failure and returns the result
  * of spa_vdev_err() called with VDEV_AUX_CORRUPT_DATA and EIO.
  */
 static int
 spa_ld_load_dedup_tables(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err = ddt_load(spa);
 
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "ddt_load failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Load the BRT via brt_load().
  *
  * Returns 0 on success; otherwise logs the failure and returns the result
  * of spa_vdev_err() called with VDEV_AUX_CORRUPT_DATA and EIO.
  */
 static int
 spa_ld_load_brt(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err = brt_load(spa);
 
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "brt_load failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Run spa_check_logs() for writeable pools that are not being assembled
  * from a split.
  *
  * If the check reports a problem while top-level vdevs are missing, a note
  * is logged and the load continues. Otherwise *ereport is set to
  * FM_EREPORT_ZFS_LOG_REPLAY and the result of spa_vdev_err() (called with
  * VDEV_AUX_BAD_LOG and ENXIO) is returned. Returns 0 in all other cases.
  */
 static int
 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	if (type == SPA_IMPORT_ASSEMBLE || !spa_writeable(spa))
 		return (0);
 
 	if (!spa_check_logs(spa))
 		return (0);
 
 	if (spa->spa_missing_tvds != 0) {
 		spa_load_note(spa, "spa_check_logs failed "
 		    "so dropping the logs");
 		return (0);
 	}
 
 	*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 	spa_load_failed(spa, "spa_check_logs failed");
 	return (spa_vdev_err(root, VDEV_AUX_BAD_LOG, ENXIO));
 }
 
 /*
  * With the pool opened, confirm via spa_load_verify() that it is ready to
  * start pushing transactions. The check is skipped for SPA_LOAD_TRYIMPORT.
  *
  * Returns 0 on success or when skipped; otherwise logs the failure and
  * returns the result of spa_vdev_err() called with VDEV_AUX_CORRUPT_DATA
  * and the spa_load_verify() error.
  */
 static int
 spa_ld_verify_pool_data(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
 		return (0);
 
 	err = spa_load_verify(spa);
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "spa_load_verify failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 }
 
 /*
  * Claim log blocks that have not been committed yet, all within a single
  * txg, and then mark the log state as good.
  *
  * spa_claim_max_txg is updated by spa_claim_notify(), which is invoked from
  * zil_claim_log_block()'s i/o done callback. The price of a rollback is
  * that the log is abandoned.
  */
 static void
 spa_ld_claim_log_blocks(spa_t *spa)
 {
 	dsl_pool_t *pool = spa_get_dsl(spa);
 	dmu_tx_t *claim_tx;
 
 	/* spa_claiming stays raised for the whole claim pass. */
 	spa->spa_claiming = B_TRUE;
 
 	claim_tx = dmu_tx_create_assigned(pool, spa_first_txg(spa));
 	(void) dmu_objset_find_dp(pool, pool->dp_root_dir_obj, zil_claim,
 	    claim_tx, DS_FIND_CHILDREN);
 	dmu_tx_commit(claim_tx);
 
 	spa->spa_claiming = B_FALSE;
 
 	spa_set_log_state(spa, SPA_LOG_GOOD);
 }
 
 /*
  * Request an asynchronous config update when one is needed.
  *
  * An update is needed when the caller asks for it, when the config cache is
  * stale (config_cache_txg differs from spa_config_txg), when the load state
  * is SPA_LOAD_IMPORT or SPA_LOAD_RECOVER, when this is a verbatim import
  * (the in-core spa_config is trusted and the disk labels are updated), or
  * when any top-level vdev has uninitialized metaslabs (see spa_vdev_add()).
  */
 static void
 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
     boolean_t update_config_cache)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t stale;
 
 	stale = (update_config_cache ||
 	    config_cache_txg != spa->spa_config_txg ||
 	    spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM));
 
 	/* The scan only reads, so it can stop once an update is needed. */
 	for (int c = 0; !stale && c < root->vdev_children; c++) {
 		if (root->vdev_child[c]->vdev_ms_array == 0)
 			stale = B_TRUE;
 	}
 
 	if (!stale)
 		return;
 
 	/*
 	 * Go through the async path in case we're the root pool, in which
 	 * case the config cache isn't writable yet.
 	 */
 	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 /*
  * Tear the spa down and bring it back up in the same mode so that the load
  * can be redone.
  *
  * spa_unload() resets spa_async_suspended to 0, so the value is captured
  * beforehand and restored afterwards; a later spa_async_resume() call may
  * depend on it.
  */
 static void
 spa_ld_prepare_for_reload(spa_t *spa)
 {
 	int saved_async_suspended = spa->spa_async_suspended;
 	spa_mode_t saved_mode = spa->spa_mode;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_activate(spa, saved_mode);
 
 	spa->spa_async_suspended = saved_async_suspended;
 }
 
 /*
  * Read the checkpointed uberblock from the pool directory, if one exists,
  * and record its txg and timestamp in the spa.
  *
  * Returns 0 when the checkpoint was read or when none is present (ENOENT
  * from zap_lookup()); any other zap_lookup() error is returned as is.
  */
 static int
 spa_ld_read_checkpoint_txg(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int err;
 
 	ASSERT0(spa->spa_checkpoint_txg);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_load_thread == curthread);
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
 	/* A pool without a checkpoint is not an error. */
 	if (err != 0)
 		return (err == ENOENT ? 0 : err);
 
 	ASSERT3U(checkpoint.ub_txg, !=, 0);
 	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
 	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
 
 	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 	spa->spa_checkpoint_txg = checkpoint.ub_txg;
 
 	return (0);
 }
 
 /*
  * First stage of a pool load: parse the provided config into a vdev tree,
  * open and (unless assembling from a split) validate the vdevs, select an
  * uberblock, and open the root block pointer.
  *
  * Returns 0 on success or the error of the first step that failed.
  */
 static int
 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
 {
 	int err;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	/*
 	 * The provided config is never trusted unless a pool is being
 	 * assembled after a split. That covers blkptrs and the vdev tree in
 	 * general, and it effectively makes the spa read-only because
 	 * spa_writeable() requires spa_trust_config to be true. A trusted
 	 * config is loaded from the MOS later on.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE)
 		spa->spa_trust_config = B_FALSE;
 
 	/* Build a vdev tree out of the provided config. */
 	err = spa_ld_parse_config(spa, type);
 	if (err != 0)
 		return (err);
 
 	spa_import_progress_add(spa);
 
 	/*
 	 * With the tree in place, try to open every vdev: open the underlying
 	 * physical device, retrieve its geometry and probe it with a dummy
 	 * I/O. Each vdev's state reflects how that went, and afterwards the
 	 * vdevs can be read from.
 	 */
 	err = spa_ld_open_vdevs(spa);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Read each vdev's label and check that the GUIDs stored there match
 	 * the GUIDs in the provided config. For a pool that has just been
 	 * split off from an existing one the labels are not updated yet, so
 	 * validation is skipped for now.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		err = spa_ld_validate_vdevs(spa);
 		if (err != 0)
 			return (err);
 	}
 
 	/*
 	 * Read all vdev labels to find the best uberblock (the latest one,
 	 * unless spa_load_max_txg is set) and store it in spa_uberblock. The
 	 * label holding that uberblock also lists the features required to
 	 * read blkptrs in the MOS; verify that this version of zfs supports
 	 * all of them.
 	 */
 	err = spa_ld_select_uberblock(spa, type);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Hand that uberblock to the dsl_pool layer, which opens the root
 	 * blkptr. It points to the latest version of the MOS and makes its
 	 * contents readable.
 	 */
 	err = spa_ld_open_rootbp(spa);
 
 	return (err != 0 ? err : 0);
 }
 
 /*
  * Rewind the pool to its checkpoint: read the checkpointed uberblock from
  * the pool directory, bump its txg and timestamp, install it as
  * spa_uberblock and, when the pool is writeable, sync it to the labels of
  * up to SPA_SYNC_MIN_VDEVS top-level vdevs.
  *
  * Must be called with spa_namespace_lock held and with
  * ZFS_IMPORT_CHECKPOINT set in spa_import_flags (both are asserted).
  *
  * Returns 0 on success, ZFS_ERR_NO_CHECKPOINT when the checkpoint entry
  * does not exist, or the error returned by zap_lookup() or
  * vdev_config_sync().
  */
 static int
 spa_ld_checkpoint_rewind(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to retrieve checkpointed "
 		    "uberblock from the MOS config [error=%d]", error);
 
 		/* Translate "no such entry" into the checkpoint error. */
 		if (error == ENOENT)
 			error = ZFS_ERR_NO_CHECKPOINT;
 
 		return (error);
 	}
 
 	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
 	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
 
 	/*
 	 * We need to update the txg and timestamp of the checkpointed
 	 * uberblock to be higher than the latest one. This ensures that
 	 * the checkpointed uberblock is selected if we were to close and
 	 * reopen the pool right after we've written it in the vdev labels.
 	 * (also see block comment in vdev_uberblock_compare)
 	 */
 	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
 	checkpoint.ub_timestamp = gethrestime_sec();
 
 	/*
 	 * Set current uberblock to be the checkpointed uberblock.
 	 */
 	spa->spa_uberblock = checkpoint;
 
 	/*
 	 * If we are doing a normal rewind, then the pool is open for
 	 * writing and we sync the "updated" checkpointed uberblock to
 	 * disk. Once this is done, we've basically rewound the whole
 	 * pool and there is no way back.
 	 *
 	 * There are cases when we don't want to attempt and sync the
 	 * checkpointed uberblock to disk because we are opening a
 	 * pool as read-only. Specifically, verifying the checkpointed
 	 * state with zdb, and importing the checkpointed state to get
 	 * a "preview" of its content.
 	 */
 	if (spa_writeable(spa)) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 		int svdcount = 0;
 		int children = rvd->vdev_children;
 		int c0 = random_in_range(children);
 
 		/*
 		 * Starting from a randomly chosen child and wrapping around,
 		 * collect up to SPA_SYNC_MIN_VDEVS children that have a
 		 * metaslab array, are not log devices and are concrete.
 		 */
 		for (int c = 0; c < children; c++) {
 			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
 
 			/* Stop when revisiting the first vdev */
 			if (c > 0 && svd[0] == vd)
 				break;
 
 			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
 			    !vdev_is_concrete(vd))
 				continue;
 
 			svd[svdcount++] = vd;
 			if (svdcount == SPA_SYNC_MIN_VDEVS)
 				break;
 		}
 		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		/* Report the failure only after the config lock is dropped. */
 		if (error != 0) {
 			spa_load_failed(spa, "failed to write checkpointed "
 			    "uberblock to the vdev labels [error=%d]", error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Open the MOS using the provided config, then switch to the trusted config
  * stored in the MOS. If the trusted-config step asks for a reload (EAGAIN),
  * the spa is reset and both steps are repeated once, this time with
  * 'reloading' set.
  *
  * When a reload happens and update_config_cache is not NULL, it is set to
  * B_TRUE. Returns 0 on success or the error of the step that failed.
  */
 static int
 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t *update_config_cache)
 {
 	int err;
 
 	/*
 	 * Parse the pool config, open and validate the vdevs, pick an
 	 * uberblock and use it to open the MOS.
 	 */
 	err = spa_ld_mos_init(spa, type);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Fetch the trusted config from the MOS, build a new and exact vdev
 	 * tree from it, and reopen all vdevs.
 	 */
 	err = spa_ld_trusted_config(spa, type, B_FALSE);
 	if (err != EAGAIN)
 		return (err);
 
 	if (update_config_cache != NULL)
 		*update_config_cache = B_TRUE;
 
 	/*
 	 * The trusted config differs too much from the untrusted one, so
 	 * the loading process is redone with the trusted config.
 	 */
 	spa_ld_prepare_for_reload(spa);
 	spa_load_note(spa, "RELOADING");
 
 	err = spa_ld_mos_init(spa, type);
 	if (err != 0)
 		return (err);
 
 	return (spa_ld_trusted_config(spa, type, B_TRUE));
 }
 
 /*
  * Load an existing storage pool, using the config provided. This config
  * describes which vdevs are part of the pool and is later validated against
  * partial configs present in each vdev's label and an entire copy of the
  * config stored in the MOS.
  *
  * Locking: the caller must hold spa_namespace_lock (asserted below). Once
  * the MOS has been opened the lock is dropped for the rest of the load and
  * it is re-acquired at the "fail" label, so it is held again on return.
  * The early returns that precede the drop never release the lock.
  *
  * Returns 0 on success, or the error of the first load step that failed.
  * 'ereport' is only handed on to spa_ld_verify_logs().
  */
 static int
 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	int error = 0;
 	boolean_t missing_feat_write = B_FALSE;
 	/* Set when the import flags request a rewind to the checkpoint. */
 	boolean_t checkpoint_rewind =
 	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 	boolean_t update_config_cache = B_FALSE;
 	/* Start time; the elapsed time feeds zio_handle_import_delay(). */
 	hrtime_t load_start = gethrtime();
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	spa_load_note(spa, "LOADING");
 
 	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If we are rewinding to the checkpoint then we need to repeat
 	 * everything we've done so far in this function but this time
 	 * selecting the checkpointed uberblock and using that to open
 	 * the MOS.
 	 */
 	if (checkpoint_rewind) {
 		/*
 		 * If we are rewinding to the checkpoint update config cache
 		 * anyway.
 		 */
 		update_config_cache = B_TRUE;
 
 		/*
 		 * Extract the checkpointed uberblock from the current MOS
 		 * and use this as the pool's uberblock from now on. If the
 		 * pool is imported as writeable we also write the checkpoint
 		 * uberblock to the labels, making the rewind permanent.
 		 */
 		error = spa_ld_checkpoint_rewind(spa);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Redo the loading process again with the
 		 * checkpointed uberblock.
 		 */
 		spa_ld_prepare_for_reload(spa);
 		spa_load_note(spa, "LOADING checkpointed uberblock");
 		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Drop the namespace lock for the rest of the function.
 	 * spa_load_thread records the loading thread while the lock is not
 	 * held; it is cleared again at the "fail" label. From here on every
 	 * error path must use "goto fail" instead of returning directly.
 	 */
 	spa->spa_load_thread = curthread;
 	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * Retrieve the checkpoint txg if the pool has a checkpoint.
 	 */
 	spa_import_progress_set_notes(spa, "Loading checkpoint txg");
 	error = spa_ld_read_checkpoint_txg(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
 	 * from the pool and their contents were re-mapped to other vdevs. Note
 	 * that everything that we read before this step must have been
 	 * rewritten on concrete vdevs after the last device removal was
 	 * initiated. Otherwise we could be reading from indirect vdevs before
 	 * we have loaded their mappings.
 	 */
 	spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
 	error = spa_ld_open_indirect_vdev_metadata(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve the full list of active features from the MOS and check if
 	 * they are all supported.
 	 */
 	spa_import_progress_set_notes(spa, "Checking feature flags");
 	error = spa_ld_check_features(spa, &missing_feat_write);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Load several special directories from the MOS needed by the dsl_pool
 	 * layer.
 	 */
 	spa_import_progress_set_notes(spa, "Loading special MOS directories");
 	error = spa_ld_load_special_directories(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve pool properties from the MOS.
 	 */
 	spa_import_progress_set_notes(spa, "Loading properties");
 	error = spa_ld_get_props(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve the list of auxiliary devices - cache devices and spares -
 	 * and open them.
 	 */
 	spa_import_progress_set_notes(spa, "Loading AUX vdevs");
 	error = spa_ld_open_aux_vdevs(spa, type);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Load the metadata for all vdevs. Also check if unopenable devices
 	 * should be autoreplaced.
 	 */
 	spa_import_progress_set_notes(spa, "Loading vdev metadata");
 	error = spa_ld_load_vdev_metadata(spa);
 	if (error != 0)
 		goto fail;
 
 	/* Load the dedup tables. */
 	spa_import_progress_set_notes(spa, "Loading dedup tables");
 	error = spa_ld_load_dedup_tables(spa);
 	if (error != 0)
 		goto fail;
 
 	/* Load the BRT. */
 	spa_import_progress_set_notes(spa, "Loading BRT");
 	error = spa_ld_load_brt(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Verify the logs now to make sure we don't have any unexpected errors
 	 * when we claim log blocks later.
 	 */
 	spa_import_progress_set_notes(spa, "Verifying Log Devices");
 	error = spa_ld_verify_logs(spa, type, ereport);
 	if (error != 0)
 		goto fail;
 
 	if (missing_feat_write) {
 		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
 		    ENOTSUP);
 		goto fail;
 	}
 
 	/*
 	 * Traverse the last txgs to make sure the pool was left off in a safe
 	 * state. When performing an extreme rewind, we verify the whole pool,
 	 * which can take a very long time.
 	 */
 	spa_import_progress_set_notes(spa, "Verifying pool data");
 	error = spa_ld_verify_pool_data(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Calculate the deflated space for the pool. This must be done before
 	 * we write anything to the pool because we'd need to update the space
 	 * accounting using the deflated sizes.
 	 */
 	spa_import_progress_set_notes(spa, "Calculating deflated space");
 	spa_update_dspace(spa);
 
 	/*
 	 * We have now retrieved all the information we needed to open the
 	 * pool. If we are importing the pool in read-write mode, a few
 	 * additional steps must be performed to finish the import.
 	 */
 	spa_import_progress_set_notes(spa, "Starting import");
 	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		/*
 		 * Remember the config txg as it is before syncing starts; it
 		 * is passed to spa_ld_check_for_config_update() below.
 		 */
 		uint64_t config_cache_txg = spa->spa_config_txg;
 
 		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Before we do any zio_write's, complete the raidz expansion
 		 * scratch space copying, if necessary.
 		 */
 		if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
 			vdev_raidz_reflow_copy_scratch(spa);
 
 		/*
 		 * In case of a checkpoint rewind, log the original txg
 		 * of the checkpointed uberblock.
 		 */
 		if (checkpoint_rewind) {
 			spa_history_log_internal(spa, "checkpoint rewind",
 			    NULL, "rewound state to txg=%llu",
 			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
 		}
 
 		spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
 		/*
 		 * Traverse the ZIL and claim all blocks.
 		 */
 		spa_ld_claim_log_blocks(spa);
 
 		/*
 		 * Kick-off the syncing thread.
 		 */
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 		mmp_thread_start(spa);
 
 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by ZIL traversal operations
 		 * performed above.
 		 */
 		spa_import_progress_set_notes(spa, "Syncing ZIL claims");
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * Check if we need to request an update of the config. On the
 		 * next sync, we would update the config stored in vdev labels
 		 * and the cachefile (by default /etc/zfs/zpool.cache).
 		 */
 		spa_import_progress_set_notes(spa, "Updating configs");
 		spa_ld_check_for_config_update(spa, config_cache_txg,
 		    update_config_cache);
 
 		/*
 		 * Check if a rebuild was in progress and if so resume it.
 		 * Then check all DTLs to see if anything needs resilvering.
 		 * The resilver will be deferred if a rebuild was started.
 		 */
 		spa_import_progress_set_notes(spa, "Starting resilvers");
 		if (vdev_rebuild_active(spa->spa_root_vdev)) {
 			vdev_rebuild_restart(spa);
 		} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 		}
 
 		/*
 		 * Log the fact that we booted up (so that we can detect if
 		 * we rebooted in the middle of an operation).
 		 */
 		spa_history_log_version(spa, "open", NULL);
 
 		spa_import_progress_set_notes(spa,
 		    "Restarting device removals");
 		spa_restart_removal(spa);
 		spa_spawn_aux_threads(spa);
 
 		/*
 		 * Delete any inconsistent datasets.
 		 *
 		 * Note:
 		 * Since we may be issuing deletes for clones here,
 		 * we make sure to do so after we've spawned all the
 		 * auxiliary threads above (from which the livelist
 		 * deletion zthr is part of).
 		 */
 		spa_import_progress_set_notes(spa,
 		    "Cleaning up inconsistent objsets");
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		spa_import_progress_set_notes(spa,
 		    "Cleaning up temporary userrefs");
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
 		/*
 		 * Restart initialize, TRIM and autotrim while holding
 		 * SCL_CONFIG as reader.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_import_progress_set_notes(spa, "Restarting initialize");
 		vdev_initialize_restart(spa->spa_root_vdev);
 		spa_import_progress_set_notes(spa, "Restarting TRIM");
 		vdev_trim_restart(spa->spa_root_vdev);
 		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_import_progress_set_notes(spa, "Finished importing");
 	}
 	zio_handle_import_delay(spa, gethrtime() - load_start);
 
 	/*
 	 * Success only: drop the import progress entry and request an
 	 * asynchronous L2ARC rebuild. Failure paths jump straight to "fail"
 	 * and skip these steps.
 	 */
 	spa_import_progress_remove(spa_guid(spa));
 	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 
 	spa_load_note(spa, "LOADED");
 fail:
 	/*
 	 * Reached on success (fall through, error == 0) and on failure:
 	 * retake the namespace lock, clear the loader thread and wake
 	 * anything waiting on spa_namespace_cv.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_load_thread = NULL;
 	cv_broadcast(&spa_namespace_cv);
 
 	return (error);
 
 }
 
 /*
  * Tear the pool down, lower spa_load_max_txg to one txg below the uberblock
  * currently recorded in the spa, bring the pool back up in the mode it had
  * before, and run spa_load() again.  Returns whatever spa_load() returns.
  */
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state)
 {
 	/* Captured up front, before the teardown below. */
 	const spa_mode_t saved_mode = spa->spa_mode;
 	int err;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 
 	/* Next attempt may only use an uberblock older than this one. */
 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, saved_mode);
 	spa_async_suspend(spa);
 
 	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
 	    (u_longlong_t)spa->spa_load_max_txg);
 
 	err = spa_load(spa, state, SPA_IMPORT_EXISTING);
 	return (err);
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  *
  * 'max_request' becomes spa_load_max_txg for the first attempt (unless a
  * recovery with a recorded spa_load_txg is in progress); any value other
  * than UINT64_MAX also switches on spa_extreme_rewind. 'rewind_flags' is
  * tested against ZPOOL_NEVER_REWIND and ZPOOL_EXTREME_REWIND.
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
     int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	/*
 	 * load_error keeps the result of the first attempt; rewind_error
 	 * tracks the result of the most recent (re)try.
 	 */
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;
 
 	/* Pick the txg ceiling for the first load attempt. */
 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 		if (max_request != UINT64_MAX)
 			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
 	if (load_error == 0)
 		return (0);
 	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
 		/*
 		 * When attempting checkpoint-rewind on a pool with no
 		 * checkpoint, we should not attempt to load uberblocks
 		 * from previous txgs when spa_load fails.
 		 */
 		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 
 	/*
 	 * Generate a config from the state left by the failed first attempt
 	 * (only possible if a root vdev exists). It is either installed with
 	 * spa_config_set() or freed once the rewind attempts are done.
 	 */
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/* Record the txg and timestamp of the uberblock that failed. */
 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}
 
 	/*
 	 * Rewind window: without ZPOOL_EXTREME_REWIND only uberblocks within
 	 * TXG_DEFER_SIZE txgs of the failed one are tried; with it, retries
 	 * may go all the way back to TXG_INITIAL.
 	 */
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
 
 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		/* spa_load_retry() lowers spa_load_max_txg on each pass. */
 		rewind_error = spa_load_retry(spa, state);
 	}
 
 	/* Reset the rewind controls now that the retries are over. */
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	/*
 	 * Install the config generated after the first failure unless a
 	 * recovery rewind succeeded; otherwise discard it.
 	 */
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 	else
 		nvlist_free(config);
 
 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT3P(loadinfo, ==, NULL);
 		spa_import_progress_remove(spa_guid(spa));
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);
 
 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;
 
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics
  * at the same time as the pool is opened, without having to keep the spa_t
  * around in some ambiguous state.
  *
  * On success 0 is returned, *spapp points at the pool and a reference tagged
  * with 'tag' has been taken on it. On failure *spapp is NULL. If 'config' is
  * non-NULL it receives a generated config on success; on a load failure it
  * receives a copy of spa_config with the load info attached, when spa_config
  * is set. 'nvpolicy', if non-NULL, supplies the load policy instead of
  * spa_config.
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, const void *tag,
     nvlist_t *nvpolicy, nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	/* B_TRUE only if this call took spa_namespace_lock itself. */
 	int locked = B_FALSE;
 	/* B_TRUE if this call performed the load of the pool. */
 	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
 		mutex_enter(&spa_namespace_lock);
 		locked = B_TRUE;
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* The pool is known but has not been loaded yet: load it now. */
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_load_policy_t policy;
 
 		firstopen = B_TRUE;
 
 		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;
 
 		spa_activate(spa, spa_mode_global);
 
 		/* Not recovering: forget any previously recorded txgs. */
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 
 		zfs_dbgmsg("spa_open_common: opening %s", pool);
 		error = spa_load_best(spa, state, policy.zlp_txg,
 		    policy.zlp_rewind);
 
 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(ENOENT));
 		}
 
 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				*config = fnvlist_dup(spa->spa_config);
 				fnvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			/* Remember the failure on the spa itself. */
 			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
 		}
 	}
 
 	/* Take the caller's reference on the pool. */
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.
 	 */
 	if (state == SPA_LOAD_RECOVER && config != NULL) {
 		fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info);
 	}
 
 	/*
 	 * Only the call that acquired the namespace lock resets these
 	 * fields and releases it; a recursive call leaves both alone.
 	 */
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/* Runs after the lock release above when this call owned the lock. */
 	if (firstopen)
 		zvol_create_minors_recursive(spa_name(spa));
 
 	*spapp = spa;
 
 	return (0);
 }
 
 /*
  * Open the named pool with a caller-supplied load policy, optionally
  * handing back a config through 'config'.  Thin wrapper that forwards
  * every argument to spa_open_common() and returns its result.
  */
 int
 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
     nvlist_t *policy, nvlist_t **config)
 {
 	int err = spa_open_common(name, spapp, tag, policy, config);
 
 	return (err);
 }
 
 /*
  * Open the named pool with no explicit load policy and without asking
  * for a config.  Forwards to spa_open_common() and returns its result.
  */
 int
 spa_open(const char *name, spa_t **spapp, const void *tag)
 {
 	int err = spa_open_common(name, spapp, tag, NULL, NULL);
 
 	return (err);
 }
 
 /*
  * Find the pool called 'name' and bump its inject reference count, which
  * prevents it from being exported or destroyed.  Lookup and increment are
  * both done under spa_namespace_lock.
  *
  * Returns the spa_t, or NULL if no pool with that name exists.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *found;
 
 	mutex_enter(&spa_namespace_lock);
 	found = spa_lookup(name);
 	if (found != NULL)
 		found->spa_inject_ref++;
 	mutex_exit(&spa_namespace_lock);
 
 	return (found);
 }
 
 /*
  * Give back one inject reference on 'spa'.  The counter is adjusted
  * while holding spa_namespace_lock.
  */
 void
 spa_inject_delref(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_inject_ref -= 1;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Copy the pool's spare list into the vdev tree of 'config' and refresh
  * the state recorded in each copied entry's vdev stats.
  *
  * Must be called with SCL_CONFIG held as reader.  Does nothing when the
  * pool has no spares.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 	nvlist_t **spares;
 	nvlist_t *nvroot;
 	uint_t nspares;
 	vdev_stat_t *vs;
 	uint_t vsc;
 	uint64_t pool;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (sav->sav_count == 0)
 		return;
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares));
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Add the array to the vdev tree, then look it up again so that
 	 * 'spares' refers to the entries stored in nvroot.
 	 */
 	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    (const nvlist_t * const *)spares, nspares);
 	VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares));
 
 	/*
 	 * A spare that has since been repurposed as an active spare is
 	 * reported as CANT_OPEN/SPARED; every other spare reports the
 	 * current state of its vdev.
 	 */
 	for (uint_t idx = 0; idx < nspares; idx++) {
 		nvlist_t *entry = spares[idx];
 		uint64_t guid;
 
 		guid = fnvlist_lookup_uint64(entry, ZPOOL_CONFIG_GUID);
 		VERIFY0(nvlist_lookup_uint64_array(entry,
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 
 		if (!spa_spare_exists(guid, &pool, NULL) || pool == 0ULL) {
 			vs->vs_state = sav->sav_vdevs[idx]->vdev_state;
 			continue;
 		}
 
 		vs->vs_state = VDEV_STATE_CANT_OPEN;
 		vs->vs_aux = VDEV_AUX_SPARED;
 	}
 }
 
 /*
  * Copy the pool's l2cache device list into the vdev tree of 'config' and
  * fill in up-to-date vdev stats for every copied entry.
  *
  * Must be called with SCL_CONFIG held as reader.  Does nothing when the
  * pool has no l2cache devices.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 	nvlist_t **l2cache;
 	nvlist_t *nvroot;
 	uint_t nl2cache;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (sav->sav_count == 0)
 		return;
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 	if (nl2cache == 0)
 		return;
 
 	/*
 	 * Add the array to the vdev tree, then look it up again so that
 	 * 'l2cache' refers to the entries stored in nvroot.
 	 */
 	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    (const nvlist_t * const *)l2cache, nl2cache);
 	VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache));
 
 	/* Update level 2 cache device stats. */
 	for (uint_t i = 0; i < nl2cache; i++) {
 		nvlist_t *entry = l2cache[i];
 		uint64_t guid = fnvlist_lookup_uint64(entry,
 		    ZPOOL_CONFIG_GUID);
 		vdev_t *vd = NULL;
 		vdev_stat_t *vs;
 		uint_t vsc;
 
 		/* Match the config entry to its in-core vdev by guid. */
 		for (uint_t j = 0; j < sav->sav_count && vd == NULL; j++) {
 			if (sav->sav_vdevs[j]->vdev_guid == guid)
 				vd = sav->sav_vdevs[j];
 		}
 		ASSERT(vd != NULL);
 
 		VERIFY0(nvlist_lookup_uint64_array(entry,
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 		vdev_get_stats(vd, vs);
 		vdev_config_generate_stats(vd, entry);
 	}
 }
 
 /*
  * Fill 'features' with one uint64 entry per attribute found in the pool's
  * features-for-read and features-for-write ZAP objects (each object is
  * skipped when its object number is 0).  The entry name is the ZAP
  * attribute name and the value is its first integer.
  */
 static void
 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
 {
 	zap_attribute_t *za = zap_attribute_alloc();
 	zap_cursor_t zc;
 
 	if (spa->spa_feat_for_read_obj != 0) {
 		zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_feat_for_read_obj);
 		while (zap_cursor_retrieve(&zc, za) == 0) {
 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
 			    za->za_num_integers == 1);
 			VERIFY0(nvlist_add_uint64(features, za->za_name,
 			    za->za_first_integer));
 			zap_cursor_advance(&zc);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	if (spa->spa_feat_for_write_obj != 0) {
 		zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_feat_for_write_obj);
 		while (zap_cursor_retrieve(&zc, za) == 0) {
 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
 			    za->za_num_integers == 1);
 			VERIFY0(nvlist_add_uint64(features, za->za_name,
 			    za->za_first_integer));
 			zap_cursor_advance(&zc);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	zap_attribute_free(za);
 }
 
 /*
  * Walk spa_feature_table and, for every feature whose refcount can be
  * obtained through feature_get_refcount(), store that refcount in
  * 'features' under the feature's guid.  Features for which the lookup
  * fails are left out.
  */
 static void
 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
 {
 	for (int idx = 0; idx < SPA_FEATURES; idx++) {
 		zfeature_info_t feature = spa_feature_table[idx];
 		uint64_t refcount;
 
 		if (feature_get_refcount(spa, &feature, &refcount) == 0) {
 			VERIFY0(nvlist_add_uint64(features, feature.fi_guid,
 			    refcount));
 		}
 	}
 }
 
 /*
  * Attach the list of pool features and their reference counts to
  * 'config' under ZPOOL_CONFIG_FEATURE_STATS.
  *
  * On the first call for a spa a new nvlist is allocated, populated from
  * disk and remembered in spa_feat_stats.  Later calls reuse the saved
  * nvlist and only refresh it from the cached reference counts, so that
  * we don't block here on I/O on a suspended pool and 'zpool clear' can
  * resume the pool.
  *
  * Must be called with SCL_CONFIG held as reader; spa_feat_stats_lock is
  * taken for the duration of the update.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *features;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	mutex_enter(&spa->spa_feat_stats_lock);
 
 	features = spa->spa_feat_stats;
 	if (features == NULL) {
 		/* First call: build the list and keep it in the spa. */
 		VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
 		spa->spa_feat_stats = features;
 		spa_feature_stats_from_disk(spa, features);
 	} else {
 		/* Later calls: refresh the saved list without disk I/O. */
 		spa_feature_stats_from_cache(spa, features);
 	}
 
 	VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    features));
 
 	mutex_exit(&spa->spa_feat_stats_lock);
 }
 
 /*
  * Open the named pool and return its config, augmented with load time,
  * error count, suspension state, spares, l2cache devices and feature
  * stats, through '*config'. If 'altroot' is non-NULL it is filled in with
  * the pool's alternate root (buffer size 'buflen'), even when the open
  * failed.
  *
  * Returns the result of spa_open_common(). '*config' is reset to NULL
  * first, so it stays NULL unless spa_open_common() stores a config.
  */
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	int error;
 	spa_t *spa;
 
 	*config = NULL;
 	/* spa_open_common() leaves spa NULL whenever it fails. */
 	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa != NULL) {
 		/*
 		 * This still leaves a window of inconsistency where the spares
 		 * or l2cache devices could change and the config would be
 		 * self-inconsistent.
 		 *
 		 * The config lock taken here is released at the bottom of
 		 * the function.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
 			uint64_t loadtimes[2];
 
 			/* [0] = seconds, [1] = nanoseconds of spa_loaded_ts */
 			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
 			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
 			fnvlist_add_uint64_array(*config,
 			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
 
 			fnvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_approx_errlog_size(spa));
 
 			if (spa_suspended(spa)) {
 				fnvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED,
 				    spa->spa_failmode);
 				fnvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED_REASON,
 				    spa->spa_suspended);
 			}
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
 			spa_add_feature_stats(spa, *config);
 		}
 	}
 
 	/*
 	 * We want to get the alternate root even for faulted pools, so we cheat
 	 * and call spa_lookup() directly.
 	 */
 	if (altroot) {
 		if (spa == NULL) {
 			mutex_enter(&spa_namespace_lock);
 			spa = spa_lookup(name);
 			if (spa)
 				spa_altroot(spa, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			/*
 			 * No reference or config lock was taken on this path,
 			 * so clear spa again to skip the cleanup below.
 			 */
 			spa = NULL;
 			mutex_exit(&spa_namespace_lock);
 		} else {
 			spa_altroot(spa, altroot, buflen);
 		}
 	}
 
 	/* Undo the config lock and the open reference taken above. */
 	if (spa != NULL) {
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_close(spa, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Validate that the auxiliary device array is well formed.  We must have an
  * array of nvlists, each of which describes a valid leaf vdev.  For modes
  * VDEV_ALLOC_SPARE and VDEV_ALLOC_L2CACHE, devices that fail to open or to
  * be labeled are tolerated as long as they are well-formed.
  *
  * 'config' names the array inside nvroot, 'version' is the minimum pool
  * version that supports this device type and 'label' is the label type
  * used when initializing each device.  The caller must hold SCL_ALL as
  * writer.
  *
  * Returns 0 if the array is absent or valid, EINVAL for an empty array or
  * a non-leaf entry, ENOTSUP if the pool version is too old, or the error
  * from parsing, opening or labeling a device.
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **dev;
 	uint_t ndev;
 	uint_t i;
 	vdev_t *vd;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* It's acceptable to have no devs specified. */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
 		return (0);
 
 	if (ndev == 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
 	 * checking.
 	 */
 	sav->sav_pending = dev;
 	sav->sav_npending = ndev;
 
 	for (i = 0; i < ndev; i++) {
 		error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode);
 		if (error != 0)
 			break;
 
 		/* Only leaf vdevs may be used as auxiliary devices. */
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		vd->vdev_top = vd;
 
 		/* Record the guid only if open and label init both worked. */
 		error = vdev_open(vd);
 		if (error == 0)
 			error = vdev_label_init(vd, crtxg, label);
 		if (error == 0) {
 			fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
 			    vd->vdev_guid);
 		}
 
 		vdev_free(vd);
 
 		/* Open/label failures are fatal except for the two modes. */
 		if (error != 0 && mode != VDEV_ALLOC_SPARE &&
 		    mode != VDEV_ALLOC_L2CACHE)
 			break;
 
 		error = 0;
 	}
 
 	/* The pending list is only valid while we are validating. */
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 /*
  * Validate the spares and then the l2cache devices listed in nvroot.
  * The l2cache list is only checked when the spares passed.  The caller
  * must hold SCL_ALL as writer.
  *
  * Returns 0 if both lists are acceptable, otherwise the first error.
  */
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE);
 	if (error == 0) {
 		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
 		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
 	}
 
 	return (error);
 }
 
 /*
  * Store 'ndevs' device nvlists from 'devs' in sav->sav_config under the
  * name 'config'.  If sav_config does not exist yet it is created with
  * just these devices; otherwise the new devices are appended to the ones
  * already recorded there.
  */
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	nvlist_t **olddevs;
 	nvlist_t **merged;
 	uint_t oldndevs;
 	uint_t total;
 	int i;
 
 	if (sav->sav_config == NULL) {
 		/* Generate a new dev list. */
 		sav->sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(sav->sav_config, config,
 		    (const nvlist_t * const *)devs, ndevs);
 		return;
 	}
 
 	/*
 	 * Generate new dev list by concatenating with the
 	 * current dev list.
 	 */
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
 	    &olddevs, &oldndevs));
 	total = ndevs + oldndevs;
 
 	/* Duplicate old entries first, then the new ones after them. */
 	merged = kmem_alloc(sizeof (void *) * total, KM_SLEEP);
 	for (i = 0; i < oldndevs; i++)
 		merged[i] = fnvlist_dup(olddevs[i]);
 	for (i = 0; i < ndevs; i++)
 		merged[oldndevs + i] = fnvlist_dup(devs[i]);
 
 	/* Replace the stored array with the merged one. */
 	fnvlist_remove(sav->sav_config, config);
 	fnvlist_add_nvlist_array(sav->sav_config, config,
 	    (const nvlist_t * const *)merged, total);
 
 	/* The temporary duplicates and their array are freed again. */
 	for (i = 0; i < total; i++)
 		nvlist_free(merged[i]);
 	kmem_free(merged, total * sizeof (void *));
 }
 
 /*
  * Stop and drop level 2 ARC devices.
  *
  * Every l2cache vdev of the pool that spa_l2cache_exists() reports with
  * a non-zero pool guid and that is currently present in the L2ARC is
  * removed from it.
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	for (int idx = 0; idx < sav->sav_count; idx++) {
 		vdev_t *vd = sav->sav_vdevs[idx];
 		uint64_t pool;
 
 		ASSERT(vd != NULL);
 
 		if (!spa_l2cache_exists(vd->vdev_guid, &pool))
 			continue;
 		if (pool == 0ULL || !l2arc_vdev_present(vd))
 			continue;
 
 		l2arc_remove_vdev(vd);
 	}
 }
 
 /*
  * Verify encryption parameters for spa creation. If we are encrypting
  * (cp_crypt is neither ZIO_CRYPT_OFF nor ZIO_CRYPT_INHERIT), the
  * encryption feature flag must be enabled.
  *
  * Returns ENOTSUP when encryption is requested without the feature,
  * otherwise the result of dmu_objset_create_crypt_check().
  */
 static int
 spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
     boolean_t has_encryption)
 {
 	boolean_t encrypting = (dcp->cp_crypt != ZIO_CRYPT_OFF &&
 	    dcp->cp_crypt != ZIO_CRYPT_INHERIT);
 
 	if (encrypting && !has_encryption)
 		return (SET_ERROR(ENOTSUP));
 
 	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
 }
 
 /*
  * Pool Creation
  *
  * Create a new pool named 'pool' from the vdev tree described by 'nvroot'.
  * If 'props' carries a ZPOOL_PROP_TNAME entry, the in-core spa_t is looked
  * up and added under that name instead and ZFS_IMPORT_TEMP_NAME is set in
  * spa_import_flags.  'zplprops' and 'dcp' are handed to dsl_pool_create();
  * a non-NULL 'dcp' is first checked by
  * spa_create_check_encryption_params().
  *
  * The spa_namespace_lock is taken on entry and held until return.
  *
  * Returns 0 on success.  On failure the partially constructed spa_t is
  * deactivated and removed again and an error is returned: EEXIST if
  * spa_lookup() already finds the name, ENOTSUP if zfs_special_devs() flags
  * 'nvroot' while the allocation classes feature was not requested in
  * 'props', EINVAL if zfs_allocatable_devs() rejects 'nvroot', or the error
  * reported by the property, encryption, config-parse, or vdev creation
  * helpers.
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, dsl_crypto_params_t *dcp)
 {
 	spa_t *spa;
 	const char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj, ndraid = 0;
 	boolean_t has_features;
 	boolean_t has_encryption;
 	boolean_t has_allocclass;
 	spa_feature_t feat;
 	const char *feat_name;
 	const char *poolname;
 	nvlist_t *nvl;
 
 	/* Fall back to the real pool name when no temporary name is given. */
 	if (props == NULL ||
 	    nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
 		poolname = pool;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(poolname) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Allocate a new spa_t structure.  The config handed to spa_add()
 	 * always records the real pool name, even when the spa_t itself is
 	 * added under a temporary one.
 	 */
 	nvl = fnvlist_alloc();
 	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(poolname, nvl, altroot);
 	fnvlist_free(nvl);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Temporary pool names should never be written to disk.
 	 */
 	if (poolname != pool)
 		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
 
 	/*
 	 * Scan the properties for feature@ entries and remember whether the
 	 * encryption and allocation classes features were requested.
 	 */
 	has_features = B_FALSE;
 	has_encryption = B_FALSE;
 	has_allocclass = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem))) {
 			has_features = B_TRUE;
 
 			/* The feature name is whatever follows the '@'. */
 			feat_name = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY0(zfeature_lookup_name(feat_name, &feat));
 			if (feat == SPA_FEATURE_ENCRYPTION)
 				has_encryption = B_TRUE;
 			if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
 				has_allocclass = B_TRUE;
 		}
 	}
 
 	/* verify encryption params, if they were provided */
 	if (dcp != NULL) {
 		error = spa_create_check_encryption_params(dcp, has_encryption);
 		if (error != 0) {
 			spa_deactivate(spa);
 			spa_remove(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (error);
 		}
 	}
 	if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		/*
 		 * Use SET_ERROR() like the EEXIST and EINVAL paths so that
 		 * the set-error probe fires here as well.
 		 */
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	/*
 	 * Any feature@ property forces the current SPA_VERSION; otherwise
 	 * an explicit version property is honored if one is present.
 	 */
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_load_state = SPA_LOAD_CREATE;
 	spa->spa_removing_phys.sr_state = DSS_NONE;
 	spa->spa_removing_phys.sr_removing_vdev = -1;
 	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
 	spa->spa_indirect_vdevs_loaded = B_TRUE;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = SET_ERROR(EINVAL);
 
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
 		/*
 		 * instantiate the metaslab groups (this will dirty the vdevs)
 		 * we can no longer error exit past this point
 		 */
 		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 
 			vdev_metaslab_set_size(vd);
 			vdev_expand(vd, txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		spa->spa_spares.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 		    nspares);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP));
 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 		    nl2cache);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* spa_is_initializing is set only around dsl_pool_create(). */
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 	/*
 	 * Create BRT table and BRT table object.
 	 */
 	brt_create(spa);
 
 	spa_update_dspace(spa);
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
 		spa_history_create_obj(spa, tx);
 
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 	spa_history_log_version(spa, "create", tx);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Generate some random noise for salted checksums to operate on.
 	 */
 	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes));
 
 	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
 	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
 	spa->spa_dedup_table_quota =
 	    zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(props, tx);
 	}
 
 	/* One SPA_FEATURE_DRAID reference per ndraid counted above. */
 	for (uint64_t i = 0; i < ndraid; i++)
 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(dp);
 	mmp_thread_start(spa);
 	txg_wait_synced(dp, txg);
 
 	spa_spawn_aux_threads(spa);
 
 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 
 	/*
 	 * Don't count references from objsets that are already closed
 	 * and are making their way through the eviction process.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 	spa->spa_load_state = SPA_LOAD_NONE;
 
 	spa_import_os(spa);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Import a non-root pool into the system.
  *
  * 'pool' is the name to add the pool under, 'config' the configuration
  * handed to spa_add() and the load policy lookup, 'props' an optional
  * property list (altroot and readonly are read from it directly), and
  * 'flags' the value stored in spa_import_flags.
  *
  * Returns 0 on success, EEXIST if spa_lookup() already finds 'pool', or
  * the error from spa_load_best() / spa_prop_set(), in which case the spa_t
  * is unloaded, deactivated and removed again.  Except for the EEXIST and
  * verbatim-import early returns, spa_load_info is added to 'config' under
  * ZPOOL_CONFIG_LOAD_INFO whether or not the load succeeded.
  */
 int
 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	const char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_load_policy_t policy;
 	spa_mode_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	/* A readonly property overrides the global open mode. */
 	if (readonly)
 		mode = SPA_MODE_READ;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
 		mutex_exit(&spa_namespace_lock);
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	/* A rewind request in the load policy switches to recovery mode. */
 	zpool_get_load_policy(config, &policy);
 	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
 
 	if (state != SPA_LOAD_RECOVER) {
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 		zfs_dbgmsg("spa_import: importing %s", pool);
 	} else {
 		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
 		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
 	}
 	/* 'error' is not acted upon until after the config fix-ups below. */
 	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 */
 	fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	/*
 	 * Fail if the load failed.  Otherwise, when properties were given
 	 * and the pool is writeable, apply them and fail if that fails.
 	 */
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		if (spa->spa_spares.sav_config)
 			fnvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES);
 		else
 			spa->spa_spares.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 		    nspares);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 		spa->spa_spares.sav_label_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		if (spa->spa_l2cache.sav_config)
 			fnvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE);
 		else
 			spa->spa_l2cache.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 		    nl2cache);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 		spa->spa_l2cache.sav_label_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	spa_history_log_version(spa, "import", NULL);
 
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 
 	mutex_exit(&spa_namespace_lock);
 
 	/*
 	 * The remaining steps run after spa_namespace_lock has been
 	 * dropped.
 	 */
 	zvol_create_minors_recursive(pool);
 
 	spa_import_os(spa);
 
 	return (0);
 }
 
 /*
  * Trial-load the pool described by 'tryconfig' without leaving it in the
  * namespace.  The pool is added under a temporary name built from
  * TRYIMPORT_NAME, the current thread pointer and the pool name, activated
  * with SPA_MODE_READ, loaded with SPA_LOAD_TRYIMPORT, and then always
  * unloaded and removed again before returning.
  *
  * Returns a config nvlist generated from the loaded state if the load got
  * far enough to set spa_root_vdev.  Returns NULL if 'tryconfig' lacks
  * ZPOOL_CONFIG_POOL_NAME or ZPOOL_CONFIG_POOL_STATE, or if no root vdev
  * was produced.
  */
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *config = NULL;
 	const char *poolname, *cachefile;
 	spa_t *spa;
 	uint64_t state;
 	int error;
 	zpool_load_policy_t policy;
 
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
 		return (NULL);
 
 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
 		return (NULL);
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
 	    TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
 
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(name, tryconfig, NULL);
 	spa_activate(spa, SPA_MODE_READ);
 	kmem_free(name, MAXPATHLEN);
 
 	/*
 	 * Rewind pool if a max txg was provided.
 	 */
 	zpool_get_load_policy(spa->spa_config, &policy);
 	if (policy.zlp_txg != UINT64_MAX) {
 		spa->spa_load_max_txg = policy.zlp_txg;
 		spa->spa_extreme_rewind = B_TRUE;
 		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
 		    poolname, (longlong_t)policy.zlp_txg);
 	} else {
 		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
 	}
 
 	/* Record whether the config came from a cachefile or a scan. */
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
 	    == 0) {
 		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 	} else {
 		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
 	}
 
 	/*
 	 * spa_import() relies on a pool config fetched by spa_tryimport()
 	 * for spare/cache devices. Import flags are not passed to
 	 * spa_tryimport(), which would make it return early on a missing
 	 * log device and never retrieve the cache and spare devices.
 	 * Setting ZFS_IMPORT_MISSING_LOG here makes it fetch the correct
 	 * configuration regardless of the missing log device.
 	 */
 	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
 
 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
 	 */
 	if (spa->spa_root_vdev != NULL) {
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 		/* Report the caller's pool name, not the temporary one. */
 		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    spa->spa_uberblock.ub_timestamp);
 		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
 		    spa->spa_errata);
 
 		/*
 		 * If the bootfs property exists on this pool then we
 		 * copy it out so that external consumers can tell which
 		 * pools are bootable.
 		 */
 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 			/*
 			 * We have to play games with the name since the
 			 * pool was opened as TRYIMPORT_NAME.
 			 */
 			if (dsl_dsobj_to_dsname(spa_name(spa),
 			    spa->spa_bootfs, tmpname) == 0) {
 				char *cp;
 				char *dsname;
 
 				dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 				/*
 				 * Replace everything up to the first '/'
 				 * with the real pool name; a name with no
 				 * '/' is copied unchanged.
 				 */
 				cp = strchr(tmpname, '/');
 				if (cp == NULL) {
 					(void) strlcpy(dsname, tmpname,
 					    MAXPATHLEN);
 				} else {
 					(void) snprintf(dsname, MAXPATHLEN,
 					    "%s/%s", poolname, ++cp);
 				}
 				fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
 				    dsname);
 				kmem_free(dsname, MAXPATHLEN);
 			}
 			kmem_free(tmpname, MAXPATHLEN);
 		}
 
 		/*
 		 * Add the list of hot spares and level 2 cache devices.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_add_spares(spa, config);
 		spa_add_l2cache(spa, config);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/* Tear the temporary spa_t down regardless of the load result. */
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_remove(spa);
 	mutex_exit(&spa_namespace_lock);
 
 	return (config);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  *
  * 'new_state' selects the operation: POOL_STATE_EXPORTED and
  * POOL_STATE_DESTROYED remove the spa_t from the namespace, while
  * POOL_STATE_UNINITIALIZED unloads it but keeps it (see spa_reset()).
  * If 'oldconfig' is non-NULL it is set to NULL on entry and, on success,
  * to a duplicate of spa_config when one exists.  'force' permits exporting
  * a pool that has an active shared spare.
  *
  * Returns 0 on success, EROFS if spa_mode_global lacks SPA_MODE_WRITE,
  * ENOENT if the pool is not found, ZFS_ERR_EXPORT_IN_PROGRESS if another
  * thread is already exporting it, EBUSY if references remain, or EXDEV for
  * an unforced export with an active shared spare.
  */
 static int
 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	int error = 0;
 	spa_t *spa;
 	hrtime_t export_start = gethrtime();
 
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & SPA_MODE_WRITE))
 		return (SET_ERROR(EROFS));
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (spa->spa_is_exporting) {
 		/* the pool is being exported by another thread */
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
 	}
 	spa->spa_is_exporting = B_TRUE;
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks
 	 * and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	spa_async_suspend(spa);
 	if (spa->spa_zvol_taskq) {
 		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
 		taskq_wait(spa->spa_zvol_taskq);
 	}
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_export_thread = curthread;
 	spa_close(spa, FTAG);
 
 	/*
 	 * A pool in the uninitialized state is not unloaded below; drop the
 	 * lock and skip the sync, reference and label steps.
 	 */
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		mutex_exit(&spa_namespace_lock);
 		goto export_spa;
 	}
 
 	/*
 	 * The pool will be in core if it's openable, in which case we can
 	 * modify its state.  Objsets may be open only because they're dirty,
 	 * so we have to force it to sync before checking spa_refcnt.
 	 */
 	if (spa->spa_sync_on) {
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 		spa_evicting_os_wait(spa);
 	}
 
 	/*
 	 * A pool cannot be exported or destroyed if there are active
 	 * references.  If we are resetting a pool, allow references by
 	 * fault injection handlers.
 	 */
 	if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
 		error = SET_ERROR(EBUSY);
 		goto fail;
 	}
 
 	mutex_exit(&spa_namespace_lock);
 	/*
 	 * At this point we no longer hold the spa_namespace_lock and
 	 * there were no references on the spa. Future spa_lookups will
 	 * notice the spa->spa_export_thread and wait until we signal
 	 * that we are finished.
 	 */
 
 	if (spa->spa_sync_on) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool. At user's own will, such pool can
 		 * be forcedly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			error = SET_ERROR(EXDEV);
 			/* The fail path expects the lock to be held. */
 			mutex_enter(&spa_namespace_lock);
 			goto fail;
 		}
 
 		/*
 		 * We're about to export or destroy this pool. Make sure
 		 * we stop all initialization and trim activity here before
 		 * we set the spa_final_txg. This will ensure that all
 		 * dirty data resulting from the initialization is
 		 * committed to disk before we unload the pool.
 		 */
 		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
 		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
 		vdev_autotrim_stop_all(spa);
 		vdev_rebuild_stop_all(spa);
 		l2arc_spa_rebuild_stop(spa);
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			vdev_config_dirty(rvd);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
 		 * time flushing as many metaslabs as we can in an attempt to
 		 * destroy log space maps and save import time. This has to be
 		 * done before we set the spa_final_txg, otherwise
 		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
 		 * spa_should_flush_logs_on_unload() should be called after
 		 * spa_state has been set to the new_state.
 		 */
 		if (spa_should_flush_logs_on_unload(spa))
 			spa_unload_log_sm_flush_all(spa);
 
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 	/* Reached without spa_namespace_lock held. */
 export_spa:
 	spa_export_os(spa);
 
 	if (new_state == POOL_STATE_DESTROYED)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
 	else if (new_state == POOL_STATE_EXPORTED)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	/* Hand a private copy of the config back to the caller. */
 	if (oldconfig && spa->spa_config)
 		*oldconfig = fnvlist_dup(spa->spa_config);
 
 	if (new_state == POOL_STATE_EXPORTED)
 		zio_handle_export_delay(spa, gethrtime() - export_start);
 
 	/*
 	 * Take the namespace lock for the actual spa_t removal
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 		spa_remove(spa);
 	} else {
 		/*
 		 * If spa_remove() is not called for this spa_t and
 		 * there is any possibility that it can be reused,
 		 * we make sure to reset the exporting flag.
 		 */
 		spa->spa_is_exporting = B_FALSE;
 		spa->spa_export_thread = NULL;
 	}
 
 	/*
 	 * Wake up any waiters in spa_lookup()
 	 */
 	cv_broadcast(&spa_namespace_cv);
 	mutex_exit(&spa_namespace_lock);
 	return (0);
 
 	/*
 	 * Error exit: reached with spa_namespace_lock held.  Undo the
 	 * exporting markers and resume the async tasks suspended above.
 	 */
 fail:
 	spa->spa_is_exporting = B_FALSE;
 	spa->spa_export_thread = NULL;
 
 	spa_async_resume(spa);
 	/*
 	 * Wake up any waiters in spa_lookup()
 	 */
 	cv_broadcast(&spa_namespace_cv);
 	mutex_exit(&spa_namespace_lock);
 	return (error);
 }
 
 /*
  * Destroy a storage pool.
  */
 int
 spa_destroy(const char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * Export a storage pool.
  *
  * Runs the common export path with POOL_STATE_EXPORTED as the target
  * state, forwarding 'oldconfig', 'force' and 'hardforce' unchanged.  The
  * result of spa_export_common() is passed straight back.
  */
 int
 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force,
 	    hardforce);
 
 	return (rc);
 }
 
 /*
  * Similar to spa_export(), this unloads the spa_t without actually removing it
  * from the namespace in any way.
  */
 int
 spa_reset(const char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * This is called as a synctask to increment the draid feature flag
  *
  * 'arg' carries the number of increments to apply, packed into the
  * pointer value; SPA_FEATURE_DRAID is incremented that many times in 'tx'.
  */
 static void
 spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
 {
 	int remaining = (int)(uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	for (; remaining > 0; remaining--)
 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 }
 
 /*
  * Add a device to a storage pool.
  *
  * 'nvroot' describes what to add: it is parsed into a vdev tree whose
  * children become new top-level vdevs, and it may also carry
  * ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE arrays.  When 'check_ashift'
  * is set and the pool's spa_min_ashift equals spa_max_ashift, every new
  * top-level vdev must have that ashift or the add fails with
  * ZFS_ERR_ASHIFT_MISMATCH.
  *
  * Returns 0 on success.  Every failure path returns the result of
  * spa_vdev_exit() called with the corresponding error (EINVAL when there
  * is nothing to add or a removal-related restriction is violated, or the
  * error from the parse, create, and validate helpers).
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
 {
 	uint64_t txg, ndraid = 0;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
 
 	/* A missing spare or l2cache array simply means "none". */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 	    &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 	    &nl2cache) != 0)
 		nl2cache = 0;
 
 	/* Nothing at all to add. */
 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 	if (vd->vdev_children != 0 &&
 	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
 		return (spa_vdev_exit(spa, vd, txg, error));
 	}
 
 	/*
 	 * The virtual dRAID spares must be added after vdev tree is created
 	 * and the vdev guids are generated.  The guid of their associated
 	 * dRAID is stored in the config and used when opening the spare.
 	 *
 	 * NOTE(review): the spare array is looked up again when ndraid > 0,
 	 * presumably because vdev_draid_spare_create() can change the spare
 	 * list in nvroot -- confirm against that function.
 	 */
 	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
 	    rvd->vdev_children)) == 0) {
 		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
 			nspares = 0;
 	} else {
 		return (spa_vdev_exit(spa, vd, txg, error));
 	}
 
 	/*
 	 * We must validate the spares and l2cache devices after checking the
 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 	 */
 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * If we are in the middle of a device removal, we can only add
 	 * devices which match the existing devices in the pool.
 	 * If we are in the middle of a removal, or have some indirect
 	 * vdevs, we can not add raidz or dRAID top levels.
 	 */
 	if (spa->spa_vdev_removal != NULL ||
 	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			tvd = vd->vdev_child[c];
 			if (spa->spa_vdev_removal != NULL &&
 			    tvd->vdev_ashift != spa->spa_max_ashift) {
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 			}
 			/* Fail if top level vdev is raidz or a dRAID */
 			if (vdev_get_nparity(tvd) != 0)
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 			/*
 			 * Need the top level mirror to be
 			 * a mirror of leaf vdevs only
 			 */
 			if (tvd->vdev_ops == &vdev_mirror_ops) {
 				for (uint64_t cid = 0;
 				    cid < tvd->vdev_children; cid++) {
 					vdev_t *cvd = tvd->vdev_child[cid];
 					if (!cvd->vdev_ops->vdev_op_leaf) {
 						return (spa_vdev_exit(spa, vd,
 						    txg, EINVAL));
 					}
 				}
 			}
 		}
 	}
 
 	/*
 	 * With a uniform pool ashift and 'check_ashift' requested, reject
 	 * any new top-level vdev whose ashift differs.
 	 */
 	if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			tvd = vd->vdev_child[c];
 			if (tvd->vdev_ashift != spa->spa_max_ashift) {
 				return (spa_vdev_exit(spa, vd, txg,
 				    ZFS_ERR_ASHIFT_MISMATCH));
 			}
 		}
 	}
 
 	/*
 	 * Move each new top-level vdev from the parsed tree 'vd' to the
 	 * pool's root vdev, assigning it the next free child id.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = rvd->vdev_children;
 		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * We can't increment a feature while holding spa_vdev so we
 	 * have to do it in a synctask.
 	 */
 	if (ndraid != 0) {
 		dmu_tx_t *tx;
 
 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 		/* The count travels to the synctask packed in the pointer. */
 		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
 		    (void *)(uintptr_t)ndraid, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
 	 * sync the config cache, and we lose power, then upon reboot we may
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Attach a device to a vdev specified by its guid.  The vdev type can be
  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
  * single device). When the vdev is a single device, a mirror vdev will be
  * automatically inserted.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
  * should be performed instead of traditional healing reconstruction.  From
  * an administrator's perspective these are both resilver operations.
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
     int rebuild)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare = B_FALSE;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	if (rebuild) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 		if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
 		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RESILVER_IN_PROGRESS));
 		}
 	} else {
 		if (vdev_rebuild_active(rvd))
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_REBUILD_IN_PROGRESS));
 	}
 
 	if (spa->spa_vdev_removal != NULL) {
 		return (spa_vdev_exit(spa, NULL, txg,
 		    ZFS_ERR_DEVRM_IN_PROGRESS));
 	}
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
 
 	if (raidz) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 		/*
 		 * Can't expand a raidz while prior expand is in progress.
 		 */
 		if (spa->spa_raidz_expand != NULL) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
 		}
 	} else if (!oldvd->vdev_ops->vdev_op_leaf) {
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 	}
 
 	if (raidz)
 		pvd = oldvd;
 	else
 		pvd = oldvd->vdev_parent;
 
 	if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * log, dedup and special vdevs should not be replaced by spares.
 	 */
 	if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
 	    oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
 	/*
 	 * A dRAID spare can only replace a child of its parent dRAID vdev.
 	 */
 	if (newvd->vdev_ops == &vdev_draid_spare_ops &&
 	    oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
 	if (rebuild) {
 		/*
 		 * For rebuilds, the top vdev must support reconstruction
 		 * using only space maps.  This means the only allowable
 		 * vdevs types are the root vdev, a mirror, or dRAID.
 		 */
 		tvd = pvd;
 		if (pvd->vdev_top != NULL)
 			tvd = pvd->vdev_top;
 
 		if (tvd->vdev_ops != &vdev_mirror_ops &&
 		    tvd->vdev_ops != &vdev_root_ops &&
 		    tvd->vdev_ops != &vdev_draid_ops) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 	}
 
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or
 		 * the root vdev. A raidz vdev can be attached to, but
 		 * you cannot attach to a raidz child.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops &&
 		    !raidz)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.
 	 */
 	vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
 	if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
 		return (spa_vdev_exit(spa, newrootvd, txg,
 		    ZFS_ERR_ASHIFT_MISMATCH));
 	}
 
 	/*
 	 * RAIDZ-expansion-specific checks.
 	 */
 	if (raidz) {
 		if (vdev_raidz_attach_check(newvd) != 0)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * Fail early if a child is not healthy or being replaced
 		 */
 		for (int i = 0; i < oldvd->vdev_children; i++) {
 			if (vdev_is_dead(oldvd->vdev_child[i]) ||
 			    !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
 				return (spa_vdev_exit(spa, newrootvd, txg,
 				    ENXIO));
 			}
 			/* Also fail if reserved boot area is in-use */
 			if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
 			    != 0) {
 				return (spa_vdev_exit(spa, newrootvd, txg,
 				    EADDRINUSE));
 			}
 		}
 	}
 
 	if (raidz) {
 		/*
 		 * Note: oldvdpath is freed by spa_strfree(),  but
 		 * kmem_asprintf() is freed by kmem_strfree(), so we have to
 		 * move it to a spa_strdup-ed string.
 		 */
 		char *tmp = kmem_asprintf("raidz%u-%u",
 		    (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
 		oldvdpath = spa_strdup(tmp);
 		kmem_strfree(tmp);
 	} else {
 		oldvdpath = spa_strdup(oldvd->vdev_path);
 	}
 	newvdpath = spa_strdup(newvd->vdev_path);
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 */
 	if (strcmp(oldvdpath, newvdpath) == 0) {
 		spa_strfree(oldvd->vdev_path);
 		oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
 		    KM_SLEEP);
 		(void) sprintf(oldvd->vdev_path, "%s/old",
 		    newvdpath);
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 		spa_strfree(oldvdpath);
 		oldvdpath = spa_strdup(oldvd->vdev_path);
 	}
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (!raidz && pvd->vdev_ops != pvops) {
 		pvd = vdev_add_parent(oldvd, pvops);
 		ASSERT(pvd->vdev_ops == pvops);
 		ASSERT(oldvd->vdev_parent == pvd);
 	}
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(pvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	if (raidz) {
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
 		vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
 		vdev_autotrim_stop_wait(tvd);
 
 		dtl_max_txg = spa_vdev_config_enter(spa);
 
 		tvd->vdev_rz_expanding = B_TRUE;
 
 		vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
 		vdev_config_dirty(tvd);
 
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 		    dtl_max_txg);
 		dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
 		    newvd, tx);
 		dmu_tx_commit(tx);
 	} else {
 		vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 		    dtl_max_txg - TXG_INITIAL);
 
 		if (newvd->vdev_isspare) {
 			spa_spare_activate(newvd);
 			spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
 		}
 
 		newvd_isspare = newvd->vdev_isspare;
 
 		/*
 		 * Mark newvd's DTL dirty in this txg.
 		 */
 		vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 		/*
 		 * Schedule the resilver or rebuild to restart in the future.
 		 * We do this to ensure that dmu_sync-ed blocks have been
 		 * stitched into the respective datasets.
 		 */
 		if (rebuild) {
 			newvd->vdev_rebuild_txg = txg;
 
 			vdev_rebuild(tvd);
 		} else {
 			newvd->vdev_resilver_txg = txg;
 
 			if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
 			    spa_feature_is_enabled(spa,
 			    SPA_FEATURE_RESILVER_DEFER)) {
 				vdev_defer_resilver(newvd);
 			} else {
 				dsl_scan_restart_resilver(spa->spa_dsl_pool,
 				    dtl_max_txg);
 			}
 		}
 	}
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
 
 	/*
 	 * Commit the config
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	spa_history_log_internal(spa, "vdev attach", NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing or a spare vdev.
  *
  * Parameters:
  *	spa		pool containing the vdev; asserted to be writeable.
  *	guid		GUID of the vdev to detach; it must be a leaf.
  *	pguid		GUID the caller expects the vdev's parent to have, or
  *			0 to skip the parent check.
  *	replace_done	nonzero to refuse the detach unless the parent is a
  *			replacing or spare vdev.
  *
  * Returns the value produced by spa_vdev_exit().  The early-exit paths hand
  * it ZFS_ERR_CHECKPOINT_EXISTS or ZFS_ERR_DISCARDING_CHECKPOINT (checkpoint
  * feature active), ENODEV (no vdev with this guid), ENOTSUP (not a leaf, or
  * the parent type does not allow the detach) or EBUSY (parent GUID mismatch,
  * or the vdev holds the only valid copy of some data); the success path
  * hands it 0.
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * NOTE(review): spa_vdev_detach_enter() is defined elsewhere; the
 	 * MUTEX_HELD assertion further down relies on it returning with
 	 * spa_namespace_lock held.
 	 */
 	txg = spa_vdev_detach_enter(spa, guid);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	/*
 	 * Besides being called directly from the userland through the
 	 * ioctl interface, spa_vdev_detach() can be potentially called
 	 * at the end of spa_vdev_resilver_done().
 	 *
 	 * In the regular case, when we have a checkpoint this shouldn't
 	 * happen as we never empty the DTLs of a vdev during the scrub
 	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
 	 * should never get here when we have a checkpoint.
 	 *
 	 * That said, even in a case when we checkpoint the pool exactly
 	 * as spa_vdev_resilver_done() calls this function everything
 	 * should be fine as the resilver will return right away.
 	 */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 *
 	 * A pguid of 0 disables this check.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 *
 	 * A sibling matches when its path is vd's path followed by exactly
 	 * "/old"; only the first matching sibling is renamed.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a normal spare, then it
 	 * implies that the spare should become a real disk, and be removed
 	 * from the active spare list for the pool.  dRAID spares on the
 	 * other hand are coupled to the pool and thus should never be removed
 	 * from the spares list.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
 		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 		if (last_cvd->vdev_isspare &&
 		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
 			unspare = B_TRUE;
 		}
 	}
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 */
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 * When that parent is a spare vdev, vdev_unspare (possibly set just
 	 * above) is cleared on the surviving child before the parent goes.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 	}
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 *
 	 * The path is copied into vdpath here because it is logged only after
 	 * vd has been handed to spa_vdev_exit() below; "none" stands in for a
 	 * NULL path.  NOTE(review): presumably spa_vdev_exit() disposes of vd
 	 * -- confirm.
 	 */
 	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
 	spa_notify_waiters(spa);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		mutex_enter(&spa_namespace_lock);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			/*
 			 * Keep a reference on altspa across the window in
 			 * which spa_namespace_lock is dropped for the
 			 * spa_vdev_remove() call; errors from that call are
 			 * deliberately ignored.
 			 */
 			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 /*
  * Apply one initialize command to the vdev identified by 'guid'.
  *
  * The caller must hold spa_namespace_lock.  The vdev is looked up and
  * validated with SCL_CONFIG | SCL_STATE held as reader; on success
  * vdev_initialize_lock is taken before those config locks are dropped and
  * stays held until the command has been dispatched.
  *
  * 'vd_list' is passed through to vdev_initialize_stop() for the CANCEL
  * and SUSPEND commands.
  *
  * Returns 0 on success, otherwise:
  *	ENODEV	no vdev with this guid, or the vdev is detached
  *	EINVAL	the vdev is not a leaf or is not concrete
  *	EROFS	the vdev is not writeable
  *	EBUSY	START while an initialize thread exists or while the top-level
  *		vdev has vdev_removing or vdev_rz_expanding set; UNINIT
  *		while an initialize thread exists
  *	ESRCH	CANCEL when the state is neither ACTIVE nor SUSPENDED;
  *		SUSPEND when the state is not ACTIVE
  *
  * An unrecognized cmd_type panics.
  */
 static int
 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
     list_t *vd_list)
 {
 	int err = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
 	/*
 	 * Validate the target while the config locks are held.  Only a
 	 * vdev that passes every check gets its initialize lock taken.
 	 */
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_detached)
 		err = ENODEV;
 	else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd))
 		err = EINVAL;
 	else if (!vdev_writeable(vd))
 		err = EROFS;
 	else
 		mutex_enter(&vd->vdev_initialize_lock);
 
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	/*
 	 * Each command checks its own precondition and then runs.  START
 	 * and UNINIT look at vdev_initialize_thread rather than
 	 * vdev_initialize_state, because a previous initialization may have
 	 * completed while its thread has not yet exited.
 	 */
 	switch (cmd_type) {
 	case POOL_INITIALIZE_START:
 		if (vd->vdev_initialize_thread != NULL ||
 		    vd->vdev_top->vdev_removing ||
 		    vd->vdev_top->vdev_rz_expanding) {
 			err = EBUSY;
 		} else {
 			vdev_initialize(vd);
 		}
 		break;
 	case POOL_INITIALIZE_CANCEL:
 		if (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
 		    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
 			err = ESRCH;
 		} else {
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
 			    vd_list);
 		}
 		break;
 	case POOL_INITIALIZE_SUSPEND:
 		if (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
 			err = ESRCH;
 		} else {
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED,
 			    vd_list);
 		}
 		break;
 	case POOL_INITIALIZE_UNINIT:
 		if (vd->vdev_initialize_thread != NULL)
 			err = EBUSY;
 		else
 			vdev_uninitialize(vd);
 		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
 
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	return (0);
 }
 
 /*
  * Apply the initialize command 'cmd_type' to every vdev named in 'nv'.
  *
  * Each pair in 'nv' carries a vdev guid as its uint64 value.  A vdev for
  * which the command fails gets an entry in 'vdev_errlist', keyed by the
  * guid printed in decimal, holding the error as an int64.
  *
  * Returns the number of vdevs for which the command failed.
  */
 int
 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist)
 {
 	list_t stop_list;
 	nvpair_t *elem = NULL;
 	int nfailed = 0;
 
 	list_create(&stop_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	/*
 	 * The namespace lock is held for the whole operation so the pool
 	 * cannot change while initialization is being started or stopped.
 	 * The per-vdev helper takes the config and state locks so the vdev
 	 * state can be assessed before committing to the operation.
 	 */
 	mutex_enter(&spa_namespace_lock);
 
 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 		char key[MAXNAMELEN];
 		uint64_t target = fnvpair_value_uint64(elem);
 		int rc;
 
 		rc = spa_vdev_initialize_impl(spa, target, cmd_type,
 		    &stop_list);
 		if (rc == 0)
 			continue;
 
 		/* Record the failure under the guid's decimal string. */
 		(void) snprintf(key, sizeof (key), "%llu",
 		    (unsigned long long)target);
 		fnvlist_add_int64(vdev_errlist, key, rc);
 		nfailed++;
 	}
 
 	/* Wait for all initialize threads to stop. */
 	vdev_initialize_stop_wait(spa, &stop_list);
 
 	/* Sync out the initializing state */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	mutex_exit(&spa_namespace_lock);
 	list_destroy(&stop_list);
 
 	return (nfailed);
 }
 
 /*
  * Apply one TRIM command to the vdev identified by 'guid'.
  *
  * The caller must hold spa_namespace_lock.  The vdev is looked up and
  * validated with SCL_CONFIG | SCL_STATE held as reader; on success
  * vdev_trim_lock is taken before those config locks are dropped and stays
  * held until the command has been dispatched.
  *
  * 'rate', 'partial' and 'secure' are forwarded to vdev_trim() for START;
  * 'vd_list' is passed through to vdev_trim_stop() for CANCEL and SUSPEND.
  *
  * Returns 0 on success, otherwise:
  *	ENODEV		no vdev with this guid, or the vdev is detached
  *	EINVAL		the vdev is not a leaf or is not concrete
  *	EROFS		the vdev is not writeable
  *	EOPNOTSUPP	vdev_has_trim is clear, or 'secure' was requested and
  *			vdev_has_securetrim is clear
  *	EBUSY		START while a TRIM thread exists or while the top-level
  *			vdev has vdev_removing or vdev_rz_expanding set
  *	ESRCH		CANCEL when the state is neither ACTIVE nor SUSPENDED;
  *			SUSPEND when the state is not ACTIVE
  *
  * An unrecognized cmd_type panics.
  */
 static int
 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
 {
 	int err = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
 	/*
 	 * Validate the target while the config locks are held.  Only a
 	 * vdev that passes every check gets its TRIM lock taken.
 	 */
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_detached)
 		err = ENODEV;
 	else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd))
 		err = EINVAL;
 	else if (!vdev_writeable(vd))
 		err = EROFS;
 	else if (!vd->vdev_has_trim)
 		err = EOPNOTSUPP;
 	else if (secure && !vd->vdev_has_securetrim)
 		err = EOPNOTSUPP;
 	else
 		mutex_enter(&vd->vdev_trim_lock);
 
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	/*
 	 * Each command checks its own precondition and then runs.  START
 	 * looks at vdev_trim_thread rather than vdev_trim_state, because a
 	 * previous TRIM may have completed while its thread has not yet
 	 * exited.
 	 */
 	switch (cmd_type) {
 	case POOL_TRIM_START:
 		if (vd->vdev_trim_thread != NULL ||
 		    vd->vdev_top->vdev_removing ||
 		    vd->vdev_top->vdev_rz_expanding) {
 			err = EBUSY;
 		} else {
 			vdev_trim(vd, rate, partial, secure);
 		}
 		break;
 	case POOL_TRIM_CANCEL:
 		if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
 		    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
 			err = ESRCH;
 		} else {
 			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
 		}
 		break;
 	case POOL_TRIM_SUSPEND:
 		if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE)
 			err = ESRCH;
 		else
 			vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
 		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
 
 	mutex_exit(&vd->vdev_trim_lock);
 
 	if (err != 0)
 		return (SET_ERROR(err));
 
 	return (0);
 }
 
 /*
  * Initiates a manual TRIM for the requested vdevs.  This kicks off
  * individual TRIM threads for each child vdev.  These threads pass over
  * all of the free space in the vdev's metaslabs and issue TRIM commands
  * for that space.
  *
  * Each pair in 'nv' carries a vdev guid as its uint64 value; 'cmd_type',
  * 'rate', 'partial' and 'secure' are handed to spa_vdev_trim_impl() for
  * every one of them.  A vdev for which the command fails gets an entry in
  * 'vdev_errlist', keyed by the guid printed in decimal, holding the error
  * as an int64.
  *
  * Returns the number of vdevs for which the command failed.
  */
 int
 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
 {
 	list_t stop_list;
 	nvpair_t *elem = NULL;
 	int nfailed = 0;
 
 	list_create(&stop_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	/*
 	 * The namespace lock is held for the whole operation so the pool
 	 * cannot change while TRIM is being started or stopped.  The
 	 * per-vdev helper takes the config and state locks so the vdev
 	 * state can be assessed before committing to the TRIM operation.
 	 */
 	mutex_enter(&spa_namespace_lock);
 
 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 		char key[MAXNAMELEN];
 		uint64_t target = fnvpair_value_uint64(elem);
 		int rc;
 
 		rc = spa_vdev_trim_impl(spa, target, cmd_type, rate, partial,
 		    secure, &stop_list);
 		if (rc == 0)
 			continue;
 
 		/* Record the failure under the guid's decimal string. */
 		(void) snprintf(key, sizeof (key), "%llu",
 		    (unsigned long long)target);
 		fnvlist_add_int64(vdev_errlist, key, rc);
 		nfailed++;
 	}
 
 	/* Wait for all TRIM threads to stop. */
 	vdev_trim_stop_wait(spa, &stop_list);
 
 	/* Sync out the TRIM state */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	mutex_exit(&spa_namespace_lock);
 	list_destroy(&stop_list);
 
 	return (nfailed);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	const char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_reset_logs(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* first, check to ensure we've got the right child count */
 	rvd = spa->spa_root_vdev;
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
 		    !vdev_is_concrete(vd))) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
 
 		/* deal with indirect vdevs */
 		if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
 		    &vdev_indirect_ops)
 			continue;
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = SET_ERROR(ENODEV);
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    !vdev_is_concrete(vml[c]) ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c]) ||
 		    vdev_resilver_needed(vml[c], NULL, NULL)) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/* we need certain info from the top level */
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift);
 
 		/* transfer per-vdev ZAPs */
 		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
 		VERIFY0(nvlist_add_uint64(child[c],
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
 
 		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
 		VERIFY0(nvlist_add_uint64(child[c],
 		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    vml[c]->vdev_parent->vdev_top_zap));
 	}
 
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	nvl = fnvlist_alloc();
 	fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL));
 	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 	/*
 	 * Temporarily stop the initializing and TRIM activity.  We set the
 	 * state to ACTIVE so that we know to resume initializing or TRIM
 	 * once the split has completed.
 	 */
 	list_t vd_initialize_list;
 	list_create(&vd_initialize_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	list_t vd_trim_list;
 	list_create(&vd_trim_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
 			mutex_enter(&vml[c]->vdev_initialize_lock);
 			vdev_initialize_stop(vml[c],
 			    VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
 			mutex_exit(&vml[c]->vdev_initialize_lock);
 
 			mutex_enter(&vml[c]->vdev_trim_lock);
 			vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
 			mutex_exit(&vml[c]->vdev_trim_lock);
 		}
 	}
 
 	vdev_initialize_stop_wait(spa, &vd_initialize_list);
 	vdev_trim_stop_wait(spa, &vd_trim_list);
 
 	list_destroy(&vd_initialize_list);
 	list_destroy(&vd_trim_list);
 
 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
 	newspa->spa_is_splitting = B_TRUE;
 
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		newspa->spa_config_splitting = fnvlist_alloc();
 		fnvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/* finally, update the original pool's config */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
 			vdev_t *tvd = vml[c]->vdev_top;
 
 			/*
 			 * Need to be sure the detachable VDEV is not
 			 * on any *other* txg's DTL list to prevent it
 			 * from being accessed after it's freed.
 			 */
 			for (int t = 0; t < TXG_SIZE; t++) {
 				(void) txg_list_remove_this(
 				    &tvd->vdev_dtl_list, vml[c], t);
 			}
 
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
 
 			vdev_free(vml[c]);
 		}
 	}
 	spa->spa_avz_action = AVZ_ACTION_REBUILD;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(newspa, "split", NULL,
 	    "from pool %s", spa_name(spa));
 
 	newspa->spa_is_splitting = B_FALSE;
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 
 	/* restart initializing or trimming disks as necessary */
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Find any device that's done replacing, or a vdev marked 'unspare' that's
  * currently spared, so we can detach it.
  *
  * The subtree rooted at 'vd' is searched recursively, children first, and
  * the first candidate found is returned; NULL means nothing in the subtree
  * qualified.  This is not a pure lookup: vdev_propagate_state() is called
  * on a spare vdev whose 'unspare' check did not yield a candidate.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
 
 	/* A candidate found anywhere below 'vd' wins over 'vd' itself. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed replacement.  We always consider the first
 	 * vdev in the list to be the oldest vdev, and the last one to be
 	 * the newest (see spa_vdev_attach() for how that works).  In
 	 * the case where the newest vdev is faulted, we will not automatically
 	 * remove it after a resilver completes.  This is OK as it will require
 	 * user intervention to determine which disk the admin wishes to keep.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		newvd = vd->vdev_child[vd->vdev_children - 1];
 		oldvd = vd->vdev_child[0];
 
 		/*
 		 * The old device may go only when the new one has nothing
 		 * missing and the old one's DTL is no longer required.
 		 */
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 * Also potentially update faulted state.
 	 */
 	if (vd->vdev_ops == &vdev_spare_ops) {
 		vdev_t *first = vd->vdev_child[0];
 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
 
 		/*
 		 * Whichever end carries vdev_unspare is the device to keep
 		 * (newvd); the opposite end is the detach candidate (oldvd).
 		 */
 		if (last->vdev_unspare) {
 			oldvd = first;
 			newvd = last;
 		} else if (first->vdev_unspare) {
 			oldvd = last;
 			newvd = first;
 		} else {
 			/*
 			 * newvd is left unset here; it is only read below
 			 * when oldvd is non-NULL.
 			 */
 			oldvd = NULL;
 		}
 
 		if (oldvd != NULL &&
 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 
 		vdev_propagate_state(vd);
 
 		/*
 		 * If there are more than two spares attached to a disk,
 		 * and those spares are not required, then we want to
 		 * attempt to free them up now so that they can be used
 		 * by other pools.  Once we're back down to a single
 		 * disk+spare, we stop removing them.
 		 */
 		if (vd->vdev_children > 2) {
 			/* From here on newvd names the second child. */
 			newvd = vd->vdev_child[1];
 
 			if (newvd->vdev_isspare && last->vdev_isspare &&
 			    vdev_dtl_empty(last, DTL_MISSING) &&
 			    vdev_dtl_empty(last, DTL_OUTAGE) &&
 			    !vdev_dtl_required(newvd))
 				return (newvd);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Detach every vdev that spa_vdev_resilver_done_hunt() reports, then notify
  * waiters on the pool.
  *
  * SCL_ALL is held as writer while hunting and while the guids are sampled,
  * and is dropped around the spa_vdev_detach() calls.  If any detach fails
  * the function returns at once, without reaching spa_notify_waiters().
  */
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	for (;;) {
 		vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev);
 		if (vd == NULL)
 			break;
 
 		/* Sample everything we need while the config lock is held. */
 		vdev_t *pvd = vd->vdev_parent;
 		vdev_t *ppvd = pvd->vdev_parent;
 		uint64_t guid = vd->vdev_guid;
 		uint64_t pguid = pvd->vdev_guid;
 		uint64_t ppguid = ppvd->vdev_guid;
 		uint64_t sguid = 0;
 
 		/*
 		 * When the vdev being detached sits under a replacing vdev
 		 * that is itself child 0 of a two-way spare vdev, we have
 		 * just finished replacing a hot spared device, so the
 		 * spare vdev's other child (the original hot spare) has to
 		 * be detached as well.
 		 */
 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
 		if (sguid != 0 &&
 		    spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Replace waiters are not notified when no detach took place above,
 	 * so notify them here.
 	 */
 	spa_notify_waiters(spa);
 }
 
 /*
  * Update the stored path or FRU for this vdev.
  *
  *	spa	pool containing the vdev; asserted to be writeable
  *	guid	guid of the vdev to update; it must be a leaf
  *	value	new string; a copy is stored, the caller keeps ownership
  *	ispath	B_TRUE updates vdev_path, B_FALSE updates vdev_fru
  *
  * Returns the value of spa_vdev_state_exit().  ENOENT is passed to it when
  * no vdev matches 'guid', ENOTSUP when the vdev is not a leaf, and 0
  * otherwise.  The vdev itself is handed to spa_vdev_state_exit() only when
  * the stored string actually changed.
  */
 static int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *vd;
 	char **strp;
 	boolean_t sync = B_FALSE;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	strp = ispath ? &vd->vdev_path : &vd->vdev_fru;
 
 	/*
 	 * The current string may be unset.  A NULL value must never be
 	 * passed to strcmp() or spa_strfree(), so it is simply replaced;
 	 * this is the handling the FRU string always had, now applied to
 	 * the path as well.
 	 */
 	if (*strp == NULL) {
 		*strp = spa_strdup(value);
 		sync = B_TRUE;
 	} else if (strcmp(value, *strp) != 0) {
 		spa_strfree(*strp);
 		*strp = spa_strdup(value);
 		sync = B_TRUE;
 	}
 
 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
 }
 
 /*
  * Store 'newpath' as the path of the vdev identified by 'guid'.
  * Returns whatever spa_vdev_set_common() returns.
  */
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	int error = spa_vdev_set_common(spa, guid, newpath, B_TRUE);
 
 	return (error);
 }
 
 /*
  * Store 'newfru' as the FRU of the vdev identified by 'guid'.
  * Returns whatever spa_vdev_set_common() returns.
  */
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	int error = spa_vdev_set_common(spa, guid, newfru, B_FALSE);
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 /*
  * Forward a scrub pause/resume command to dsl_scrub_set_pause_resume().
  * Returns EBUSY instead when dsl_scan_resilvering() reports a resilver.
  * The caller must not hold SCL_ALL as writer.
  */
 int
 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (!dsl_scan_resilvering(spa->spa_dsl_pool))
 		return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
 
 	return (SET_ERROR(EBUSY));
 }
 
 /*
  * Hand a scan cancellation to dsl_scan_cancel().  Returns EBUSY instead
  * when dsl_scan_resilvering() reports a resilver.  The caller must not
  * hold SCL_ALL as writer.
  */
 int
 spa_scan_stop(spa_t *spa)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (!dsl_scan_resilvering(spa->spa_dsl_pool))
 		return (dsl_scan_cancel(spa->spa_dsl_pool));
 
 	return (SET_ERROR(EBUSY));
 }
 
 /*
  * Start a scan of type 'func' with no txg range, i.e. spa_scan_range()
  * with both txgstart and txgend set to 0.
  */
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	const uint64_t txgstart = 0;
 	const uint64_t txgend = 0;
 
 	return (spa_scan_range(spa, func, txgstart, txgend));
 }
 
 /*
  * Validate a scan request and pass it to dsl_scan().
  *
  * ENOTSUP is returned when:
  *   - 'func' is POOL_SCAN_NONE or not below POOL_SCAN_FUNCS;
  *   - a resilver is requested without SPA_FEATURE_RESILVER_DEFER enabled;
  *   - a txg range is given for anything other than POOL_SCAN_SCRUB;
  *   - an error scrub is requested without SPA_FEATURE_HEAD_ERRLOG enabled.
  *
  * A resilver request for which vdev_resilver_needed() is false posts
  * SPA_ASYNC_RESILVER_DONE and returns 0 without scanning.  The caller must
  * not hold SCL_ALL as writer.
  */
 int
 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart,
     uint64_t txgend)
 {
 	const int ranged = (txgstart != 0 || txgend != 0);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (func == POOL_SCAN_NONE || func >= POOL_SCAN_FUNCS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (func == POOL_SCAN_RESILVER) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
 			return (SET_ERROR(ENOTSUP));
 
 		if (ranged)
 			return (SET_ERROR(ENOTSUP));
 
 		/*
 		 * A resilver with no DTL on a writeable leaf device has
 		 * nothing to do.
 		 */
 		if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 			return (0);
 		}
 	} else {
 		if (func != POOL_SCAN_SCRUB && ranged)
 			return (SET_ERROR(ENOTSUP));
 
 		if (func == POOL_SCAN_ERRORSCRUB &&
 		    !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
 			return (SET_ERROR(ENOTSUP));
 	}
 
 	return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend));
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 /*
  * Walk the vdev tree rooted at 'vd'.  Every vdev with vdev_remove_wanted
  * set is moved to VDEV_STATE_REMOVED, has its error counters zeroed, has
  * its top-level vdev's state dirtied, and is reported via
  * zfs_post_remove().
  */
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	int child = 0;
 
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * Only the error counters are reset.  A full vdev_clear()
 		 * would also discard degraded/faulted state and try to
 		 * reopen the device, which is wasted effort here.
 		 */
 		vd->vdev_stat.vs_read_errors = 0;
 		vd->vdev_stat.vs_write_errors = 0;
 		vd->vdev_stat.vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 
 		/* Let userspace know the vdev has gone away. */
 		zfs_post_remove(spa, vd);
 	}
 
 	while (child < vd->vdev_children) {
 		spa_async_remove(spa, vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * Walk the vdev tree rooted at 'vd'.  Every vdev with vdev_fault_wanted
  * set has the flag cleared and is moved to VDEV_STATE_FAULTED with
  * VDEV_AUX_ERR_EXCEEDED.
  */
 static void
 spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
 {
 	int child = 0;
 
 	if (vd->vdev_fault_wanted) {
 		vd->vdev_fault_wanted = B_FALSE;
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	}
 
 	while (child < vd->vdev_children) {
 		spa_async_fault_vdev(spa, vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * If the pool has spa_autoexpand set, post ESC_ZFS_VDEV_AUTOEXPAND for
  * every leaf vdev under 'vd' that has a physical path.  Children are
  * visited before 'vd' itself.
  */
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	if (!spa->spa_autoexpand)
 		return;
 
 	for (int i = 0; i < vd->vdev_children; i++)
 		spa_async_autoexpand(spa, vd->vdev_child[i]);
 
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_physpath != NULL) {
 		spa_event_notify(vd->vdev_spa, vd, NULL,
 		    ESC_ZFS_VDEV_AUTOEXPAND);
 	}
 }
 
 /*
  * Body of the pool's async thread ('arg' is the spa_t).
  *
  * The pending task bits are claimed and cleared in one step under
  * spa_async_lock, each claimed task is then serviced in the fixed order
  * below, and finally spa_async_thread is reset to NULL, spa_async_cv is
  * broadcast and the thread exits.  The function never returns.
  */
 static __attribute__((noreturn)) void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = (spa_t *)arg;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	/*
 	 * Take a private snapshot of the requested tasks.  Bits set after
 	 * this point are not seen by this invocation.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks = 0;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		/*
 		 * Sum the space of the normal, special, dedup and embedded
 		 * log classes before and after the update, all under
 		 * spa_namespace_lock.
 		 */
 		mutex_enter(&spa_namespace_lock);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		old_space += metaslab_class_get_space(spa_special_class(spa));
 		old_space += metaslab_class_get_space(spa_dedup_class(spa));
 		old_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		new_space += metaslab_class_get_space(spa_special_class(spa));
 		new_space += metaslab_class_get_space(spa_dedup_class(spa));
 		new_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 *
 		 * NOTE(review): the test below is '!=', not '>', so any
 		 * change in size is logged and the "(+%llu)" delta would
 		 * wrap if the size ever decreased -- confirm that growth
 		 * is the only possible outcome here.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), (u_longlong_t)new_space,
 			    (u_longlong_t)(new_space - old_space));
 		}
 	}
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
 		/* Covers the main vdev tree plus l2cache and spare devices. */
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/* Autoexpand is skipped entirely while the pool is suspended. */
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be marked faulted.
 	 */
 	if (tasks & SPA_ASYNC_FAULT_VDEV) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_fault_vdev(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE ||
 	    tasks & SPA_ASYNC_REBUILD_DONE ||
 	    tasks & SPA_ASYNC_DETACH_SPARE) {
 		spa_vdev_resilver_done(spa);
 	}
 
 	/*
 	 * Kick off a resilver.
 	 *
 	 * This is skipped while a rebuild is active, and also when a
 	 * resilver is already running on a pool that has the resilver_defer
 	 * feature enabled.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER &&
 	    !vdev_rebuild_active(spa->spa_root_vdev) &&
 	    (!dsl_scan_resilvering(dp) ||
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
 		dsl_scan_restart_resilver(dp, 0);
 
 	/*
 	 * Each of the restart/kick-off tasks below takes spa_namespace_lock
 	 * first and then a reader hold on the config lock, releasing them
 	 * in the reverse order.
 	 */
 	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_initialize_restart(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (tasks & SPA_ASYNC_TRIM_RESTART) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_trim_restart(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/*
 	 * Kick off L2 cache whole device TRIM.
 	 */
 	if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_trim_l2arc(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/*
 	 * Kick off L2 cache rebuilding.
 	 */
 	if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
 		/* Unlike the tasks above, this one holds SCL_L2ARC. */
 		mutex_enter(&spa_namespace_lock);
 		spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
 		l2arc_spa_rebuild_start(spa);
 		spa_config_exit(spa, SCL_L2ARC, FTAG);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Suspend async activity on the pool.
  *
  * The suspend count is raised and the caller blocks until no async thread
  * is running.  After that, device removal is suspended and each of the
  * pool's zthrs that exists (condense, raidz expand, checkpoint discard,
  * livelist delete, livelist condense) is cancelled, in that order.
  */
 void
 spa_async_suspend(spa_t *spa)
 {
 	/*
 	 * Pointers to the fields, not their values: each field is read
 	 * only when its turn comes in the loop below.
 	 */
 	zthr_t **zthrs[] = {
 		&spa->spa_condense_zthr,
 		&spa->spa_raidz_expand_zthr,
 		&spa->spa_checkpoint_discard_zthr,
 		&spa->spa_livelist_delete_zthr,
 		&spa->spa_livelist_condense_zthr,
 	};
 
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
 	while (spa->spa_async_thread != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_vdev_remove_suspend(spa);
 
 	for (size_t i = 0; i < sizeof (zthrs) / sizeof (zthrs[0]); i++) {
 		zthr_t *t = *zthrs[i];
 
 		if (t != NULL)
 			zthr_cancel(t);
 	}
 }
 
 /*
  * Undo one spa_async_suspend().
  *
  * The suspend count (asserted to be non-zero) is lowered, removal is
  * restarted, and each of the pool's zthrs that exists (condense, raidz
  * expand, checkpoint discard, livelist delete, livelist condense) is
  * resumed, in that order.
  */
 void
 spa_async_resume(spa_t *spa)
 {
 	/*
 	 * Pointers to the fields, not their values: each field is read
 	 * only when its turn comes in the loop below.
 	 */
 	zthr_t **zthrs[] = {
 		&spa->spa_condense_zthr,
 		&spa->spa_raidz_expand_zthr,
 		&spa->spa_checkpoint_discard_zthr,
 		&spa->spa_livelist_delete_zthr,
 		&spa->spa_livelist_condense_zthr,
 	};
 
 	mutex_enter(&spa->spa_async_lock);
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended--;
 	mutex_exit(&spa->spa_async_lock);
 	spa_restart_removal(spa);
 
 	for (size_t i = 0; i < sizeof (zthrs) / sizeof (zthrs[0]); i++) {
 		zthr_t *t = *zthrs[i];
 
 		if (t != NULL)
 			zthr_resume(t);
 	}
 }
 
 /*
  * Report whether the pool has async work that may run now.
  *
  * Any task other than SPA_ASYNC_CONFIG_UPDATE counts immediately.  A
  * pending config update counts only when it is not held back: it is held
  * back while spa_ccw_fail_time is non-zero and less than
  * zfs_ccw_retry_interval seconds have passed since that time.
  */
 static boolean_t
 spa_async_tasks_pending(spa_t *spa)
 {
 	uint_t non_config_tasks =
 	    spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
 	uint_t config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
 	boolean_t config_task_suspended = B_FALSE;
 
 	/* The clock is consulted only when a failure time is recorded. */
 	if (spa->spa_ccw_fail_time != 0) {
 		config_task_suspended =
 		    (gethrtime() - spa->spa_ccw_fail_time) <
 		    ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
 	}
 
 	return (non_config_tasks || (config_task && !config_task_suspended));
 }
 
 /*
  * Create the pool's async thread if there is runnable work, async
  * processing is not suspended, and no async thread exists already.  All
  * three conditions are evaluated under spa_async_lock.
  */
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	if (spa_async_tasks_pending(spa)) {
 		if (!spa->spa_async_suspended &&
 		    spa->spa_async_thread == NULL) {
 			spa->spa_async_thread = thread_create(NULL, 0,
 			    spa_async_thread, spa, 0, &p0, TS_RUN,
 			    maxclsyspri);
 		}
 	}
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Record the task bit(s) in 'task' as pending for the pool, under
  * spa_async_lock, after logging the request.  This function only sets the
  * bits; it does not start the async thread itself.
  */
 void
 spa_async_request(spa_t *spa, int task)
 {
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks = spa->spa_async_tasks | task;
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Return the pool's currently pending async task bits.  The field is read
  * without taking spa_async_lock.
  */
 int
 spa_async_tasks(spa_t *spa)
 {
 	int pending = spa->spa_async_tasks;
 
 	return (pending);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 
 /*
  * Callback adapter around bpobj_enqueue(): 'arg' is the target bpobj_t.
  * Always returns 0.
  */
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	bpobj_enqueue((bpobj_t *)arg, bp, bp_freed, tx);
 
 	return (0);
 }
 
 /*
  * Enqueue 'bp' on the bpobj passed as 'arg' with bp_freed == B_FALSE.
  */
 int
 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	int ret = bpobj_enqueue_cb(arg, bp, B_FALSE, tx);
 
 	return (ret);
 }
 
 /*
  * Enqueue 'bp' on the bpobj passed as 'arg' with bp_freed == B_TRUE.
  */
 int
 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	int ret = bpobj_enqueue_cb(arg, bp, B_TRUE, tx);
 
 	return (ret);
 }
 
 /*
  * Issue a free of 'bp' as a child of the parent zio passed as 'arg', in
  * the txg of 'tx', without waiting for it.  Always returns 0.
  */
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *pio = arg;
 	zio_t *fio;
 
 	fio = zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
 	    pio->io_flags);
 	zio_nowait(fio);
 
 	return (0);
 }
 
 /*
  * Adapter giving spa_free_sync_cb() the callback signature that carries a
  * bp_freed flag.  The flag is asserted to be false and otherwise ignored.
  */
 static int
 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	int ret;
 
 	ASSERT(!bp_freed);
 	ret = spa_free_sync_cb(arg, bp, tx);
 
 	return (ret);
 }
 
 /*
  * Free every block pointer on 'bpl' in the txg of 'tx'.  The frees are
  * issued as children of a single root zio, which is then waited on; any
  * error from the wait is fatal.
  *
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing frees.
  */
 static void
 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
 	/* VERIFY0 reports the failing value, unlike VERIFY(x == 0). */
 	VERIFY0(zio_wait(zio));
 }
 
 /*
  * Free every block recorded in spa_deferred_bpobj.  This only happens in
  * sync pass 1; in any other pass the function returns without doing
  * anything.  A failure to iterate the bpobj or to complete the frees is
  * fatal.
  *
  * Note: this simple function is not inlined to make it easier to dtrace the
  * amount of time spent syncing deferred frees.
  */
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	/*
 	 * Note:
 	 * If the log space map feature is active, we stop deferring
 	 * frees to the next TXG and therefore running this function
 	 * would be considered a no-op as spa_deferred_bpobj should
 	 * not have any entries.
 	 *
 	 * That said we run this function anyway (instead of returning
 	 * immediately) for the edge-case scenario where we just
 	 * activated the log space map feature in this TXG but we have
 	 * deferred frees from the previous TXG.
 	 */
 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
 	VERIFY0(bpobj_iterate(&spa->spa_deferred_bpobj,
 	    bpobj_spa_free_sync_cb, zio, tx));
 	VERIFY0(zio_wait(zio));
 }
 
 /*
  * Write 'nv', XDR-encoded, into MOS object 'obj' as part of 'tx'.
  *
  * The packed nvlist is written at offset 0, zero-padded up to a multiple
  * of SPA_CONFIG_BLOCKSIZE, and its unpadded size is stored as a uint64_t
  * at the start of the object's bonus buffer.  Any failure to size or pack
  * the nvlist, or to hold the bonus buffer, is fatal.
  */
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	char *packed = NULL;
 	size_t bufsize;
 	size_t nvsize = 0;
 	dmu_buf_t *db;
 
 	VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR));
 
 	/*
 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
 	 * information.  This avoids the dmu_buf_will_dirty() path and
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
 	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = vmem_alloc(bufsize, KM_SLEEP);
 
 	VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 	    KM_SLEEP));
 	/* Zero the padding between the packed data and the block end. */
 	memset(packed + nvsize, 0, bufsize - nvsize);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
 
 	vmem_free(packed, bufsize);
 
 	/* Record the real (unpadded) size in the bonus buffer. */
 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	*(uint64_t *)db->db_data = nvsize;
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Sync the list of auxiliary devices in 'sav' to the MOS, if
  * sav->sav_sync is set.
  *
  *	sav	auxiliary vdev set to write out
  *	config	nvlist key under which the device array is stored
  *	entry	pool directory entry naming the backing object
  *
  * The backing object is allocated, and linked into the pool directory
  * under 'entry', the first time through.  An nvlist holding one generated
  * config per device (an empty array when there are none) is then written
  * to it, and sav->sav_sync is cleared.
  */
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *nvroot;
 	nvlist_t **list;
 	int i;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Update the MOS nvlist describing the list of available devices.
 	 * spa_validate_aux() will have already made sure this nvlist is
 	 * valid and the vdevs are labeled appropriately.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY0(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx));
 	}
 
 	nvroot = fnvlist_alloc();
 	if (sav->sav_count == 0) {
 		fnvlist_add_nvlist_array(nvroot, config,
 		    (const nvlist_t * const *)NULL, 0);
 	} else {
 		/* One size, derived from the element type, for alloc+free. */
 		size_t listsz = sav->sav_count * sizeof (*list);
 
 		list = kmem_alloc(listsz, KM_SLEEP);
 		for (i = 0; i < sav->sav_count; i++)
 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
 			    B_FALSE, VDEV_CONFIG_L2CACHE);
 		fnvlist_add_nvlist_array(nvroot, config,
 		    (const nvlist_t * const *)list, sav->sav_count);
 		/* The array was copied into nvroot; drop our copies. */
 		for (i = 0; i < sav->sav_count; i++)
 			nvlist_free(list[i]);
 		kmem_free(list, listsz);
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 	nvlist_free(nvroot);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * Populate the all-vdev ZAP 'avz' from the vdev tree rooted at 'vd'.
  *
  * For 'vd' and every descendant, each non-zero root, top-level and leaf
  * ZAP object number is added to 'avz'; the root ZAP is added only while
  * SPA_FEATURE_AVZ_V2 is active.  Any failed insertion is fatal.  The
  * all-vdev ZAP must be empty.
  */
 static void
 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t child = 0;
 
 	if (vd->vdev_root_zap != 0) {
 		if (spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
 			VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 			    vd->vdev_root_zap, tx));
 		}
 	}
 
 	if (vd->vdev_top_zap != 0) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_top_zap, tx));
 	}
 
 	if (vd->vdev_leaf_zap != 0) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_leaf_zap, tx));
 	}
 
 	while (child < vd->vdev_children) {
 		spa_avz_build(vd->vdev_child[child], avz, tx);
 		child++;
 	}
 }
 
 /*
  * Bring the pool's all-vdev ZAP (AVZ) up to date according to
  * spa_avz_action, then generate a fresh pool config, remember it in
  * spa_config_syncing and write it to the MOS config object.
  *
  * Nothing is done when the config dirty list is empty and no AVZ action is
  * pending.  The AVZ work and config generation run with SCL_STATE held as
  * reader; the final write happens after it is dropped.
  */
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
 	/*
 	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
 	 * its config may not be dirty but we still need to build per-vdev ZAPs.
 	 * Similarly, if the pool is being assembled (e.g. after a split), we
 	 * need to rebuild the AVZ although the config may not be dirty.
 	 */
 	if (list_is_empty(&spa->spa_config_dirty_list) &&
 	    spa->spa_avz_action == AVZ_ACTION_NONE)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	/* REBUILD and DESTROY both operate on an existing AVZ. */
 	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
 	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
 	    spa->spa_all_vdev_zaps != 0);
 
 	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
 		/* Make and build the new AVZ */
 		uint64_t new_avz = zap_create(spa->spa_meta_objset,
 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
 
 		/* Diff old AVZ with new one */
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t vdzap = za->za_first_integer;
 			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
 			    vdzap) == ENOENT) {
 				/*
 				 * ZAP is listed in old AVZ but not in new one;
 				 * destroy it
 				 */
 				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
 				    tx));
 			}
 		}
 
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 
 		/* Destroy the old AVZ */
 		VERIFY0(zap_destroy(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, tx));
 
 		/* Replace the old AVZ in the dir obj with the new one */
 		VERIFY0(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
 		    sizeof (new_avz), 1, &new_avz, tx));
 
 		spa->spa_all_vdev_zaps = new_avz;
 	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 
 		/* Walk through the AVZ and destroy all listed ZAPs */
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t zap = za->za_first_integer;
 			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
 		}
 
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 
 		/* Destroy and unlink the AVZ itself */
 		VERIFY0(zap_destroy(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, tx));
 		VERIFY0(zap_remove(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
 		spa->spa_all_vdev_zaps = 0;
 	}
 
 	/*
 	 * Make sure an AVZ exists from here on.  This also recreates one
 	 * right after the DESTROY branch above zeroed spa_all_vdev_zaps.
 	 */
 	if (spa->spa_all_vdev_zaps == 0) {
 		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
 		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_VDEV_ZAP_MAP, tx);
 	}
 	/* The requested action, if any, has now been carried out. */
 	spa->spa_avz_action = AVZ_ACTION_NONE;
 
 	/* Create ZAPs for vdevs that don't have them. */
 	vdev_construct_zaps(spa->spa_root_vdev, tx);
 
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
 	/*
 	 * If we're upgrading the spa version then make sure that
 	 * the config object gets updated with the correct version.
 	 */
 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 		    spa->spa_uberblock.ub_version);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	/* The new config replaces any previously remembered one. */
 	nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
 
 /*
  * Sync callback that raises the pool's on-disk version.
  *
  * 'arg' points to the new version as a uint64_t.  The version is stored in
  * the in-core uberblock, the root vdev's config is dirtied, and the change
  * is written to the pool history.
  */
 static void
 spa_sync_version(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t version = *(uint64_t *)arg;
 
 	/*
 	 * Pool creation sets the version through a separate, special-cased
 	 * path, so this must never run in the initial txg.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	ASSERT(version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_history_log_internal(spa, "set", tx, "version=%lld",
 	    (longlong_t)version);
 }
 
 /*
  * Set zpool properties.
  *
  * Sync callback: 'arg' is an nvlist of property name/value pairs.  Every
  * pair is handled in turn while spa_props_lock is held.  'version',
  * 'altroot', 'readonly' and 'cachefile' are not written here; 'comment'
  * and 'compatibility' are kept in the spa_t and dirty the vdev config;
  * "feature@..." names enable the named feature; everything else is written
  * to the pool props ZAP in the MOS, and selected values are also mirrored
  * into spa_t fields.
  */
 static void
 spa_sync_props(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		uint64_t intval;
 		const char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		const char *elemname = nvpair_name(elem);
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(elemname)) {
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			strval = fnvpair_value_string(elem);
 			/* Replace any previous comment with a private copy. */
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  We also need to
 			 * update the cache file to keep it in sync with the
 			 * MOS version. It's unnecessary to do this for pool
 			 * creation since the vdev's configuration has already
 			 * been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL) {
 				vdev_config_dirty(spa->spa_root_vdev);
 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			}
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", elemname, strval);
 			break;
 		case ZPOOL_PROP_COMPATIBILITY:
 			strval = fnvpair_value_string(elem);
 			/* Replace any previous value with a private copy. */
 			if (spa->spa_compatibility != NULL)
 				spa_strfree(spa->spa_compatibility);
 			spa->spa_compatibility = spa_strdup(strval);
 			/*
 			 * Dirty the configuration on vdevs as above.
 			 */
 			if (tx->tx_txg != TXG_INITIAL) {
 				vdev_config_dirty(spa->spa_root_vdev);
 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			}
 
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", nvpair_name(elem), strval);
 			break;
 
 		case ZPOOL_PROP_INVAL:
 			/*
 			 * Not a native pool property: either a feature
 			 * ("...@<name>"), a user property, or unexpected.
 			 */
 			if (zpool_prop_feature(elemname)) {
 				/* The feature name follows the '@'. */
 				fname = strchr(elemname, '@') + 1;
 				VERIFY0(zfeature_lookup_name(fname, &fid));
 
 				spa_feature_enable(spa, fid, tx);
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=enabled", elemname);
 				break;
 			} else if (!zfs_prop_user(elemname)) {
 				/*
 				 * zpool_prop_feature() was already false
 				 * for this name above, so this assertion
 				 * presumably exists to trip on debug builds
 				 * for a name that is neither a feature nor
 				 * a user property; otherwise the pair is
 				 * skipped.
 				 */
 				ASSERT(zpool_prop_feature(elemname));
 				break;
 			}
 			/* User properties are stored like any other below. */
 			zfs_fallthrough;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			if (prop == ZPOOL_PROP_INVAL) {
 				propname = elemname;
 				proptype = PROP_TYPE_STRING;
 			} else {
 				propname = zpool_prop_to_name(prop);
 				proptype = zpool_prop_get_type(prop);
 			}
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				/* Stored as bytes, including the NUL. */
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", elemname, strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				if (proptype == PROP_TYPE_INDEX) {
 					/*
 					 * The lookup result is discarded;
 					 * only its success is verified.
 					 */
 					const char *unused;
 					VERIFY0(zpool_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				/* Stored as a single 8-byte integer. */
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", elemname,
 				    (longlong_t)intval);
 
 				/* Mirror selected values into the spa_t. */
 				switch (prop) {
 				case ZPOOL_PROP_DELEGATION:
 					spa->spa_delegation = intval;
 					break;
 				case ZPOOL_PROP_BOOTFS:
 					spa->spa_bootfs = intval;
 					break;
 				case ZPOOL_PROP_FAILUREMODE:
 					spa->spa_failmode = intval;
 					break;
 				case ZPOOL_PROP_AUTOTRIM:
 					spa->spa_autotrim = intval;
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOTRIM_RESTART);
 					break;
 				case ZPOOL_PROP_AUTOEXPAND:
 					spa->spa_autoexpand = intval;
 					if (tx->tx_txg != TXG_INITIAL)
 						spa_async_request(spa,
 						    SPA_ASYNC_AUTOEXPAND);
 					break;
 				case ZPOOL_PROP_MULTIHOST:
 					spa->spa_multihost = intval;
 					break;
 				case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
 					spa->spa_dedup_table_quota = intval;
 					break;
 				default:
 					break;
 				}
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Apply the one-time on-disk changes that go with a pool upgrade.
  *
  * spa_version() does not yet report the new version during this txg, so
  * nothing the upgrade code depends on may change in this txg after it has
  * run.  Callers must therefore invoke this only after dsl_pool_sync() has
  * executed the sync tasks.  All work is confined to sync pass 1.
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	dsl_pool_t *pool;
 	objset_t *mos;
 
 	/* Later passes of the same txg have nothing left to upgrade. */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	pool = spa->spa_dsl_pool;
 	mos = spa->spa_meta_objset;
 	rrw_enter(&pool->dp_config_rwlock, RW_WRITER, FTAG);
 
 	/*
 	 * Each version stanza below fires only when the last synced
 	 * uberblock (spa_ubsync) is still below a threshold that the
 	 * in-core uberblock (spa_uberblock) has already reached.
 	 */
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN &&
 	    spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(pool, tx);
 
 		/* The origin stays open, which raises spa_minref. */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES &&
 	    spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(pool, tx);
 	}
 
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES &&
 	    spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(pool, tx);
 
 		/* The freedir stays open, which raises spa_minref. */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES &&
 	    spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 
 	/*
 	 * The LZ4_COMPRESS feature became activate_on_enable when lz4
 	 * compression of metadata was introduced.  Older pools that have
 	 * the feature enabled but not active are brought up to date here
 	 * by bumping the feature's refcount.
 	 */
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		boolean_t enabled = spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 		boolean_t active = spa_feature_is_active(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 
 		if (enabled && !active)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
 
 	/*
 	 * Persist the checksum salt if the pool directory does not hold
 	 * it yet.  The feature may not be activated at this point, which
 	 * is fine: the mere presence of the ZAP entry is backwards
 	 * compatible.
 	 */
 	int salt_lookup = zap_contains(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT);
 	if (salt_lookup == ENOENT) {
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_CHECKSUM_SALT, 1,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes),
 		    spa->spa_cksum_salt.zcs_bytes, tx));
 	}
 
 	rrw_exit(&pool->dp_config_rwlock, FTAG);
 }
 
 /*
  * Consistency checks on a vdev's indirect-mapping and obsolete-space-map
  * state.  Every check in here is an ASSERT-family macro.
  */
 static void
 vdev_indirect_state_sync_verify(vdev_t *vd)
 {
 	/* Both locals are referenced only from ASSERTs below. */
 	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
 	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
 
 	/* An indirect vdev must have both a mapping and a births object. */
 	if (vd->vdev_ops == &vdev_indirect_ops) {
 		ASSERT(vim != NULL);
 		ASSERT(vib != NULL);
 	}
 
 	/*
 	 * When the vdev has an obsolete space map object, the open space
 	 * map must match it, the vdev must be indirect or being removed,
 	 * and the mapping must be non-empty and cover at least as many
 	 * bytes as the obsolete space map has allocated.
 	 */
 	uint64_t obsolete_sm_object = 0;
 	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object != 0) {
 		ASSERT(vd->vdev_obsolete_sm != NULL);
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
 		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
 		ASSERT3U(obsolete_sm_object, ==,
 		    space_map_object(vd->vdev_obsolete_sm));
 		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
 		    space_map_allocated(vd->vdev_obsolete_sm));
 	}
 	ASSERT(vd->vdev_obsolete_segments != NULL);
 
 	/*
 	 * Since frees / remaps to an indirect vdev can only
 	 * happen in syncing context, the obsolete segments
 	 * tree must be empty when we start syncing.
 	 */
 	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
 }
 
 /*
  * Refresh the allocation queue-depth limits ahead of syncing this txg.
  *
  * Every initialized top-level metaslab group that belongs to the normal,
  * special or dedup class has its maximum async-write queue depth
  * recomputed from the current tunables, in case they changed.  The values
  * chosen here stay fixed while this txg is being synced out.
  */
 static void
 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
 {
 	ASSERT(spa_writeable(spa));
 
 	vdev_t *root = spa->spa_root_vdev;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	metaslab_class_t *special = spa_special_class(spa);
 	metaslab_class_t *dedup = spa_dedup_class(spa);
 	uint32_t depth_limit = zfs_vdev_async_write_max_active *
 	    zfs_vdev_queue_depth_pct / 100;
 	uint64_t slots_per_allocator = 0;
 
 	for (int c = 0; c < root->vdev_children; c++) {
 		metaslab_group_t *mg = root->vdev_child[c]->vdev_mg;
 
 		if (mg == NULL || !metaslab_group_initialized(mg))
 			continue;
 
 		/* Only groups in one of the three classes are adjusted. */
 		metaslab_class_t *mc = mg->mg_class;
 		if (!(mc == normal || mc == special || mc == dedup))
 			continue;
 
 		/*
 		 * No lock is needed for this check: only async allocations
 		 * consult mg_max_alloc_queue_depth, and those are all
 		 * issued from spa_sync().
 		 */
 		for (int a = 0; a < mg->mg_allocators; a++) {
 			ASSERT0(zfs_refcount_count(
 			    &(mg->mg_allocator[a].mga_alloc_queue_depth)));
 		}
 		mg->mg_max_alloc_queue_depth = depth_limit;
 
 		for (int a = 0; a < mg->mg_allocators; a++) {
 			mg->mg_allocator[a].mga_cur_max_alloc_queue_depth =
 			    zfs_vdev_def_queue_depth;
 		}
 
 		/* Each qualifying group contributes one default depth. */
 		slots_per_allocator += zfs_vdev_def_queue_depth;
 	}
 
 	/*
 	 * No allocation slots may be outstanding at this point; give each
 	 * allocator of each class the freshly computed slot budget.
 	 */
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		ASSERT0(zfs_refcount_count(&normal->mc_allocator[a].
 		    mca_alloc_slots));
 		ASSERT0(zfs_refcount_count(&special->mc_allocator[a].
 		    mca_alloc_slots));
 		ASSERT0(zfs_refcount_count(&dedup->mc_allocator[a].
 		    mca_alloc_slots));
 
 		normal->mc_allocator[a].mca_alloc_max_slots =
 		    slots_per_allocator;
 		special->mc_allocator[a].mca_alloc_max_slots =
 		    slots_per_allocator;
 		dedup->mc_allocator[a].mca_alloc_max_slots =
 		    slots_per_allocator;
 	}
 
 	/* Latch the throttle tunable into each class. */
 	normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 	special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 	dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 }
 
 /*
  * Verify the indirect state of each child of the root vdev in turn and
  * start condensing the first one that vdev_indirect_should_condense()
  * accepts.  At most one condense is started per call; children after the
  * chosen one are not visited.
  */
 static void
 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
 {
 	vdev_t *root;
 
 	ASSERT(spa_writeable(spa));
 
 	root = spa->spa_root_vdev;
 	for (int idx = 0; idx < root->vdev_children; idx++) {
 		vdev_t *tvd = root->vdev_child[idx];
 
 		vdev_indirect_state_sync_verify(tvd);
 		if (!vdev_indirect_should_condense(tvd))
 			continue;
 
 		spa_condense_indirect_start_sync(tvd, tx);
 		return;
 	}
 }
 
 /*
  * Run sync passes for the txg of 'tx' until the MOS is no longer dirty
  * for that txg.
  *
  * Each iteration bumps spa->spa_sync_pass and then calls the individual
  * sync routines in a fixed order.  If the first pass leaves both the
  * uberblock's root block pointer and the MOS untouched, the loop exits
  * immediately, before syncing deferred frees, so the txg stays a no-op.
  */
 static void
 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	uint64_t txg = tx->tx_txg;
 	/* Per-txg list of blocks freed in this txg. */
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 
 	do {
 		/* Pass numbers start at 1; spa_sync() resets the counter. */
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		if (pass < zfs_sync_pass_deferred_free ||
 		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 			/*
 			 * If the log space map feature is active we don't
 			 * care about deferred frees and the deferred bpobj
 			 * as the log space map should effectively have the
 			 * same results (i.e. appending only to one object).
 			 */
 			spa_sync_frees(spa, free_bpl, tx);
 		} else {
 			/*
 			 * We can not defer frees in pass 1, because
 			 * we sync the deferred frees later in pass 1.
 			 */
 			ASSERT3U(pass, >, 1);
 			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		brt_sync(spa, txg);
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 		dsl_errorscrub_sync(dp, tx);
 		svr_sync(spa, tx);
 		spa_sync_upgrades(spa, tx);
 
 		spa_flush_metaslabs(spa, tx);
 
 		/* Sync every vdev queued on this txg's vdev list. */
 		vdev_t *vd = NULL;
 		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 		    != NULL)
 			vdev_sync(vd, txg);
 
 		if (pass == 1) {
 			/*
 			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
 			 * the config. If that happens, this txg should not
 			 * be a no-op. So we must sync the config to the MOS
 			 * before checking for no-op.
 			 *
 			 * Note that when the config is dirty, it will
 			 * be written to the MOS (i.e. the MOS will be
 			 * dirtied) every time we call spa_sync_config_object()
 			 * in this txg.  Therefore we can't call this after
 			 * dsl_pool_sync() every pass, because it would
 			 * prevent us from converging, since we'd dirty
 			 * the MOS every pass.
 			 *
 			 * Sync tasks can only be processed in pass 1, so
 			 * there's no need to do this in later passes.
 			 */
 			spa_sync_config_object(spa, tx);
 		}
 
 		/*
 		 * Note: We need to check if the MOS is dirty because we could
 		 * have marked the MOS dirty without updating the uberblock
 		 * (e.g. if we have sync tasks but no dirty user data). We need
 		 * to check the uberblock's rootbp because it is updated if we
 		 * have synced out dirty data (though in this case the MOS will
 		 * most likely also be dirty due to second order effects, we
 		 * don't want to rely on that here).
 		 */
 		if (pass == 1 &&
 		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 		    !dmu_objset_is_dirty(mos, txg)) {
 			/*
 			 * Nothing changed on the first pass, therefore this
 			 * TXG is a no-op. Avoid syncing deferred frees, so
 			 * that we can keep this TXG as a no-op.
 			 */
 			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
 			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
 			break;
 		}
 
 		spa_sync_deferred_frees(spa, tx);
 	} while (dmu_objset_is_dirty(mos, txg));
 }
 
 /*
  * Rewrite the vdev configuration (which includes the uberblock) to
  * commit the transaction group.
  *
  * If there are no dirty vdevs, we sync the uberblock to a few random
  * top-level vdevs that are known to be visible in the config cache
  * (see spa_vdev_add() for a complete description). If there *are* dirty
  * vdevs, sync the uberblock to all vdevs.
  *
  * The function does not return until vdev_config_sync() succeeds: on
  * failure it suspends the pool, waits for it to be resumed, and retries.
  */
 static void
 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t txg = tx->tx_txg;
 
 	for (;;) {
 		int error = 0;
 
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			/*
 			 * Nothing is dirty: pick up to SPA_SYNC_MIN_VDEVS
 			 * eligible top-level vdevs, scanning the children
 			 * circularly from the index random_in_range() chose.
 			 */
 			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = random_in_range(children);
 
 			for (int c = 0; c < children; c++) {
 				vdev_t *vd =
 				    rvd->vdev_child[(c0 + c) % children];
 
 				/* Stop when revisiting the first vdev */
 				if (c > 0 && svd[0] == vd)
 					break;
 
 				/*
 				 * Skip vdevs with no metaslab array, log
 				 * devices, and non-concrete vdevs.
 				 */
 				if (vd->vdev_ms_array == 0 ||
 				    vd->vdev_islog ||
 				    !vdev_is_concrete(vd))
 					continue;
 
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_SYNC_MIN_VDEVS)
 					break;
 			}
 			error = vdev_config_sync(svd, svdcount, txg);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg);
 		}
 
 		/* Record the root vdev guid only after a successful sync. */
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		/* Sync failed: suspend the pool, wait for resume, retry. */
 		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
 		zio_resume_wait(spa);
 	}
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  *
  * In outline: wait for this txg's open-context i/o, apply pending BRT
  * updates, take SCL_CONFIG as reader, turn pending vdev state changes
  * into config changes, arm the deadman, iterate the sync passes to
  * convergence, rewrite the vdev labels/uberblock, and then run the
  * post-sync cleanup before publishing the new spa_ubsync and dispatching
  * any requested async tasks.
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	vdev_t *vd = NULL;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Wait for i/os issued in open context that need to complete
 	 * before this txg syncs.
 	 */
 	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
 	/* Install a fresh root zio in the slot that was just waited on. */
 	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * Now that there can be no more cloning in this transaction group,
 	 * but we are still before issuing frees, we can process pending BRT
 	 * updates.
 	 */
 	brt_pending_apply(spa, txg);
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	/* Every per-allocator tree must be empty when the sync starts. */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mutex_enter(&spa->spa_allocs[i].spaa_lock);
 		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
 		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 		/* Avoid holding the write lock unless actually necessary */
 		if (vd->vdev_aux == NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 			continue;
 		}
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		/* Drain the whole list while the write lock is held. */
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Record when this sync started and re-arm the deadman: cancel any
 	 * previously dispatched spa_deadman task and schedule a new one
 	 * spa_deadman_synctime from now.
 	 */
 	spa->spa_sync_starttime = gethrtime();
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
 	    NSEC_TO_TICK(spa->spa_deadman_synctime));
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		/*
 		 * Stop at the first child whose deflate ratio differs from
 		 * SPA_MINBLOCKSIZE; i reaches vdev_children only if none do.
 		 */
 		int i;
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY0(zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	spa_sync_adjust_vdev_max_queue_depth(spa);
 
 	spa_sync_condense_indirect(spa, tx);
 
 	spa_sync_iterate_to_convergence(spa, tx);
 
 #ifdef ZFS_DEBUG
 	if (!list_is_empty(&spa->spa_config_dirty_list)) {
 	/*
 	 * Make sure that the number of ZAPs for all the vdevs matches
 	 * the number of ZAPs in the per-vdev ZAP list. This only gets
 	 * called if the config is dirty; otherwise there may be
 	 * outstanding AVZ operations that weren't completed in
 	 * spa_sync_config_object.
 	 */
 		uint64_t all_vdev_zap_entry_count;
 		ASSERT0(zap_count(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
 		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
 		    all_vdev_zap_entry_count);
 	}
 #endif
 
 	if (spa->spa_vdev_removal != NULL) {
 		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
 	}
 
 	spa_sync_rewrite_vdev_config(spa, tx);
 	dmu_tx_commit(tx);
 
 	/* The sync made it through; disarm the deadman. */
 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
 	spa->spa_deadman_tqid = 0;
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	dsl_pool_sync_done(dp, txg);
 
 	/* The per-allocator trees must be empty again once the sync is done. */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mutex_enter(&spa->spa_allocs[i].spaa_lock);
 		VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
 		mutex_exit(&spa->spa_allocs[i].spaa_lock);
 	}
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 	    != NULL)
 		vdev_sync_done(vd, txg);
 
 	metaslab_class_evict_old(spa->spa_normal_class, txg);
 	metaslab_class_evict_old(spa->spa_log_class, txg);
 	/* spa_embedded_log_class has only one metaslab per vdev. */
 	metaslab_class_evict_old(spa->spa_special_class, txg);
 	metaslab_class_evict_old(spa->spa_dedup_class, txg);
 
 	spa_sync_close_syncing_log_sm(spa);
 
 	spa_update_dspace(spa);
 
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
 		vdev_autotrim_kick(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	/* Hold here, one delay(1) at a time, while zfs_pause_spa_sync is set. */
 	while (zfs_pause_spa_sync)
 		delay(1);
 
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * Update the last synced uberblock here. We want to do this at
 	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
 	 * will be guaranteed that all the processing associated with
 	 * that txg has been completed.
 	 */
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 }
 
 /*
  * Sync all pools.  We don't want to hold the namespace lock across these
  * operations, so we take a reference on the spa_t and drop the lock during the
  * sync.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *spa = NULL;
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa)) != NULL) {
 		if (spa_state(spa) != POOL_STATE_ACTIVE ||
 		    !spa_writeable(spa) || spa_suspended(spa))
 			continue;
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		txg_wait_synced(spa_get_dsl(spa), 0);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Create the pool's sync taskq along with the table that ties each of its
  * threads to an allocator.
  *
  * The thread count equals spa_alloc_count, so:
  *  - there are never more allocators than cpus (asserted below);
  *  - there may be more cpus than allocators;
  *  - there are never more sync taskq threads than allocators or cpus.
  *
  * Thread i initially gets allocator i.  Returns the new taskq, which is
  * also stored in spa->spa_sync_tq.
  */
 taskq_t *
 spa_sync_tq_create(spa_t *spa, const char *name)
 {
 	kthread_t **threads;
 	int nthreads;
 
 	ASSERT(spa->spa_sync_tq == NULL);
 	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
 
 	nthreads = spa->spa_alloc_count;
 	spa->spa_syncthreads = kmem_zalloc(
 	    sizeof (spa_syncthread_info_t) * nthreads, KM_SLEEP);
 
 	/* taskq_create_synced() hands back the thread array in 'threads'. */
 	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
 	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &threads);
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(threads != NULL);
 
 	spa_syncthread_info_t *info = spa->spa_syncthreads;
 	for (int t = 0; t < nthreads; t++) {
 		info[t].sti_thread = threads[t];
 		info[t].sti_allocator = t;
 	}
 
 	/* The thread array was only needed to fill in the table. */
 	kmem_free(threads, sizeof (kthread_t *) * nthreads);
 
 	return (spa->spa_sync_tq);
 }
 
 /*
  * Tear down the sync taskq set up by spa_sync_tq_create().
  *
  * Waits for outstanding tasks, destroys the taskq and frees the
  * per-thread allocator table.  Both spa_syncthreads and spa_sync_tq are
  * reset to NULL afterwards so the spa_t does not keep a pointer to the
  * freed table.
  */
 void
 spa_sync_tq_destroy(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_tq != NULL);
 
 	taskq_wait(spa->spa_sync_tq);
 	taskq_destroy(spa->spa_sync_tq);
 	kmem_free(spa->spa_syncthreads,
 	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
 	/* Do not leave a dangling pointer to the freed table. */
 	spa->spa_syncthreads = NULL;
 	spa->spa_sync_tq = NULL;
 }
 
 /*
  * Claim an allocator for the calling sync thread and return its index.
  *
  * With a single allocator, index 0 is returned without touching any
  * shared state.  Otherwise the rotor is advanced, under sau_lock, to the
  * next allocator not marked in use; that allocator is marked in use and
  * recorded in the calling thread's spa_syncthreads entry.
  *
  * NOTE(review): the search loops until it finds a free slot, so it
  * presumably relies on at least one allocator being free - confirm.
  */
 uint_t
 spa_acq_allocator(spa_t *spa)
 {
 	int idx;
 
 	if (spa->spa_alloc_count == 1)
 		return (0);
 
 	mutex_enter(&spa->spa_allocs_use->sau_lock);
 	uint_t slot = spa->spa_allocs_use->sau_rotor;
 	for (;;) {
 		/* Step to the next slot, wrapping at the end. */
 		if (++slot == spa->spa_alloc_count)
 			slot = 0;
 		if (!spa->spa_allocs_use->sau_inuse[slot])
 			break;
 	}
 	spa->spa_allocs_use->sau_inuse[slot] = B_TRUE;
 	spa->spa_allocs_use->sau_rotor = slot;
 	mutex_exit(&spa->spa_allocs_use->sau_lock);
 
 	/* Record the claim against the current thread's table entry. */
 	spa_syncthread_info_t *threads = spa->spa_syncthreads;
 	for (idx = 0; idx < spa->spa_alloc_count; idx++) {
 		if (threads[idx].sti_thread == curthread) {
 			threads[idx].sti_allocator = slot;
 			break;
 		}
 	}
 	/* The caller is expected to be one of the sync taskq threads. */
 	ASSERT3S(idx, <, spa->spa_alloc_count);
 
 	return (slot);
 }
 
 /*
  * Release an allocator claimed with spa_acq_allocator() by clearing its
  * in-use flag.  Nothing is done when the pool has at most one allocator.
  */
 void
 spa_rel_allocator(spa_t *spa, uint_t allocator)
 {
 	if (spa->spa_alloc_count <= 1)
 		return;
 
 	spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
 }
 
 /*
  * Choose the allocator for a write zio and store it in zio->io_allocator.
  *
  * A zio that already has an allocator is left alone.  Otherwise, if the
  * caller is one of the pool's sync taskq threads, that thread's assigned
  * allocator is used; failing that, the allocator is derived from a hash
  * of the zio's bookmark.
  */
 void
 spa_select_allocator(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zbookmark_phys_t *bm = &zio->io_bookmark;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 	/*
 	 * The allocator may already be set, for instance on a gang block
 	 * that inherited its parent's; then there is nothing left to do.
 	 */
 	if (ZIO_HAS_ALLOCATOR(zio))
 		return;
 
 	ASSERT(spa != NULL);
 	ASSERT(bm != NULL);
 
 	/*
 	 * Prefer the allocator assigned to the current sync thread.  The
 	 * thread table only exists while the pool has a sync taskq, i.e.
 	 * while the pool is open.
 	 */
 	if (spa->spa_sync_tq != NULL) {
 		spa_syncthread_info_t *threads = spa->spa_syncthreads;
 
 		for (int t = 0; t < spa->spa_alloc_count; t++) {
 			if (threads[t].sti_thread != curthread)
 				continue;
 			zio->io_allocator = threads[t].sti_allocator;
 			return;
 		}
 	}
 
 	/*
 	 * Spread i/o over as many allocators as possible for throughput,
 	 * while keeping logically adjacent blocks physically adjacent for
 	 * sequential reads: each object is cut into regions of 2^20
 	 * blocks, and the hash covers objset, object, level and region.
 	 */
 	uint64_t region = bm->zb_blkid >> 20;
 	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
 	    region);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Remove every pool in the system.
  *
  * All pools are expected to be closed by now, so every spa_t still in
  * the namespace should be unreferenced.  Each one has its async tasks
  * suspended, is unloaded and deactivated if it was ever initialized, and
  * is then removed from the namespace.
  */
 void
 spa_evict_all(void)
 {
 	mutex_enter(&spa_namespace_lock);
 	for (;;) {
 		spa_t *spa = spa_next(NULL);
 
 		if (spa == NULL)
 			break;
 
 		/*
 		 * Suspending async tasks must happen without the namespace
 		 * lock: the async thread may have to detach a replaced
 		 * device, and that path takes spa_namespace_lock itself.
 		 * A temporary reference pins the spa_t meanwhile.
 		 */
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		spa_async_suspend(spa);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
 
 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(spa);
 			spa_deactivate(spa);
 		}
 
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Find the vdev with the given guid.
  *
  * The root vdev tree is searched first.  When 'aux' is set and the tree
  * had no match, the l2cache devices and then the spares are searched as
  * well.  Returns NULL if no vdev matches.
  */
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *found = vdev_lookup_by_guid(spa->spa_root_vdev, guid);
 
 	if (found != NULL)
 		return (found);
 
 	if (!aux)
 		return (NULL);
 
 	spa_aux_vdev_t *l2cache = &spa->spa_l2cache;
 	for (int c = 0; c < l2cache->sav_count; c++) {
 		vdev_t *cand = l2cache->sav_vdevs[c];
 
 		if (cand->vdev_guid == guid)
 			return (cand);
 	}
 
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	for (int s = 0; s < spares->sav_count; s++) {
 		vdev_t *cand = spares->sav_vdevs[s];
 
 		if (cand->vdev_guid == guid)
 			return (cand);
 	}
 
 	return (NULL);
 }
 
 /*
  * Set the pool's in-core uberblock version to 'version' and wait for the
  * change to be synced.
  *
  * The version is stored and the root vdev's config dirtied while holding
  * SCL_ALL as writer; the lock is dropped before blocking in
  * txg_wait_synced().  'version' must not be lower than the current
  * version (asserted).
  */
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	/* Dirty the root vdev's config so the change gets written out. */
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 }
 
 /*
  * Report whether 'guid' belongs to the aux vdev set 'sav', looking at
  * both the vdevs already in the set and the configs still pending
  * addition.  The 'spa' argument is unused.
  */
 static boolean_t
 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
 {
 	(void) spa;
 
 	/* Devices that are already part of the set. */
 	for (int v = 0; v < sav->sav_count; v++) {
 		if (sav->sav_vdevs[v]->vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	/* Pending configs; entries without a readable guid are skipped. */
 	for (int p = 0; p < sav->sav_npending; p++) {
 		uint64_t pending_guid;
 
 		if (nvlist_lookup_uint64(sav->sav_pending[p],
 		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
 			continue;
 		if (pending_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Report whether 'guid' is one of this pool's l2cache devices, as
  * determined by spa_has_aux_vdev() on spa_l2cache.
  */
 boolean_t
 spa_has_l2cache(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *l2cache = &spa->spa_l2cache;
 
 	return (spa_has_aux_vdev(spa, guid, l2cache));
 }
 
 /*
  * Report whether 'guid' is one of this pool's spare devices, as
  * determined by spa_has_aux_vdev() on spa_spares.
  */
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 
 	return (spa_has_aux_vdev(spa, guid, spares));
 }
 
 /*
  * Report whether the pool has an active shared spare device.
  *
  * An active spare carries a reference count of 2 (one as a spare, one as
  * a replacement).  A spare is reported here when it is registered to this
  * pool with a reference count greater than that.
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t owner;
 	int refs;
 
 	for (int s = 0; s < spares->sav_count; s++) {
 		uint64_t guid = spares->sav_vdevs[s]->vdev_guid;
 
 		if (!spa_spare_exists(guid, &owner, &refs))
 			continue;
 
 		/* The spare must be owned, and owned by this pool. */
 		if (owner == 0ULL || owner != spa_guid(spa))
 			continue;
 
 		if (refs > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Return the sum of vdev_ms_count over the concrete children of the
  * root vdev.
  */
 uint64_t
 spa_total_metaslabs(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t total = 0;
 
 	for (uint64_t c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (vdev_is_concrete(tvd))
 			total += tvd->vdev_ms_count;
 	}
 
 	return (total);
 }
 
 /*
  * Notify any waiting threads that some activity has switched from being in-
  * progress to not-in-progress so that the thread can wake up and determine
  * whether it is finished waiting.
  *
  * This broadcasts on spa_activities_cv while holding spa_activities_lock.
  */
 void
 spa_notify_waiters(spa_t *spa)
 {
 	/*
 	 * Acquiring spa_activities_lock here prevents the cv_broadcast from
 	 * happening between the waiting thread's check and cv_wait.
 	 */
 	mutex_enter(&spa->spa_activities_lock);
 	cv_broadcast(&spa->spa_activities_cv);
 	mutex_exit(&spa->spa_activities_lock);
 }
 
 /*
  * Notify any waiting threads that the pool is exporting, and then block until
  * they are finished using the spa_t.
  *
  * Everything below runs under spa_activities_lock; cv_wait() is passed
  * that lock while this thread sleeps.
  */
 void
 spa_wake_waiters(spa_t *spa)
 {
 	mutex_enter(&spa->spa_activities_lock);
 	/* Raise the cancel flag and wake everyone on spa_activities_cv. */
 	spa->spa_waiters_cancel = B_TRUE;
 	cv_broadcast(&spa->spa_activities_cv);
 	/* Sleep on spa_waiters_cv until the waiter count reaches zero. */
 	while (spa->spa_waiters != 0)
 		cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
 	/* All waiters are gone; clear the flag again. */
 	spa->spa_waiters_cancel = B_FALSE;
 	mutex_exit(&spa->spa_activities_lock);
 }
 
 /*
  * Whether the vdev or any of its descendants are being initialized/trimmed.
  *
  * 'activity' must be ZPOOL_WAIT_INITIALIZE or ZPOOL_WAIT_TRIM.  The caller
  * holds SCL_CONFIG | SCL_STATE as reader and spa_activities_lock; the
  * latter is dropped briefly while the per-vdev lock is taken, and is held
  * again on return.  The vdev tree is walked depth-first and the walk stops
  * at the first vdev found active.
  */
 static boolean_t
 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
 	    activity == ZPOOL_WAIT_TRIM);
 
 	/* The per-vdev lock that protects the state for this activity. */
 	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
 	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
 
 	/*
 	 * Take the activity-specific lock ahead of spa_activities_lock:
 	 * release the latter, acquire the vdev lock, then re-acquire it.
 	 */
 	mutex_exit(&spa->spa_activities_lock);
 	mutex_enter(lock);
 	mutex_enter(&spa->spa_activities_lock);
 
 	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
 	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
 	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
 	mutex_exit(lock);
 
 	if (in_progress)
 		return (B_TRUE);
 
 	/* This vdev is idle; recurse into its children. */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
 		    activity))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * If use_guid is true, this checks whether the vdev specified by guid is
  * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
  * is being initialized/trimmed.
  *
  * The caller must hold spa_activities_lock. This function drops it
  * temporarily in order to take SCL_CONFIG | SCL_STATE as reader first, and
  * releases the config lock again before returning; spa_activities_lock is
  * held on return.
  *
  * Returns 0 and stores the answer in *in_progress, or EINVAL (leaving
  * *in_progress untouched) if use_guid is set and guid does not name a leaf
  * vdev in the root vdev tree.
  */
 static int
 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
     zpool_wait_activity_t activity, boolean_t *in_progress)
 {
 	/* Acquire the config lock before spa_activities_lock. */
 	mutex_exit(&spa->spa_activities_lock);
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	mutex_enter(&spa->spa_activities_lock);
 
 	vdev_t *vd;
 	if (use_guid) {
 		/* Aux devices are excluded from the lookup (aux == B_FALSE). */
 		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
 			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 			return (EINVAL);
 		}
 	} else {
 		vd = spa->spa_root_vdev;
 	}
 
 	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
 
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 	return (0);
 }
 
 /*
  * Locking for waiting threads
  * ---------------------------
  *
  * Waiting threads need a way to check whether a given activity is in progress,
  * and then, if it is, wait for it to complete. Each activity will have some
  * in-memory representation of the relevant on-disk state which can be used to
  * determine whether or not the activity is in progress. The in-memory state and
  * the locking used to protect it will be different for each activity, and may
  * not be suitable for use with a cvar (e.g., some state is protected by the
  * config lock). To allow waiting threads to wait without any races, another
  * lock, spa_activities_lock, is used.
  *
  * When the state is checked, both the activity-specific lock (if there is one)
  * and spa_activities_lock are held. In some cases, the activity-specific lock
  * is acquired explicitly (e.g. the config lock). In others, the locking is
  * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
  * thread releases the activity-specific lock and, if the activity is in
  * progress, then cv_waits using spa_activities_lock.
  *
  * The waiting thread is woken when another thread, one completing some
  * activity, updates the state of the activity and then calls
  * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
  * needs to hold its activity-specific lock when updating the state, and this
  * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
  *
  * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
  * and because it is held when the waiting thread checks the state of the
  * activity, it can never be the case that the completing thread both updates
  * the activity state and cv_broadcasts in between the waiting thread's check
  * and cv_wait. Thus, a waiting thread can never miss a wakeup.
  *
  * In order to prevent deadlock, when the waiting thread does its check, in some
  * cases it will temporarily drop spa_activities_lock in order to acquire the
  * activity-specific lock. The order in which spa_activities_lock and the
  * activity specific lock are acquired in the waiting thread is determined by
  * the order in which they are acquired in the completing thread; if the
  * completing thread calls spa_notify_waiters with the activity-specific lock
  * held, then the waiting thread must also acquire the activity-specific lock
  * first.
  */
 
 /*
  * Determine whether 'activity' is currently in progress in 'spa' and store
  * the answer in *in_progress.
  *
  * The caller must hold spa_activities_lock (asserted below). For
  * ZPOOL_WAIT_REPLACE the lock is dropped and re-taken around acquiring the
  * config lock. 'use_tag' and 'tag' are only consumed by the initialize and
  * trim cases, where they are handed to spa_vdev_activity_in_progress().
  *
  * Returns 0, or the error from spa_vdev_activity_in_progress(). Panics on an
  * unrecognized activity value.
  */
 static int
 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
     boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 
 	switch (activity) {
 	case ZPOOL_WAIT_CKPT_DISCARD:
 		/*
 		 * The feature is still active but the checkpoint entry is no
 		 * longer present in the MOS directory.
 		 */
 		*in_progress =
 		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
 		    zap_contains(spa_meta_objset(spa),
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
 		    ENOENT);
 		break;
 	case ZPOOL_WAIT_FREE:
 		/*
 		 * Any one of: a non-empty free bpobj (on pools new enough to
 		 * have deadlists), an active async destroy, or a pending
 		 * livelist delete.
 		 */
 		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
 		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
 		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
 		    spa_livelist_delete_check(spa));
 		break;
 	case ZPOOL_WAIT_INITIALIZE:
 	case ZPOOL_WAIT_TRIM:
 		/* The only activities that can fail and that honor the tag. */
 		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
 		    activity, in_progress);
 		break;
 	case ZPOOL_WAIT_REPLACE:
 		/*
 		 * Release spa_activities_lock before taking the config lock
 		 * and re-take it afterwards, so that the config lock is
 		 * always the first of the two to be acquired here.
 		 */
 		mutex_exit(&spa->spa_activities_lock);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 		mutex_enter(&spa->spa_activities_lock);
 
 		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		break;
 	case ZPOOL_WAIT_REMOVE:
 		*in_progress = (spa->spa_removing_phys.sr_state ==
 		    DSS_SCANNING);
 		break;
 	case ZPOOL_WAIT_RESILVER:
 		/*
 		 * An active rebuild already answers the question; otherwise
 		 * fall through to the scan check shared with scrub.
 		 */
 		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
 		if (*in_progress)
 			break;
 		zfs_fallthrough;
 	case ZPOOL_WAIT_SCRUB:
 	{
 		boolean_t scanning, paused, is_scrub;
 		dsl_scan_t *scn =  spa->spa_dsl_pool->dp_scan;
 
 		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
 		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
 		paused = dsl_scan_is_paused_scrub(scn);
 		/*
 		 * A running, unpaused scan counts only if its kind matches
 		 * what is being waited for: a scrub scan for
 		 * ZPOOL_WAIT_SCRUB, a non-scrub scan for ZPOOL_WAIT_RESILVER.
 		 */
 		*in_progress = (scanning && !paused &&
 		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
 		break;
 	}
 	case ZPOOL_WAIT_RAIDZ_EXPAND:
 	{
 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
 		break;
 	}
 	default:
 		panic("unrecognized value for activity %d", activity);
 	}
 
 	return (error);
 }
 
 /*
  * Common implementation of spa_wait() and spa_wait_tag(): block until
  * 'activity' is no longer in progress in the named pool, the pool's waiters
  * are cancelled (spa_waiters_cancel), or cv_wait_sig() reports that the wait
  * was interrupted.
  *
  * pool      name of the pool to wait on
  * activity  activity to wait for; must be below ZPOOL_WAIT_NUM_ACTIVITIES
  * use_tag   whether 'tag' selects one instance (initialize and trim only)
  * tag       instance identifier, only meaningful when use_tag is set
  * waited    out: B_TRUE if this thread had to wait at least once; always
  *           written, including on the error paths
  *
  * Returns 0 on success, EINVAL for invalid arguments, EINTR if the wait was
  * interrupted, or the error from spa_open() / spa_activity_in_progress().
  */
 static int
 spa_wait_common(const char *pool, zpool_wait_activity_t activity,
     boolean_t use_tag, uint64_t tag, boolean_t *waited)
 {
 	/*
 	 * Give the out-parameter a defined value before any early return, so
 	 * a caller never reads an indeterminate value after a failure.
 	 */
 	*waited = B_FALSE;
 
 	/*
 	 * The tag is used to distinguish between instances of an activity.
 	 * 'initialize' and 'trim' are the only activities that we use this for.
 	 * The other activities can only have a single instance in progress in a
 	 * pool at one time, making the tag unnecessary.
 	 *
 	 * There can be multiple devices being replaced at once, but since they
 	 * all finish once resilvering finishes, we don't bother keeping track
 	 * of them individually, we just wait for them all to finish.
 	 */
 	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
 	    activity != ZPOOL_WAIT_TRIM)
 		return (EINVAL);
 
 	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
 		return (EINVAL);
 
 	spa_t *spa;
 	int error = spa_open(pool, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Increment the spa's waiter count so that we can call spa_close and
 	 * still ensure that the spa_t doesn't get freed before this thread is
 	 * finished with it when the pool is exported. We want to call spa_close
 	 * before we start waiting because otherwise the additional ref would
 	 * prevent the pool from being exported or destroyed throughout the
 	 * potentially long wait.
 	 */
 	mutex_enter(&spa->spa_activities_lock);
 	spa->spa_waiters++;
 	spa_close(spa, FTAG);
 
 	for (;;) {
 		boolean_t in_progress;
 		error = spa_activity_in_progress(spa, activity, use_tag, tag,
 		    &in_progress);
 
 		if (error || !in_progress || spa->spa_waiters_cancel)
 			break;
 
 		*waited = B_TRUE;
 
 		/* A zero return from cv_wait_sig() is reported as EINTR. */
 		if (cv_wait_sig(&spa->spa_activities_cv,
 		    &spa->spa_activities_lock) == 0) {
 			error = EINTR;
 			break;
 		}
 	}
 
 	/* Drop our waiter count and wake a thread waiting on spa_waiters_cv. */
 	spa->spa_waiters--;
 	cv_signal(&spa->spa_waiters_cv);
 	mutex_exit(&spa->spa_activities_lock);
 
 	return (error);
 }
 
 /*
  * Wait for the instance of 'activity' identified by 'tag' to complete.
  * This is spa_wait_common() with tag matching turned on; 'waited' is passed
  * straight through.
  */
 int
 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
     boolean_t *waited)
 {
 	int error = spa_wait_common(pool, activity, B_TRUE, tag, waited);
 
 	return (error);
 }
 
 /*
  * Wait for all instances of the specified activity to complete.
  * This is spa_wait_common() with tag matching turned off (tag 0).
  */
 int
 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
 {
 	int error = spa_wait_common(pool, activity, B_FALSE, 0, waited);
 
 	return (error);
 }
 
 /*
  * Allocate a sysevent wrapping the nvlist that zfs_event_create() builds for
  * 'name'. Returns NULL when no nvlist was produced, and always returns NULL
  * outside the kernel. The result is meant to be handed to spa_event_post().
  */
 sysevent_t *
 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 #ifdef _KERNEL
 	nvlist_t *resource =
 	    zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
 
 	if (resource == NULL)
 		return (NULL);
 
 	sysevent_t *ev = kmem_alloc(sizeof (*ev), KM_SLEEP);
 	ev->resource = resource;
 	return (ev);
 #else
 	(void) spa, (void) vd, (void) hist_nvl, (void) name;
 	return (NULL);
 #endif
 }
 
 /*
  * Post the zevent carried by 'ev' and free the sysevent wrapper. A NULL 'ev'
  * is ignored, and outside the kernel this does nothing.
  */
 void
 spa_event_post(sysevent_t *ev)
 {
 #ifdef _KERNEL
 	if (ev == NULL)
 		return;
 
 	zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
 	kmem_free(ev, sizeof (sysevent_t));
 #else
 	(void) ev;
 #endif
 }
 
 /*
  * Post a zevent corresponding to the given sysevent. The 'name' must be one
  * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
  * filled in from the spa and (optionally) the vdev. This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 	sysevent_t *ev = spa_event_create(spa, vd, hist_nvl, name);
 
 	spa_event_post(ev);
 }
 
 /* state manipulation functions */
 EXPORT_SYMBOL(spa_open);
 EXPORT_SYMBOL(spa_open_rewind);
 EXPORT_SYMBOL(spa_get_stats);
 EXPORT_SYMBOL(spa_create);
 EXPORT_SYMBOL(spa_import);
 EXPORT_SYMBOL(spa_tryimport);
 EXPORT_SYMBOL(spa_destroy);
 EXPORT_SYMBOL(spa_export);
 EXPORT_SYMBOL(spa_reset);
 EXPORT_SYMBOL(spa_async_request);
 EXPORT_SYMBOL(spa_async_suspend);
 EXPORT_SYMBOL(spa_async_resume);
 EXPORT_SYMBOL(spa_inject_addref);
 EXPORT_SYMBOL(spa_inject_delref);
 EXPORT_SYMBOL(spa_scan_stat_init);
 EXPORT_SYMBOL(spa_scan_get_stats);
 
 /* device manipulation */
 EXPORT_SYMBOL(spa_vdev_add);
 EXPORT_SYMBOL(spa_vdev_attach);
 EXPORT_SYMBOL(spa_vdev_detach);
 EXPORT_SYMBOL(spa_vdev_setpath);
 EXPORT_SYMBOL(spa_vdev_setfru);
 EXPORT_SYMBOL(spa_vdev_split_mirror);
 
 /* spare state (which is global across all pools) */
 EXPORT_SYMBOL(spa_spare_add);
 EXPORT_SYMBOL(spa_spare_remove);
 EXPORT_SYMBOL(spa_spare_exists);
 EXPORT_SYMBOL(spa_spare_activate);
 
 /* L2ARC state (which is global across all pools) */
 EXPORT_SYMBOL(spa_l2cache_add);
 EXPORT_SYMBOL(spa_l2cache_remove);
 EXPORT_SYMBOL(spa_l2cache_exists);
 EXPORT_SYMBOL(spa_l2cache_activate);
 EXPORT_SYMBOL(spa_l2cache_drop);
 
 /* scanning */
 EXPORT_SYMBOL(spa_scan);
 EXPORT_SYMBOL(spa_scan_range);
 EXPORT_SYMBOL(spa_scan_stop);
 
 /* spa syncing */
 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
 EXPORT_SYMBOL(spa_sync_allpools);
 
 /* properties */
 EXPORT_SYMBOL(spa_prop_set);
 EXPORT_SYMBOL(spa_prop_get);
 EXPORT_SYMBOL(spa_prop_clear_bootfs);
 
 /* asynchronous event notification */
 EXPORT_SYMBOL(spa_event_notify);
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run a metaslab preload taskq");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
 	"log2 fraction of arc that can be used by inflight I/Os when "
 	"verifying pool during import");
-/* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
 	"Set to traverse metadata on pool import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
 	"Set to traverse data on pool import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 	"Print vdev tree to zfs_dbgmsg during pool import");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run an IO worker thread");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
 	"Number of threads per IO worker taskqueue");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
 	"Allow importing pool with up to this number of missing top-level "
 	"vdevs (in read-only mode)");
-/* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
 	ZMOD_RW, "Set the livelist condense zthr to pause");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
 	ZMOD_RW, "Set the livelist condense synctask to pause");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
 	INT, ZMOD_RW,
 	"Whether livelist condensing was canceled in the synctask");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
 	INT, ZMOD_RW,
 	"Whether livelist condensing was canceled in the zthr function");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 	ZMOD_RW,
 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
 	"was being condensed");
 
 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
 	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
 	"Configure IO queues for read IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
 	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
 	"Configure IO queues for write IO");
 #endif
-/* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
 	"Number of CPUs per write issue taskq");
diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c
index 1efff47f87a0..4c3721c159be 100644
--- a/module/zfs/spa_checkpoint.c
+++ b/module/zfs/spa_checkpoint.c
@@ -1,640 +1,638 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2017 by Delphix. All rights reserved.
  */
 
 /*
  * Storage Pool Checkpoint
  *
  * A storage pool checkpoint can be thought of as a pool-wide snapshot or
  * a stable version of extreme rewind that guarantees no blocks from the
  * checkpointed state will have been overwritten. It remembers the entire
  * state of the storage pool (e.g. snapshots, dataset names, etc..) from the
  * point that it was taken and the user can rewind back to that point even if
  * they applied destructive operations on their datasets or even enabled new
  * zpool on-disk features. If a pool has a checkpoint that is no longer
  * needed, the user can discard it.
  *
  * == On disk data structures used ==
  *
  * - The pool has a new feature flag and a new entry in the MOS. The feature
  *   flag is set to active when we create the checkpoint and remains active
  *   until the checkpoint is fully discarded. The entry in the MOS config
  *   (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
  *   references the state of the pool when we take the checkpoint. The entry
  *   remains populated until we start discarding the checkpoint or we rewind
  *   back to it.
  *
  * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
  *   which persists until the checkpoint is fully discarded. The space map
  *   contains entries that have been freed in the current state of the pool
  *   but we want to keep around in case we decide to rewind to the checkpoint.
  *   [see vdev_checkpoint_sm]
  *
  * - Each metaslab's ms_sm space map behaves the same as without the
  *   checkpoint, with the only exception being the scenario when we free
  *   blocks that belong to the checkpoint. In this case, these blocks remain
  *   ALLOCATED in the metaslab's space map and they are added as FREE in the
  *   vdev's checkpoint space map.
  *
  * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
  *   the uberblock was checkpointed. For normal uberblocks this field is 0.
  *
  * == Overview of operations ==
  *
  * - To create a checkpoint, we first wait for the current TXG to be synced,
  *   so we can use the most recently synced uberblock (spa_ubsync) as the
  *   checkpointed uberblock. Then we use an early synctask to place that
  *   uberblock in MOS config, increment the feature flag for the checkpoint
  *   (marking it active), and setting spa_checkpoint_txg (see its use below)
  *   to the TXG of the checkpointed uberblock. We use an early synctask for
  *   the aforementioned operations to ensure that no blocks were dirtied
  *   between the current TXG and the TXG of the checkpointed uberblock
  *   (i.e. the previous txg).
  *
  * - When a checkpoint exists, we need to ensure that the blocks that
  *   belong to the checkpoint are freed but never reused. This means that
  *   these blocks should never end up in the ms_allocatable or the ms_freeing
  *   trees of a metaslab. Therefore, whenever there is a checkpoint the new
  *   ms_checkpointing tree is used in addition to the aforementioned ones.
  *
  *   Whenever a block is freed and we find out that it is referenced by the
  *   checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
  *   we place it in the ms_checkpointing tree instead of the ms_freeing tree.
  *   This way, we divide the blocks that are being freed into checkpointed
  *   and not-checkpointed blocks.
  *
  *   In order to persist these frees, we write the extents from the
  *   ms_freeing tree to the ms_sm as usual, and the extents from the
  *   ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
  *   checkpointed extents will remain allocated in the metaslab's ms_sm space
  *   map, and therefore won't be reused [see metaslab_sync()]. In addition,
  *   when we discard the checkpoint, we can find the entries that have
  *   actually been freed in vdev_checkpoint_sm.
  *   [see spa_checkpoint_discard_thread_sync()]
  *
  * - To discard the checkpoint we use an early synctask to delete the
  *   checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
  *   and wakeup the discarding zthr thread (an open-context async thread).
  *   We use an early synctask to ensure that the operation happens before any
  *   new data end up in the checkpoint's data structures.
  *
  *   Once the synctask is done and the discarding zthr is awake, we discard
  *   the checkpointed data over multiple TXGs by having the zthr prefetching
  *   entries from vdev_checkpoint_sm and then starting a synctask that places
  *   them as free blocks into their respective ms_allocatable and ms_sm
  *   structures.
  *   [see spa_checkpoint_discard_thread()]
  *
  *   When there are no entries left in the vdev_checkpoint_sm of all
  *   top-level vdevs, a final synctask runs that decrements the feature flag.
  *
  * - To rewind to the checkpoint, we first use the current uberblock and
  *   open the MOS so we can access the checkpointed uberblock from the MOS
  *   config. After we retrieve the checkpointed uberblock, we use it as the
  *   current uberblock for the pool by writing it to disk with an updated
  *   TXG, opening its version of the MOS, and moving on as usual from there.
  *   [see spa_ld_checkpoint_rewind()]
  *
  *   An important note on rewinding to the checkpoint has to do with how we
  *   handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
  *   blocks that have not been claimed by the time we took the checkpoint
  *   as they should no longer be valid.
  *   [see comment in zil_claim()]
  *
  * == Miscellaneous information ==
  *
  * - In the hypothetical event that we take a checkpoint, remove a vdev,
  *   and attempt to rewind, the rewind would fail as the checkpointed
  *   uberblock would reference data in the removed device. For this reason
  *   and others of similar nature, we disallow the following operations that
  *   can change the config:
  *   	vdev removal and attach/detach, mirror splitting, and pool reguid.
  *
  * - As most of the checkpoint logic is implemented in the SPA and doesn't
  *   distinguish datasets when it comes to space accounting, having a
  *   checkpoint can potentially break the boundaries set by dataset
  *   reservations.
  */
 
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
 #include <sys/metaslab_impl.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/spa_checkpoint.h>
 #include <sys/vdev_impl.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 
 /*
  * Upper bound, in bytes, on the memory used to prefetch a vdev's checkpoint
  * space map while the checkpoint is being discarded (default 16 MiB).
  *
  * Without a bound, a top-level vdev with a long checkpoint space map could
  * consume a lot of memory, depending on how much checkpointed data was freed
  * on it while the pool had a checkpoint.
  */
 static uint64_t zfs_spa_discard_memory_limit = (16ULL << 20);
 
 /*
  * Fill in 'pcs' with the pool's checkpoint state, space and start time.
  *
  * Returns ZFS_ERR_NO_CHECKPOINT (leaving 'pcs' untouched) when the checkpoint
  * feature is not active, 0 otherwise. The state is reported as "discarding"
  * when the checkpoint entry is absent from the MOS directory and as "exists"
  * when it is present.
  */
 int
 spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
 {
 	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
 		return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
 
 	memset(pcs, 0, sizeof (*pcs));
 
 	int zap_err = zap_contains(spa_meta_objset(spa),
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
 	ASSERT(zap_err == 0 || zap_err == ENOENT);
 
 	pcs->pcs_state = (zap_err == ENOENT) ?
 	    CS_CHECKPOINT_DISCARDING : CS_CHECKPOINT_EXISTS;
 	pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
 	pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
 
 	return (0);
 }
 
 /*
  * Sync task that finishes a checkpoint discard. It is issued by
  * spa_checkpoint_discard_thread() once no top-level vdev has a checkpoint
  * space map left. 'arg' is the spa.
  */
 static void
 spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 
 	spa->spa_checkpoint_info.sci_timestamp = 0;
 
 	/* Drop the feature refcount, then let waiters re-check the state. */
 	spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
 	spa_notify_waiters(spa);
 
 	spa_history_log_internal(spa, "spa discard checkpoint", tx,
 	    "finished discarding checkpointed state from the pool");
 }
 
 /*
  * Argument handed to spa_checkpoint_discard_sync_callback() for every
  * space map entry it is applied to.
  */
 typedef struct spa_checkpoint_discard_sync_callback_arg {
 	/* vdev whose checkpoint space map is being processed */
 	vdev_t *sdc_vd;
 	/* txg of the sync task; used when dirtying metaslabs */
 	uint64_t sdc_txg;
 	/* entries still allowed; the callback returns EINTR once it is 0 */
 	uint64_t sdc_entry_limit;
 } spa_checkpoint_discard_sync_callback_arg_t;
 
 /*
  * Space map callback used while discarding a checkpoint. Moves the segment
  * described by 'sme' onto its metaslab's ms_freeing tree and subtracts its
  * size from the pool-wide and per-vdev checkpoint space counters.
  *
  * 'arg' is a spa_checkpoint_discard_sync_callback_arg_t. Returns EINTR once
  * the entry budget in sdc_entry_limit is used up, 0 otherwise.
  */
 static int
 spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
 {
 	spa_checkpoint_discard_sync_callback_arg_t *cbarg = arg;
 	vdev_t *vd = cbarg->sdc_vd;
 	spa_t *spa = vd->vdev_spa;
 	metaslab_t *msp = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
 	uint64_t seg_end = sme->sme_offset + sme->sme_run;
 
 	if (cbarg->sdc_entry_limit == 0)
 		return (SET_ERROR(EINTR));
 
 	/*
 	 * The checkpoint space map is never condensed, so each of its
 	 * entries lies entirely within one metaslab. Nothing fundamental
 	 * requires that; support for segments that cross metaslab
 	 * boundaries could be added here if it is ever needed.
 	 */
 	VERIFY3U(sme->sme_type, ==, SM_FREE);
 	VERIFY3U(sme->sme_offset, >=, msp->ms_start);
 	VERIFY3U(seg_end, <=, msp->ms_start + msp->ms_size);
 
 	/*
 	 * No other frees should be processed concurrently at this point,
 	 * which makes the lock technically unnecessary. It is taken anyway
 	 * to guard against future changes.
 	 */
 	mutex_enter(&msp->ms_lock);
 	if (range_tree_is_empty(msp->ms_freeing))
 		vdev_dirty(vd, VDD_METASLAB, msp, cbarg->sdc_txg);
 	range_tree_add(msp->ms_freeing, sme->sme_offset, sme->sme_run);
 	mutex_exit(&msp->ms_lock);
 
 	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, >=, sme->sme_run);
 	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
 
 	spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
 	vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
 	cbarg->sdc_entry_limit--;
 
 	return (0);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only consistency check of the checkpoint space accounting: the
  * negated space_map_allocated() of the checkpoint space maps must agree with
  * the per-vdev vs_checkpoint_space counters and with the pool-wide
  * sci_dspace, and a top-level vdev without a checkpoint space map must
  * account for no checkpoint space at all.
  */
 static void
 spa_checkpoint_accounting_verify(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t sm_total = 0;
 	uint64_t stat_total = 0;
 
 	for (uint64_t i = 0; i < root->vdev_children; i++) {
 		vdev_t *tvd = root->vdev_child[i];
 
 		if (tvd->vdev_checkpoint_sm == NULL) {
 			ASSERT0(tvd->vdev_stat.vs_checkpoint_space);
 			continue;
 		}
 
 		sm_total += -space_map_allocated(tvd->vdev_checkpoint_sm);
 		stat_total += tvd->vdev_stat.vs_checkpoint_space;
 		ASSERT3U(sm_total, ==, stat_total);
 	}
 	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, sm_total);
 }
 #endif
 
 /*
  * Sync task that discards one batch of entries from a vdev's checkpoint
  * space map. 'arg' is the top-level vdev. It is issued repeatedly by
  * spa_checkpoint_discard_thread() until the space map is gone; when the last
  * entries are consumed the space map is freed, closed and removed from the
  * vdev's top zap.
  */
 static void
 spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *vd = arg;
 	int error;
 
 	/*
 	 * The space map callback is applied only to non-debug entries.
 	 * Because the number of debug entries is less or equal to the
 	 * number of non-debug entries, we want to ensure that we only
 	 * read what we prefetched from open-context.
 	 *
 	 * Thus, we set the maximum entries that the space map callback
 	 * will be applied to be half the entries that could fit in the
 	 * imposed memory limit.
 	 *
 	 * Note that since this is a conservative estimate we also
 	 * assume the worst case scenario in our computation where each
 	 * entry is two-word.
 	 */
 	uint64_t max_entry_limit =
 	    (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
 
 	/*
 	 * Iterate from the end of the space map towards the beginning,
 	 * placing its entries on ms_freeing and removing them from the
 	 * space map. The iteration stops if one of the following
 	 * conditions is true:
 	 *
 	 * 1] We reached the beginning of the space map. At this point
 	 *    the space map should be completely empty and
 	 *    space_map_incremental_destroy should have returned 0.
 	 *    The next step would be to free and close the space map
 	 *    and remove its entry from its vdev's top zap. This allows
 	 *    spa_checkpoint_discard_thread() to move on to the next vdev.
 	 *
 	 * 2] We reached the memory limit (amount of memory used to hold
 	 *    space map entries in memory) and space_map_incremental_destroy
 	 *    returned EINTR. This means that there are entries remaining
 	 *    in the space map that will be cleared in a future invocation
 	 *    of this function by spa_checkpoint_discard_thread().
 	 */
 	spa_checkpoint_discard_sync_callback_arg_t sdc;
 	sdc.sdc_vd = vd;
 	sdc.sdc_txg = tx->tx_txg;
 	sdc.sdc_entry_limit = max_entry_limit;
 
 	uint64_t words_before =
 	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 	error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
 	    spa_checkpoint_discard_sync_callback, &sdc, tx);
 
 	uint64_t words_after =
 	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 #ifdef ZFS_DEBUG
 	spa_checkpoint_accounting_verify(vd->vdev_spa);
 #endif
 
 	zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, "
 	    "deleted %llu words - %llu words are left",
 	    (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id,
 	    (u_longlong_t)(words_before - words_after),
 	    (u_longlong_t)words_after);
 
 	if (error != EINTR) {
 		if (error != 0) {
 			/*
 			 * Cast the id explicitly so the argument matches
 			 * %llu whatever the underlying type of uint64_t is.
 			 */
 			zfs_panic_recover("zfs: error %lld was returned "
 			    "while incrementally destroying the checkpoint "
 			    "space map of vdev %llu\n",
 			    (longlong_t)error, (u_longlong_t)vd->vdev_id);
 		}
 		ASSERT0(words_after);
 		ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
 		ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
 
 		space_map_free(vd->vdev_checkpoint_sm, tx);
 		space_map_close(vd->vdev_checkpoint_sm);
 		vd->vdev_checkpoint_sm = NULL;
 
 		VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
 	}
 }
 
 /*
  * Returns B_TRUE when no top-level vdev has a checkpoint space map left,
  * B_FALSE as soon as one is found. Only meaningful while a discard is under
  * way: the feature must be active and the pool must have no checkpoint.
  */
 static boolean_t
 spa_checkpoint_discard_is_done(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(!spa_has_checkpoint(spa));
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
 
 	for (uint64_t i = 0; i < root->vdev_children; i++) {
 		vdev_t *tvd = root->vdev_child[i];
 
 		if (tvd->vdev_checkpoint_sm != NULL)
 			return (B_FALSE);
 		ASSERT0(tvd->vdev_stat.vs_checkpoint_space);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * zthr check function for the discard thread. There is work to do exactly
  * when the checkpoint feature is active but the pool no longer has a
  * checkpoint. 'arg' is the spa; 'zthr' is unused.
  */
 boolean_t
 spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
 {
 	(void) zthr;
 	spa_t *spa = arg;
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
 	    !spa_has_checkpoint(spa))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * zthr function that discards the checkpoint. For each top-level vdev it
  * repeatedly prefetches the tail of the vdev's checkpoint space map and runs
  * spa_checkpoint_discard_thread_sync() on it until the space map is gone.
  * Once every vdev is done, it issues the final sync task that completes the
  * discard. Returns early, leaving the remaining work for a later run, if
  * the zthr is cancelled. 'arg' is the spa.
  */
 void
 spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		while (vd->vdev_checkpoint_sm != NULL) {
 			space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
 			int numbufs = 0;
 			dmu_buf_t **dbp = NULL;
 
 			if (zthr_iscancelled(zthr))
 				return;
 
 			ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
 
 			uint64_t size = MIN(space_map_length(checkpoint_sm),
 			    zfs_spa_discard_memory_limit);
 			uint64_t offset =
 			    space_map_length(checkpoint_sm) - size;
 
 			/*
 			 * Ensure that the part of the space map that will
 			 * be destroyed by the synctask, is prefetched in
 			 * memory before the synctask runs.
 			 */
 			int error = dmu_buf_hold_array_by_bonus(
 			    checkpoint_sm->sm_dbuf, offset, size,
 			    B_TRUE, FTAG, &numbufs, &dbp);
 			if (error != 0) {
 				/*
 				 * Cast the id explicitly so the argument
 				 * matches %llu whatever uint64_t is.
 				 */
 				zfs_panic_recover("zfs: error %d was returned "
 				    "while prefetching checkpoint space map "
 				    "entries of vdev %llu\n",
 				    error, (u_longlong_t)vd->vdev_id);
 			}
 
 			VERIFY0(dsl_sync_task(spa->spa_name, NULL,
 			    spa_checkpoint_discard_thread_sync, vd,
 			    0, ZFS_SPACE_CHECK_NONE));
 
 			/*
 			 * zfs_panic_recover() is not guaranteed to halt, so
 			 * we can get here after a failed hold. dbp and
 			 * numbufs are only known to be valid when the hold
 			 * succeeded, so release only in that case.
 			 */
 			if (error == 0)
 				dmu_buf_rele_array(dbp, numbufs, FTAG);
 		}
 	}
 
 	VERIFY(spa_checkpoint_discard_is_done(spa));
 	VERIFY0(spa->spa_checkpoint_info.sci_dspace);
 	VERIFY0(dsl_sync_task(spa->spa_name, NULL,
 	    spa_checkpoint_discard_complete_sync, spa,
 	    0, ZFS_SPACE_CHECK_NONE));
 }
 
 
 /*
  * Check function of the checkpoint sync task: decides whether a checkpoint
  * may be taken right now. 'arg' is unused. Returns 0 when it may, otherwise
  * the error of the first condition that fails, tested in the order below.
  */
 static int
 spa_checkpoint_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	int error = 0;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = SET_ERROR(ENOTSUP);
 	} else if (!spa_top_vdevs_spacemap_addressable(spa)) {
 		error = SET_ERROR(ZFS_ERR_VDEV_TOO_BIG);
 	} else if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
 		error = SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS);
 	} else if (spa->spa_raidz_expand != NULL) {
 		error = SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS);
 	} else if (spa->spa_checkpoint_txg != 0) {
 		error = SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS);
 	} else if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		/* Feature active without a checkpoint txg: discard running. */
 		error = SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT);
 	}
 
 	return (error);
 }
 
 /*
  * Sync half of the checkpoint sync task, run as an early sync task by
  * spa_checkpoint(). It stores a copy of spa_ubsync in the MOS directory
  * under DMU_POOL_ZPOOL_CHECKPOINT, records the checkpoint txg and timestamp
  * in the spa, and increments the checkpoint feature refcount. 'arg' is
  * unused.
  */
 static void
 spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	spa_t *spa = dp->dp_spa;
 	uberblock_t checkpoint = spa->spa_ubsync;
 
 	/*
 	 * At this point, there should not be a checkpoint in the MOS.
 	 */
 	ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
 
 	ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
 	ASSERT0(spa->spa_checkpoint_info.sci_dspace);
 
 	/*
 	 * Since the checkpointed uberblock is the one that just got synced
 	 * (we use spa_ubsync), its txg must be equal to the txg number of
 	 * the txg we are syncing, minus 1.
 	 */
 	ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
 
 	/*
 	 * Once the checkpoint is in place, we need to ensure that none of
 	 * its blocks will be marked for reuse after it has been freed.
 	 * When there is a checkpoint and a block is freed, we compare its
 	 * birth txg to the txg of the checkpointed uberblock to see if the
 	 * block is part of the checkpoint or not. Therefore, we have to set
 	 * spa_checkpoint_txg before any frees happen in this txg (which is
 	 * why this is done as an early_synctask as explained in the comment
 	 * in spa_checkpoint()).
 	 */
 	spa->spa_checkpoint_txg = checkpoint.ub_txg;
 	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 
 	/*
 	 * Mark the local copy as a checkpointed uberblock before storing it;
 	 * it is written as an array of uint64_t words.
 	 */
 	checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
 	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
 	    sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
 	    &checkpoint, tx));
 
 	/*
 	 * Increment the feature refcount and thus activate the feature.
 	 * Note that the feature will be deactivated when we've
 	 * completely discarded all checkpointed state (both vdev
 	 * space maps and uberblock).
 	 */
 	spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
 
 	spa_history_log_internal(spa, "spa checkpoint", tx,
 	    "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
 }
 
 /*
  * Create a checkpoint for the pool.
  *
  * Opens the pool by name and, while holding spa_vdev_top_lock, waits for the
  * syncing txg to finish and then runs the checkpoint check/sync pair as an
  * early sync task.
  *
  * Returns 0 on success, or the error from spa_open() or from the sync task
  * (see spa_checkpoint_check() for the conditions that reject a checkpoint).
  */
 int
 spa_checkpoint(const char *pool)
 {
 	int error;
 	spa_t *spa;
 
 	error = spa_open(pool, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 
 	/*
 	 * Wait for current syncing txg to finish so the latest synced
 	 * uberblock (spa_ubsync) has all the changes that we expect
 	 * to see if we were to revert later to the checkpoint. In other
 	 * words we want the checkpointed uberblock to include/reference
 	 * all the changes that were pending at the time that we issued
 	 * the checkpoint command.
 	 */
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * As the checkpointed uberblock references blocks from the previous
 	 * txg (spa_ubsync) we want to ensure that we are not freeing any of
 	 * these blocks in the same txg that the following synctask will
 	 * run. Thus, we run it as an early synctask, so the dirty changes
 	 * that are synced to disk afterwards during zios and other synctasks
 	 * do not reuse checkpointed blocks.
 	 */
 	error = dsl_early_sync_task(pool, spa_checkpoint_check,
 	    spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
 
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Check function of the discard sync task. 'arg' is unused. Returns
  * ZFS_ERR_NO_CHECKPOINT when the feature is not active,
  * ZFS_ERR_DISCARDING_CHECKPOINT when a discard is already under way
  * (spa_checkpoint_txg is 0), and 0 otherwise. In the last case the
  * checkpoint entry must be present in the MOS directory.
  */
 static int
 spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	int error = 0;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = SET_ERROR(ZFS_ERR_NO_CHECKPOINT);
 	} else if (spa->spa_checkpoint_txg == 0) {
 		error = SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT);
 	} else {
 		VERIFY0(zap_contains(spa_meta_objset(spa),
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
 	}
 
 	return (error);
 }
 
 /*
  * Sync half of the discard sync task, run as an early sync task by
  * spa_checkpoint_discard(). It removes the checkpoint entry from the MOS
  * directory, clears spa_checkpoint_txg and wakes the discard zthr, which
  * does the remaining work. 'arg' is unused.
  */
 static void
 spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, tx));
 
 	spa->spa_checkpoint_txg = 0;
 
 	/* See spa_checkpoint_discard_thread() for the work this triggers. */
 	zthr_wakeup(spa->spa_checkpoint_discard_zthr);
 
 	spa_history_log_internal(spa, "spa discard checkpoint", tx,
 	    "started discarding checkpointed state from the pool");
 }
 
 /*
  * Discard the checkpoint from a pool.
  *
  * Returns the result of the early sync task: 0 on success, or the error
  * reported by spa_checkpoint_discard_check() or by dsl_early_sync_task()
  * itself.
  */
 int
 spa_checkpoint_discard(const char *pool)
 {
 	/*
 	 * Similarly to spa_checkpoint(), we want our synctask to run
 	 * before any pending dirty data are written to disk so they
 	 * won't end up in the checkpoint's data structures (e.g.
 	 * ms_checkpointing and vdev_checkpoint_sm) and re-create any
 	 * space maps that the discarding open-context thread has
 	 * deleted.
 	 * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
 	 */
 	return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
 	    spa_checkpoint_discard_sync, NULL, 0,
 	    ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
 }
 
 EXPORT_SYMBOL(spa_checkpoint_get_stats);
 EXPORT_SYMBOL(spa_checkpoint_discard_thread);
 EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW,
 	"Limit for memory used in prefetching the checkpoint space map done "
 	"on each vdev while discarding the checkpoint");
-/* END CSTYLED */
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
index a49e28ee7a43..18b3970ac0dc 100644
--- a/module/zfs/spa_errlog.c
+++ b/module/zfs/spa_errlog.c
@@ -1,1498 +1,1496 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2014, Delphix. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2021, 2022, George Amanakis. All rights reserved.
  */
 
 /*
  * Routines to manage the on-disk persistent error log.
  *
  * Each pool stores a log of all logical data errors seen during normal
  * operation.  This is actually the union of two distinct logs: the last log,
  * and the current log.  All errors seen are logged to the current log.  When a
  * scrub completes, the current log becomes the last log, the last log is thrown
  * out, and the current log is reinitialized.  This way, if an error is somehow
  * corrected, a new scrub will show that it no longer exists, and will be
  * deleted from the log when the scrub completes.
  *
  * The log is stored using a ZAP object whose key is a string form of the
  * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
  * optional 'objset:object' human-readable string describing the data.  When an
  * error is first logged, this string will be empty, indicating that no name is
  * known.  This prevents us from having to issue a potentially large amount of
  * I/O to discover the object name during an error path.  Instead, we do the
  * calculation when the data is requested, storing the result so future queries
  * will be faster.
  *
  * If the head_errlog feature is enabled, a different on-disk format is used.
  * The error log of each head dataset is stored separately in the zap object
  * and keyed by the head id. This enables listing every dataset affected in
  * userland. In order to be able to track whether an error block has been
  * modified or added to snapshots since it was marked as an error, a new tuple
  * is introduced: zbookmark_err_phys_t. It allows the storage of the birth
  * transaction group of an error block on-disk. The birth transaction group is
  * used by check_filesystem() to assess whether this block was freed,
  * re-written or added to a snapshot since its marking as an error.
  *
  * This log is then shipped into an nvlist where the key is the dataset name and
  * the value is the object name.  Userland is then responsible for uniquifying
  * this list and displaying it to the user.
  */
 
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_objset.h>
 #include <sys/dbuf.h>
 #include <sys/zfs_znode.h>
 
 #define	NAME_MAX_LEN 64
 
 /*
  * Work-list entry used while reporting error blocks: identifies a clone
  * dataset that check_filesystem() still has to examine.
  */
 typedef struct clones {
 	uint64_t clone_ds;	/* dataset object number of the clone */
 	list_node_t node;	/* linkage on the list of pending clones */
 } clones_t;
 
 /*
  * spa_upgrade_errlog_limit : A zfs module parameter that controls the number
  *		of on-disk error log entries that will be converted to the new
  *		format when enabling head_errlog. Defaults to 0 which converts
  *		all log entries.
  */
 static uint_t spa_upgrade_errlog_limit = 0;
 
 /*
  * Format a bookmark into buf (at most len bytes) as the colon-separated
  * hexadecimal string "objset:object:level:blkid".
  */
 static void
 bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
 {
 	u_longlong_t objset = (u_longlong_t)zb->zb_objset;
 	u_longlong_t object = (u_longlong_t)zb->zb_object;
 	u_longlong_t level = (u_longlong_t)zb->zb_level;
 	u_longlong_t blkid = (u_longlong_t)zb->zb_blkid;
 
 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
 	    objset, object, level, blkid);
 }
 
 /*
  * Format an err_phys into buf (at most len bytes) as the colon-separated
  * hexadecimal string "object:level:blkid:birth".
  */
 static void
 errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
 {
 	u_longlong_t object = (u_longlong_t)zep->zb_object;
 	u_longlong_t level = (u_longlong_t)zep->zb_level;
 	u_longlong_t blkid = (u_longlong_t)zep->zb_blkid;
 	u_longlong_t birth = (u_longlong_t)zep->zb_birth;
 
 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
 	    object, level, blkid, birth);
 }
 
 /*
  * Parse a string of the form "object:level:blkid:birth" into an err_phys.
  * The separators and the terminating NUL are only checked by ASSERT().
  */
 void
 name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
 {
 	char *end;
 
 	zep->zb_object = zfs_strtonum(buf, &end);
 	ASSERT(*end == ':');
 
 	zep->zb_level = (int)zfs_strtonum(end + 1, &end);
 	ASSERT(*end == ':');
 
 	zep->zb_blkid = zfs_strtonum(end + 1, &end);
 	ASSERT(*end == ':');
 
 	zep->zb_birth = zfs_strtonum(end + 1, &end);
 	ASSERT(*end == '\0');
 }
 
 /*
  * Parse a string of the form "objset:object:level:blkid" into a bookmark.
  * The separators and the terminating NUL are only checked by ASSERT().
  */
 static void
 name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	char *end;
 
 	zb->zb_objset = zfs_strtonum(buf, &end);
 	ASSERT(*end == ':');
 
 	zb->zb_object = zfs_strtonum(end + 1, &end);
 	ASSERT(*end == ':');
 
 	zb->zb_level = (int)zfs_strtonum(end + 1, &end);
 	ASSERT(*end == ':');
 
 	zb->zb_blkid = zfs_strtonum(end + 1, &end);
 	ASSERT(*end == '\0');
 }
 
 /*
  * Build a bookmark from an err_phys: object, level and blkid are copied
  * from zep, and the objset is taken from the 'dataset' argument.
  */
 void
 zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
 {
 	/* Fields carried over unchanged from the err_phys. */
 	zb->zb_blkid = zep->zb_blkid;
 	zb->zb_level = zep->zb_level;
 	zb->zb_object = zep->zb_object;
 
 	/* The err_phys has no objset of its own; the caller supplies it. */
 	zb->zb_objset = dataset;
 }
 
 /*
  * Parse a string holding a single number into *obj.  Trailing characters
  * after the number are only rejected by ASSERT().
  */
 static void
 name_to_object(char *buf, uint64_t *obj)
 {
 	char *end;
 
 	*obj = zfs_strtonum(buf, &end);
 	ASSERT(*end == '\0');
 }
 
 /*
  * Retrieve the head filesystem.
  *
  * Holds dataset 'dsobj', stores the dd_head_dataset_obj of its dsl_dir in
  * *head_ds and drops the hold again.  Returns 0 on success, or the error
  * from the hold, in which case *head_ds is left untouched.
  */
 static int
 get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds)
 {
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool,
 	    dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	if (error == 0) {
 		ASSERT(head_ds);
 		*head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Log an uncorrectable error to the persistent error log.  The error is
  * only queued on one of the spa's in-core AVL trees of pending errors here;
  * the changes are actually synced out to disk during spa_errlog_sync().
  * A bookmark that is already present in the chosen tree is not added again.
  */
 void
 spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth)
 {
 	spa_error_entry_t search;
 	avl_index_t where;
 
 	/*
 	 * Errors seen while merely trying to import a pool are dropped, as
 	 * we won't be writing to the pool any time soon.
 	 */
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return;
 
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * While a scrub is active, or once a log rotation has been
 	 * requested, new errors go to the scrub list instead of the
 	 * current one.
 	 */
 	avl_tree_t *tree = &spa->spa_errlist_last;
 	if (spa->spa_scrub_active || spa->spa_scrub_finished)
 		tree = &spa->spa_errlist_scrub;
 
 	search.se_bookmark = *zb;
 	if (avl_find(tree, &search, &where) == NULL) {
 		spa_error_entry_t *entry =
 		    kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
 
 		entry->se_bookmark = *zb;
 
 		/*
 		 * With head_errlog enabled the birth txg is recorded now:
 		 * if the file is deleted before spa_errlog_sync() runs,
 		 * the birth txg can no longer be retrieved.
 		 */
 		if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 			entry->se_zep.zb_object = zb->zb_object;
 			entry->se_zep.zb_level = zb->zb_level;
 			entry->se_zep.zb_blkid = zb->zb_blkid;
 			entry->se_zep.zb_birth = birth;
 		}
 
 		avl_insert(tree, entry, where);
 	}
 
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 /*
  * Look up the block pointer described by zep (object, level, blkid) in
  * dataset ds and store its logical birth txg in *birth_txg.
  *
  * Returns 0 on success, ENOENT if the block pointer is a hole, or the error
  * from dmu_objset_from_ds(), dnode_hold() or dbuf_dnode_findbp().
  * *birth_txg is written only when the block pointer lookup itself
  * succeeded (which includes the hole case); on any other failure it is
  * left untouched.
  */
 int
 find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
     uint64_t *birth_txg)
 {
 	objset_t *os;
 	int error = dmu_objset_from_ds(ds, &os);
 	if (error != 0)
 		return (error);
 
 	dnode_t *dn;
 	blkptr_t bp;
 
 	error = dnode_hold(os, zep->zb_object, FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL,
 	    NULL);
 
 	/*
 	 * bp has no initializer, so it may only be inspected once the
 	 * lookup has reported success; reading it after a failure would
 	 * hand the caller stack garbage.
 	 */
 	if (error == 0) {
 		*birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
 		if (BP_IS_HOLE(&bp))
 			error = SET_ERROR(ENOENT);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * This function finds the oldest affected filesystem containing an error
  * block: it asks dsl_dataset_oldest_snapshot() for the oldest snapshot of
  * head_ds relative to the block's birth txg and stores the head dataset of
  * that snapshot's dsl_dir in *top_affected_fs.
  *
  * Returns 0 on success or the error from the lookup or the dataset hold.
  */
 int
 find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     uint64_t *top_affected_fs)
 {
 	uint64_t oldest_dsobj;
 	dsl_dataset_t *ds;
 	int error;
 
 	error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
 	    &oldest_dsobj);
 	if (error == 0) {
 		error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool,
 		    oldest_dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	}
 	if (error != 0)
 		return (error);
 
 	*top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 	return (0);
 }
 
 
 #ifdef _KERNEL
 /*
  * Copy one bookmark into the last unused slot of the user-space array that
  * starts at uaddr and has *count unused entries, decrementing *count by 1.
  *
  * Returns ENOMEM if no slot is left and EFAULT if copyout() fails.  Note
  * that *count has already been decremented when EFAULT is returned.
  */
 static int
 copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
 {
 	char *slot;
 
 	if (*count == 0)
 		return (SET_ERROR(ENOMEM));
 
 	*count -= 1;
 	slot = (char *)uaddr + (*count) * sizeof (zbookmark_phys_t);
 
 	if (copyout(zb, slot, sizeof (zbookmark_phys_t)) != 0)
 		return (SET_ERROR(EFAULT));
 
 	return (0);
 }
 
 /*
  * Each time the error block is referenced by a snapshot or clone, add a
  * zbookmark_phys_t entry to the userspace array at uaddr. The array is
  * filled from the back and the in-out parameter *count is modified to be the
  * number of unused entries at the beginning of the array. The function
  * scrub_filesystem() is modelled after this one.
  *
  * Clones whose origin is one of the affected snapshots are not examined
  * here; they are appended to clones_list as clones_t entries and the caller
  * is expected to process (and free) them.
  *
  * Returns 0 on success, or an error from the dataset holds, zap_count() or
  * copyout_entry().
  */
 static int
 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     void *uaddr, uint64_t *count, list_t *clones_list)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	int error = dsl_dataset_hold_obj_flags(dp, head_ds,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t latest_txg;
 	uint64_t txg_to_consider = spa->spa_syncing_txg;
 	boolean_t check_snapshot = B_TRUE;
 	error = find_birth_txg(ds, zep, &latest_txg);
 
 	/*
 	 * If find_birth_txg() errors out otherwise, let txg_to_consider be
 	 * equal to the spa's syncing txg: if check_filesystem() errors out
 	 * then affected snapshots or clones will not be checked.
 	 */
 	if (error == 0 && zep->zb_birth == latest_txg) {
 		/* Block neither free nor rewritten. */
 		zbookmark_phys_t zb;
 		zep_to_zb(head_ds, zep, &zb);
 		error = copyout_entry(&zb, uaddr, count);
 		if (error != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			return (error);
 		}
 		check_snapshot = B_FALSE;
 	} else if (error == 0) {
 		txg_to_consider = latest_txg;
 	}
 
 	/*
 	 * Retrieve the number of snapshots if the dataset is not a snapshot.
 	 */
 	uint64_t snap_count = 0;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 
 		error = zap_count(spa->spa_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 
 		if (error != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			return (error);
 		}
 	}
 
 	if (snap_count == 0) {
 		/* Filesystem without snapshots. */
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		return (0);
 	}
 
 	/*
 	 * Object numbers of the affected snapshots, used below to match
 	 * clone origins.  Unused slots stay zero.
 	 */
 	uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t),
 	    KM_SLEEP);
 
 	int aff_snap_count = 0;
 	uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones;
 
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 	/* Check only snapshots created from this file system. */
 	while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
 	    snap_obj_txg <= txg_to_consider) {
 
 		error = dsl_dataset_hold_obj_flags(dp, snap_obj,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (error != 0)
 			goto out;
 
 		/* Skip snapshots that belong to a different head dataset. */
 		if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) {
 			snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 			snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			continue;
 		}
 
 		/*
 		 * If the head itself still references the block, every
 		 * snapshot in range is affected; otherwise each snapshot is
 		 * checked individually.
 		 */
 		boolean_t affected = B_TRUE;
 		if (check_snapshot) {
 			uint64_t blk_txg;
 			error = find_birth_txg(ds, zep, &blk_txg);
 			affected = (error == 0 && zep->zb_birth == blk_txg);
 		}
 
 		/* Report errors in snapshots. */
 		if (affected) {
 			snap_obj_array[aff_snap_count] = snap_obj;
 			aff_snap_count++;
 
 			zbookmark_phys_t zb;
 			zep_to_zb(snap_obj, zep, &zb);
 			error = copyout_entry(&zb, uaddr, count);
 			if (error != 0) {
 				dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT,
 				    FTAG);
 				goto out;
 			}
 		}
 		snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 		snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	}
 
 	if (zap_clone == 0 || aff_snap_count == 0) {
 		error = 0;
 		goto out;
 	}
 
 	/* Check clones. */
 	zap_cursor_t *zc;
 	zap_attribute_t *za;
 
 	zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 	za = zap_attribute_alloc();
 
 	for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 
 		dsl_dataset_t *clone;
 		error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &clone);
 
 		if (error != 0)
 			break;
 
 		/*
 		 * Only clones whose origins were affected could also
 		 * have affected snapshots.
 		 */
 		boolean_t found = B_FALSE;
 		for (uint64_t i = 0; i < snap_count; i++) {
 			if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj
 			    == snap_obj_array[i])
 				found = B_TRUE;
 		}
 		dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 		if (!found)
 			continue;
 
 		/* Queue the clone; the caller owns and frees the entry. */
 		clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
 		ct->clone_ds = za->za_first_integer;
 		list_insert_tail(clones_list, ct);
 	}
 
 	zap_cursor_fini(zc);
 	zap_attribute_free(za);
 	kmem_free(zc, sizeof (*zc));
 
 out:
 	/* The size passed to kmem_free() must match the allocation above. */
 	kmem_free(snap_obj_array, snap_count * sizeof (uint64_t));
 	return (error);
 }
 
 /*
  * Report one error block of head_ds to userland.  The filesystem returned
  * by find_top_affected_fs() is checked first, followed by every clone that
  * check_filesystem() queues up.  If nothing was copied out and no error
  * occurred, the block is handed to spa_remove_error().
  */
 static int
 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     void *uaddr, uint64_t *count)
 {
 	zbookmark_phys_t zb;
 
 	/*
 	 * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the
 	 * birth txg or the head filesystem of the block pointer. This may
 	 * happen e.g. when an encrypted filesystem is not mounted or when
 	 * the key is not loaded. In this case do not proceed to
 	 * check_filesystem(), instead do the accounting here.
 	 */
 	if (zep->zb_birth == 0 || head_ds == 0) {
 		zep_to_zb(head_ds, zep, &zb);
 		return (copyout_entry(&zb, uaddr, count));
 	}
 
 	uint64_t init_count = *count;
 	uint64_t top_affected_fs;
 	int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
 	if (error != 0)
 		return (error);
 
 	clones_t *ct;
 	list_t clones_list;
 
 	list_create(&clones_list, sizeof (clones_t),
 	    offsetof(clones_t, node));
 
 	error = check_filesystem(spa, top_affected_fs, zep,
 	    uaddr, count, &clones_list);
 
 	/* Each clone checked may append further clones to the list. */
 	while ((ct = list_remove_head(&clones_list)) != NULL) {
 		error = check_filesystem(spa, ct->clone_ds, zep,
 		    uaddr, count, &clones_list);
 		kmem_free(ct, sizeof (*ct));
 
 		if (error != 0) {
 			/* Give up: free whatever is still queued. */
 			while ((ct = list_remove_head(&clones_list)) != NULL)
 				kmem_free(ct, sizeof (*ct));
 			break;
 		}
 	}
 
 	list_destroy(&clones_list);
 
 	if (error == 0 && init_count == *count) {
 		/*
 		 * If we reach this point, no errors have been detected
 		 * in the checked filesystems/snapshots. Before returning mark
 		 * the error block to be removed from the error lists and logs.
 		 */
 		zep_to_zb(head_ds, zep, &zb);
 		spa_remove_error(spa, &zb, zep->zb_birth);
 	}
 
 	return (error);
 }
 #endif
 
 /*
  * Return the number of entries in the "last" on-disk error log, or 0 if
  * there is no such log or its entries cannot be counted.
  */
 uint64_t
 spa_get_last_errlog_size(spa_t *spa)
 {
 	uint64_t count;
 	uint64_t total = 0;
 
 	mutex_enter(&spa->spa_errlog_lock);
 	if (spa->spa_errlog_last != 0) {
 		if (zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
 		    &count) == 0)
 			total = count;
 	}
 	mutex_exit(&spa->spa_errlog_lock);
 
 	return (total);
 }
 
 /*
  * If a healed bookmark matches an entry in the error log we stash it in a tree
  * so that we can later remove the related log entries in sync context.
  *
  * Nothing is done when obj is 0.  Both spa_errlog_lock and spa_errlist_lock
  * are taken only if the calling thread does not already hold them, and each
  * is dropped again only if it was taken here.
  *
  * NOTE(review): obj is only compared against 0.  The legacy path looks the
  * name up in healed_zb->zb_objset and the head_errlog path always iterates
  * spa->spa_errlog_last, so both calls made by spa_remove_error() consult
  * the same objects -- confirm that this is intended.
  */
 static void
 spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
     const uint64_t birth)
 {
 	char name[NAME_MAX_LEN];
 
 	if (obj == 0)
 		return;
 
 	/* Record which locks were acquired here and must be released. */
 	boolean_t held_list = B_FALSE;
 	boolean_t held_log = B_FALSE;
 
 	/* Legacy format: entries are keyed by the full bookmark. */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		bookmark_to_name(healed_zb, name, sizeof (name));
 
 		if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset,
 		    name) == 0) {
 			if (!MUTEX_HELD(&spa->spa_errlog_lock)) {
 				mutex_enter(&spa->spa_errlog_lock);
 				held_log = B_TRUE;
 			}
 
 			/*
 			 * Found an error matching healed zb, add zb to our
 			 * tree of healed errors
 			 */
 			avl_tree_t *tree = &spa->spa_errlist_healed;
 			spa_error_entry_t search;
 			spa_error_entry_t *new;
 			avl_index_t where;
 			search.se_bookmark = *healed_zb;
 			if (!MUTEX_HELD(&spa->spa_errlist_lock)) {
 				mutex_enter(&spa->spa_errlist_lock);
 				held_list = B_TRUE;
 			}
 			/* Already stashed: nothing more to do. */
 			if (avl_find(tree, &search, &where) != NULL) {
 				if (held_list)
 					mutex_exit(&spa->spa_errlist_lock);
 				if (held_log)
 					mutex_exit(&spa->spa_errlog_lock);
 				return;
 			}
 			new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
 			new->se_bookmark = *healed_zb;
 			avl_insert(tree, new, where);
 			if (held_list)
 				mutex_exit(&spa->spa_errlist_lock);
 			if (held_log)
 				mutex_exit(&spa->spa_errlog_lock);
 		}
 		return;
 	}
 
 	/* head_errlog format: entries are keyed by the err_phys tuple. */
 	zbookmark_err_phys_t healed_zep;
 	healed_zep.zb_object = healed_zb->zb_object;
 	healed_zep.zb_level = healed_zb->zb_level;
 	healed_zep.zb_blkid = healed_zb->zb_blkid;
 	healed_zep.zb_birth = birth;
 
 	errphys_to_name(&healed_zep, name, sizeof (name));
 
 	/*
 	 * Look for the name in every ZAP object referenced by the entries
 	 * of spa_errlog_last.
 	 */
 	zap_cursor_t zc;
 	zap_attribute_t *za = zap_attribute_alloc();
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last);
 	    zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) {
 		if (zap_contains(spa->spa_meta_objset, za->za_first_integer,
 		    name) == 0) {
 			if (!MUTEX_HELD(&spa->spa_errlog_lock)) {
 				mutex_enter(&spa->spa_errlog_lock);
 				held_log = B_TRUE;
 			}
 
 			avl_tree_t *tree = &spa->spa_errlist_healed;
 			spa_error_entry_t search;
 			spa_error_entry_t *new;
 			avl_index_t where;
 			search.se_bookmark = *healed_zb;
 
 			if (!MUTEX_HELD(&spa->spa_errlist_lock)) {
 				mutex_enter(&spa->spa_errlist_lock);
 				held_list = B_TRUE;
 			}
 
 			/* Already stashed: release and keep scanning. */
 			if (avl_find(tree, &search, &where) != NULL) {
 				if (held_list)
 					mutex_exit(&spa->spa_errlist_lock);
 				if (held_log)
 					mutex_exit(&spa->spa_errlog_lock);
 				continue;
 			}
 			new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
 			new->se_bookmark = *healed_zb;
 			new->se_zep = healed_zep;
 			avl_insert(tree, new, where);
 
 			if (held_list)
 				mutex_exit(&spa->spa_errlist_lock);
 			if (held_log)
 				mutex_exit(&spa->spa_errlog_lock);
 		}
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(za);
 }
 
 /*
  * If this error exists in the given tree remove it and free its entry.
  * The lookup and removal are done under spa_errlist_lock.
  */
 static void
 remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb)
 {
 	spa_error_entry_t search;
 	spa_error_entry_t *found;
 	avl_index_t where;
 
 	search.se_bookmark = *zb;
 
 	mutex_enter(&spa->spa_errlist_lock);
 	found = avl_find(t, &search, &where);
 	if (found != NULL) {
 		avl_remove(t, found);
 		kmem_free(found, sizeof (spa_error_entry_t));
 	}
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 
 /*
  * Removes all of the recv healed errors from both on-disk error logs.
  *
  * Every entry of the spa_errlist_healed tree is destroyed.  For each one,
  * the matching bookmark is first dropped from the in-core trees s and l,
  * then the corresponding name is removed from the last and scrub on-disk
  * logs.  The caller must hold spa_errlog_lock.
  */
 static void
 spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
 {
 	char name[NAME_MAX_LEN];
 	spa_error_entry_t *se;
 	void *cookie = NULL;
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlog_lock));
 
 	for (;;) {
 		se = avl_destroy_nodes(&spa->spa_errlist_healed, &cookie);
 		if (se == NULL)
 			break;
 
 		remove_error_from_list(spa, s, &se->se_bookmark);
 		remove_error_from_list(spa, l, &se->se_bookmark);
 
 		if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 			/*
 			 * head_errlog format: the name has to be removed
 			 * from each ZAP object the two logs refer to.
 			 */
 			errphys_to_name(&se->se_zep, name, sizeof (name));
 			zap_cursor_t zc;
 			zap_attribute_t *za = zap_attribute_alloc();
 
 			zap_cursor_init(&zc, spa->spa_meta_objset,
 			    spa->spa_errlog_last);
 			while (zap_cursor_retrieve(&zc, za) == 0) {
 				(void) zap_remove(spa->spa_meta_objset,
 				    za->za_first_integer, name, tx);
 				zap_cursor_advance(&zc);
 			}
 			zap_cursor_fini(&zc);
 
 			zap_cursor_init(&zc, spa->spa_meta_objset,
 			    spa->spa_errlog_scrub);
 			while (zap_cursor_retrieve(&zc, za) == 0) {
 				(void) zap_remove(spa->spa_meta_objset,
 				    za->za_first_integer, name, tx);
 				zap_cursor_advance(&zc);
 			}
 			zap_cursor_fini(&zc);
 
 			zap_attribute_free(za);
 		} else {
 			/* Legacy format: one flat ZAP per log. */
 			bookmark_to_name(&se->se_bookmark, name, sizeof (name));
 			(void) zap_remove(spa->spa_meta_objset,
 			    spa->spa_errlog_last, name, tx);
 			(void) zap_remove(spa->spa_meta_objset,
 			    spa->spa_errlog_scrub, name, tx);
 		}
 
 		kmem_free(se, sizeof (spa_error_entry_t));
 	}
 }
 
 /*
  * Stash away healed bookmarks to remove them from the on-disk error logs
  * later in spa_remove_healed_errors().  The bookmark is offered once for
  * the last log and once for the scrub log.
  */
 void
 spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth)
 {
 	uint64_t last_obj = spa->spa_errlog_last;
 	spa_add_healed_error(spa, last_obj, zb, birth);
 
 	uint64_t scrub_obj = spa->spa_errlog_scrub;
 	spa_add_healed_error(spa, scrub_obj, zb, birth);
 }
 
 /*
  * Sum the entry counts of all ZAP objects referenced by the entries of the
  * error log spa_err_obj.  Objects whose entries cannot be counted are
  * ignored.  Returns 0 if spa_err_obj is 0.
  */
 static uint64_t
 approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj)
 {
 	uint64_t total = 0;
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 
 	if (spa_err_obj == 0)
 		return (0);
 
 	za = zap_attribute_alloc();
 	zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 	while (zap_cursor_retrieve(&zc, za) == 0) {
 		uint64_t count;
 
 		if (zap_count(spa->spa_meta_objset, za->za_first_integer,
 		    &count) == 0)
 			total += count;
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(za);
 
 	return (total);
 }
 
 /*
  * Return the approximate number of errors currently in the error log.  This
  * will be nonzero if there are some errors, but otherwise it may be more
  * or less than the number of entries returned by spa_get_errlog().
  *
  * The on-disk logs are counted under spa_errlog_lock and the pending
  * in-core lists under spa_errlist_lock.
  */
 uint64_t
 spa_approx_errlog_size(spa_t *spa)
 {
 	uint64_t total = 0;
 	const int head_errlog =
 	    spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG);
 
 	mutex_enter(&spa->spa_errlog_lock);
 	if (head_errlog) {
 		total += approx_errlog_size_impl(spa, spa->spa_errlog_last);
 		total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub);
 	} else {
 		uint64_t count;
 
 		if (spa->spa_errlog_scrub != 0 &&
 		    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
 		    &count) == 0)
 			total += count;
 
 		/* The last log is not counted once a scrub has finished. */
 		if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
 		    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
 		    &count) == 0)
 			total += count;
 	}
 	mutex_exit(&spa->spa_errlog_lock);
 
 	mutex_enter(&spa->spa_errlist_lock);
 	total += avl_numnodes(&spa->spa_errlist_last);
 	total += avl_numnodes(&spa->spa_errlist_scrub);
 	mutex_exit(&spa->spa_errlist_lock);
 
 	return (total);
 }
 
 /*
  * This function sweeps through an on-disk error log and stores all bookmarks
  * as error bookmarks in a new ZAP object. At the end we discard the old one,
  * and spa_update_errlog() will set the spa's on-disk error log to new ZAP
  * object.
  *
  * spa_err_obj is the old log; the object number of the newly created log is
  * returned through *newobj.  Entries that cannot be converted are skipped.
  */
 static void
 sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
     dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 	zbookmark_phys_t zb;
 	uint64_t count;
 
 	*newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
 	    DMU_OT_NONE, 0, tx);
 
 	/*
 	 * If we cannot perform the upgrade we should clear the old on-disk
 	 * error logs.
 	 */
 	if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) {
 		VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 		return;
 	}
 
 	za = zap_attribute_alloc();
 	for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
 	    zap_cursor_retrieve(&zc, za) == 0;
 	    zap_cursor_advance(&zc)) {
 		/* A limit of 0 means that every entry is converted. */
 		if (spa_upgrade_errlog_limit != 0 &&
 		    zc.zc_cd == spa_upgrade_errlog_limit)
 			break;
 
 		name_to_bookmark(za->za_name, &zb);
 
 		zbookmark_err_phys_t zep;
 		zep.zb_object = zb.zb_object;
 		zep.zb_level = zb.zb_level;
 		zep.zb_blkid = zb.zb_blkid;
 		zep.zb_birth = 0;
 
 		/*
 		 * In case of an error we should simply continue instead of
 		 * returning prematurely. See the next comment.
 		 */
 		uint64_t head_ds;
 		dsl_pool_t *dp = spa->spa_dsl_pool;
 		dsl_dataset_t *ds;
 		objset_t *os;
 
 		int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 		if (error != 0)
 			continue;
 
 		head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
 
 		/*
 		 * The objset and the dnode are required for getting the block
 		 * pointer, which is used to determine if BP_IS_HOLE(). If
 		 * getting the objset or the dnode fails, do not create a
 		 * zap entry (presuming we know the dataset) as this may create
 		 * spurious errors that we cannot ever resolve. If an error is
 		 * truly persistent, it should re-appear after a scan.
 		 */
 		if (dmu_objset_from_ds(ds, &os) != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			continue;
 		}
 
 		dnode_t *dn;
 		blkptr_t bp;
 		/* bp has no initializer: only look at it after a success. */
 		boolean_t bp_valid = B_FALSE;
 
 		if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) {
 			dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 			continue;
 		}
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp,
 		    NULL, NULL);
 		if (error == EACCES) {
 			/*
 			 * EACCES is tolerated: the entry is kept with a
 			 * birth txg of 0.
 			 */
 			error = 0;
 		} else if (error == 0) {
 			zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
 			bp_valid = B_TRUE;
 		}
 
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 
 		if (error != 0 || (bp_valid && BP_IS_HOLE(&bp)))
 			continue;
 
 		/* Find, or create, the per-head log inside the new object. */
 		uint64_t err_obj;
 		error = zap_lookup_int_key(spa->spa_meta_objset, *newobj,
 		    head_ds, &err_obj);
 
 		if (error == ENOENT) {
 			err_obj = zap_create(spa->spa_meta_objset,
 			    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 
 			(void) zap_update_int_key(spa->spa_meta_objset,
 			    *newobj, head_ds, err_obj, tx);
 		} else if (error != 0) {
 			/* err_obj was not set; skip rather than use it. */
 			continue;
 		}
 
 		char buf[64];
 		errphys_to_name(&zep, buf, sizeof (buf));
 
 		const char *name = "";
 		(void) zap_update(spa->spa_meta_objset, err_obj,
 		    buf, 1, strlen(name) + 1, name, tx);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(za);
 
 	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 }
 
 /*
  * Convert the last and the scrub on-disk error logs, where present, with
  * sync_upgrade_errlog().  Each converted log's new object number is stored
  * in the spa and written to the pool directory object.  Runs under
  * spa_errlog_lock.
  */
 void
 spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t *logs[] = {
 		&spa->spa_errlog_last,
 		&spa->spa_errlog_scrub
 	};
 	const char *keys[] = {
 		DMU_POOL_ERRLOG_LAST,
 		DMU_POOL_ERRLOG_SCRUB
 	};
 	uint64_t newobj = 0;
 
 	mutex_enter(&spa->spa_errlog_lock);
 	for (int i = 0; i < 2; i++) {
 		if (*logs[i] == 0)
 			continue;
 
 		sync_upgrade_errlog(spa, *logs[i], &newobj, tx);
 		*logs[i] = newobj;
 
 		(void) zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, keys[i],
 		    sizeof (uint64_t), 1, logs[i], tx);
 	}
 	mutex_exit(&spa->spa_errlog_lock);
 }
 
 #ifdef _KERNEL
 /*
  * Copy the entries of the on-disk error log 'obj' out to userland.
  * In the legacy format every entry is a bookmark that is copied out
  * directly.  In the head_errlog format every entry refers to a per-head
  * log whose entries are handed to process_error_block().
  *
  * If an error block is shared by two datasets it will be counted twice.
  *
  * Returns 0 on success (or if obj is 0), otherwise the first error hit.
  */
 static int
 process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
 {
 	int error = 0;
 
 	if (obj == 0)
 		return (0);
 
 	zap_cursor_t *zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 	zap_attribute_t *za = zap_attribute_alloc();
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (zap_cursor_init(zc, spa->spa_meta_objset, obj);
 		    zap_cursor_retrieve(zc, za) == 0;
 		    zap_cursor_advance(zc)) {
 			zbookmark_phys_t zb;
 
 			if (*count == 0) {
 				error = SET_ERROR(ENOMEM);
 				break;
 			}
 
 			name_to_bookmark(za->za_name, &zb);
 
 			error = copyout_entry(&zb, uaddr, count);
 			if (error != 0)
 				break;
 		}
 		goto out;
 	}
 
 	for (zap_cursor_init(zc, spa->spa_meta_objset, obj);
 	    zap_cursor_retrieve(zc, za) == 0;
 	    zap_cursor_advance(zc)) {
 
 		zap_cursor_t *head_ds_cursor =
 		    kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
 		zap_attribute_t *head_ds_attr = zap_attribute_alloc();
 
 		/* The key is the head dataset, the value its error log. */
 		uint64_t head_ds_err_obj = za->za_first_integer;
 		uint64_t head_ds;
 		name_to_object(za->za_name, &head_ds);
 
 		for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
 		    head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
 		    head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
 
 			zbookmark_err_phys_t head_ds_block;
 			name_to_errphys(head_ds_attr->za_name, &head_ds_block);
 			error = process_error_block(spa, head_ds,
 			    &head_ds_block, uaddr, count);
 			if (error != 0)
 				break;
 		}
 
 		zap_cursor_fini(head_ds_cursor);
 		kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
 		zap_attribute_free(head_ds_attr);
 
 		if (error != 0)
 			break;
 	}
 
 out:
 	zap_cursor_fini(zc);
 	zap_attribute_free(za);
 	kmem_free(zc, sizeof (*zc));
 	return (error);
 }
 
 /*
  * Copy the entries of an in-core error list out to userland.  In the legacy
  * format the stored bookmarks are copied out directly; in the head_errlog
  * format each entry is handed to process_error_block() together with its
  * head dataset.  Returns 0 on success, otherwise the first error hit.
  */
 static int
 process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
 {
 	spa_error_entry_t *se;
 	int error;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		se = avl_first(list);
 		while (se != NULL) {
 			error = copyout_entry(&se->se_bookmark, uaddr, count);
 			if (error != 0)
 				return (error);
 			se = AVL_NEXT(list, se);
 		}
 		return (0);
 	}
 
 	se = avl_first(list);
 	while (se != NULL) {
 		uint64_t head_ds = 0;
 
 		/*
 		 * If get_head_ds() errors out, set the head filesystem
 		 * to the filesystem stored in the bookmark of the
 		 * error block.
 		 */
 		if (get_head_ds(spa, se->se_bookmark.zb_objset,
 		    &head_ds) != 0)
 			head_ds = se->se_bookmark.zb_objset;
 
 		error = process_error_block(spa, head_ds,
 		    &se->se_zep, uaddr, count);
 		if (error != 0)
 			return (error);
 
 		se = AVL_NEXT(list, se);
 	}
 	return (0);
 }
 #endif
 
 /*
  * Copy all known errors to userland as an array of bookmarks.  This is
  * actually a union of the on-disk last log and current log, as well as any
  * pending error requests.
  *
  * Because the act of reading the on-disk log could cause errors to be
  * generated, we have two separate locks: one for the error log and one for the
  * in-core error lists.  We only need the error list lock to log an error, so
  * we grab the error log lock while we read the on-disk logs, and only pick up
  * the error list lock when we are finished.
  *
  * Outside of the kernel this is a no-op that returns 0.
  */
 int
 spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
 {
 	int ret = 0;
 
 #ifdef _KERNEL
 	/*
 	 * The pool config lock is needed to hold a dataset_t via (among other
 	 * places) process_error_list() -> process_error_block()->
 	 * find_top_affected_fs(), and lock ordering requires that we get it
 	 * before the spa_errlog_lock.
 	 */
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	mutex_enter(&spa->spa_errlog_lock);
 
 	/* On-disk logs first; the last log is skipped after a scrub. */
 	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
 	if (ret == 0 && !spa->spa_scrub_finished) {
 		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
 		    count);
 	}
 
 	/* Then the pending in-core lists, stopping at the first error. */
 	mutex_enter(&spa->spa_errlist_lock);
 	if (ret == 0) {
 		ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr,
 		    count);
 	}
 	if (ret == 0) {
 		ret = process_error_list(spa, &spa->spa_errlist_last, uaddr,
 		    count);
 	}
 	mutex_exit(&spa->spa_errlist_lock);
 
 	mutex_exit(&spa->spa_errlog_lock);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 #else
 	(void) spa, (void) uaddr, (void) count;
 #endif
 
 	return (ret);
 }
 
 /*
  * Called when a scrub completes.  This simply sets a flag which tells which
  * AVL tree to add new errors to.  spa_errlog_sync() is responsible for
  * actually syncing the changes to the underlying objects.
  */
 void
 spa_errlog_rotate(spa_t *spa)
 {
 	/* spa_scrub_finished is updated under the error list lock. */
 	mutex_enter(&spa->spa_errlist_lock);
 	spa->spa_scrub_finished = B_TRUE;
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 /*
  * Discard any pending errors from the spa_t.  Called when unloading a faulted
  * pool, as the errors encountered during the open cannot be synced to disk.
  */
 void
 spa_errlog_drain(spa_t *spa)
 {
 	/* The in-core lists to empty, in the order they are torn down. */
 	avl_tree_t *pending[] = {
 		&spa->spa_errlist_last,
 		&spa->spa_errlist_scrub
 	};
 
 	mutex_enter(&spa->spa_errlist_lock);
 
 	for (size_t i = 0; i < sizeof (pending) / sizeof (pending[0]); i++) {
 		spa_error_entry_t *entry;
 		void *cookie = NULL;
 
 		/* Unlink and free every entry still on this list. */
 		while ((entry = avl_destroy_nodes(pending[i],
 		    &cookie)) != NULL)
 			kmem_free(entry, sizeof (spa_error_entry_t));
 	}
 
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
 /*
  * Process a list of errors into the current on-disk log.
  *
  * Every entry of the in-core tree 't' is written to the log object '*obj' in
  * the MOS under transaction 'tx'.  If '*obj' is 0 a new log object is created
  * first and its id is stored back through 'obj'.  Without the head_errlog
  * feature each entry is stored directly in '*obj', keyed by its bookmark
  * name.  With the feature, '*obj' maps a head dataset to a per-dataset log
  * object (created on demand) and the entry is stored there, keyed by its
  * zbookmark_err_phys_t name.  An entry whose per-dataset log cannot be
  * looked up (for a reason other than it not existing yet) is not written.
  * In all cases the tree is emptied and its entries are freed before
  * returning.  An empty tree is a no-op.
  */
 void
 sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
 {
 	spa_error_entry_t *se;
 	char buf[NAME_MAX_LEN];
 	void *cookie;
 
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/* create log if necessary */
 	if (*obj == 0)
 		*obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG,
 		    DMU_OT_NONE, 0, tx);
 
 	/* add errors to the current log */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
 			bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
 
 			const char *name = se->se_name ? se->se_name : "";
 			(void) zap_update(spa->spa_meta_objset, *obj, buf, 1,
 			    strlen(name) + 1, name, tx);
 		}
 	} else {
 		for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
 			zbookmark_err_phys_t zep;
 			zep.zb_object = se->se_zep.zb_object;
 			zep.zb_level = se->se_zep.zb_level;
 			zep.zb_blkid = se->se_zep.zb_blkid;
 			zep.zb_birth = se->se_zep.zb_birth;
 
 			uint64_t head_ds = 0;
 			int error = get_head_ds(spa, se->se_bookmark.zb_objset,
 			    &head_ds);
 
 			/*
 			 * If get_head_ds() errors out, set the head filesystem
 			 * to the filesystem stored in the bookmark of the
 			 * error block.
 			 */
 			if (error != 0)
 				head_ds = se->se_bookmark.zb_objset;
 
 			uint64_t err_obj;
 			error = zap_lookup_int_key(spa->spa_meta_objset,
 			    *obj, head_ds, &err_obj);
 
 			if (error == ENOENT) {
 				/* First error for this head: create its log. */
 				err_obj = zap_create(spa->spa_meta_objset,
 				    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 
 				(void) zap_update_int_key(spa->spa_meta_objset,
 				    *obj, head_ds, err_obj, tx);
 			} else if (error != 0) {
 				/*
 				 * The lookup failed without producing an
 				 * object id, so err_obj is not valid.  Skip
 				 * this entry rather than updating an
 				 * arbitrary MOS object.
 				 */
 				continue;
 			}
 			errphys_to_name(&zep, buf, sizeof (buf));
 
 			const char *name = se->se_name ? se->se_name : "";
 			(void) zap_update(spa->spa_meta_objset,
 			    err_obj, buf, 1, strlen(name) + 1, name, tx);
 		}
 	}
 	/* purge the error list */
 	cookie = NULL;
 	while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
 		kmem_free(se, sizeof (spa_error_entry_t));
 }
 
 static void
 delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx)
 {
 	/*
 	 * With head_errlog enabled each entry of the top-level log refers
 	 * to another object through its first integer; free those objects
 	 * before freeing the top-level log itself.
 	 */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
 		zap_cursor_t cursor;
 		zap_attribute_t *attr = zap_attribute_alloc();
 
 		zap_cursor_init(&cursor, spa->spa_meta_objset, spa_err_obj);
 		while (zap_cursor_retrieve(&cursor, attr) == 0) {
 			VERIFY0(dmu_object_free(spa->spa_meta_objset,
 			    attr->za_first_integer, tx));
 			zap_cursor_advance(&cursor);
 		}
 		zap_cursor_fini(&cursor);
 		zap_attribute_free(attr);
 	}
 
 	VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx));
 }
 
 /*
  * Sync the error log out to disk.  This is a little tricky because the act of
  * writing the error log requires the spa_errlist_lock.  So, we need to lock the
  * error lists, take a copy of the lists, and then reinitialize them.  Then, we
  * drop the error list lock and take the error log lock, at which point we
  * do the errlog processing.  Then, if we encounter an I/O error during this
  * process, we can successfully add the error to the list.  Note that this will
  * result in the perpetual recycling of errors, but it is an unlikely situation
  * and not a performance critical operation.
  *
  * 'txg' is the transaction group the updates are assigned to; all changes are
  * made under a single dmu_tx_t created for that txg and committed before
  * returning.
  */
 void
 spa_errlog_sync(spa_t *spa, uint64_t txg)
 {
 	dmu_tx_t *tx;
 	avl_tree_t scrub, last;
 	int scrub_finished;
 
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * Bail out early under normal circumstances.
 	 */
 	if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
 	    avl_numnodes(&spa->spa_errlist_last) == 0 &&
 	    avl_numnodes(&spa->spa_errlist_healed) == 0 &&
 	    !spa->spa_scrub_finished) {
 		mutex_exit(&spa->spa_errlist_lock);
 		return;
 	}
 
 	/*
 	 * Snapshot the pending lists and the scrub-finished flag while the
 	 * error list lock is held, and clear the flag so that a rotation is
 	 * performed only once.
 	 */
 	spa_get_errlists(spa, &last, &scrub);
 	scrub_finished = spa->spa_scrub_finished;
 	spa->spa_scrub_finished = B_FALSE;
 
 	mutex_exit(&spa->spa_errlist_lock);
 
 	/*
 	 * The pool config lock is needed to hold a dataset_t via
 	 * sync_error_list() -> get_head_ds(), and lock ordering
 	 * requires that we get it before the spa_errlog_lock.
 	 */
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	mutex_enter(&spa->spa_errlog_lock);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	/*
 	 * Remove healed errors from errors.
 	 */
 	spa_remove_healed_errors(spa, &last, &scrub, tx);
 
 	/*
 	 * Sync out the current list of errors.
 	 */
 	sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
 
 	/*
 	 * Rotate the log if necessary: the old last log is deleted, the
 	 * scrub log becomes the new last log, and any scrub errors still
 	 * pending in memory are synced into it.
 	 */
 	if (scrub_finished) {
 		if (spa->spa_errlog_last != 0)
 			delete_errlog(spa, spa->spa_errlog_last, tx);
 		spa->spa_errlog_last = spa->spa_errlog_scrub;
 		spa->spa_errlog_scrub = 0;
 
 		sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
 	}
 
 	/*
 	 * Sync out any pending scrub errors.  If the log was rotated above,
 	 * sync_error_list() has already emptied 'scrub', so this call returns
 	 * without doing anything.
 	 */
 	sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
 
 	/*
 	 * Update the MOS to reflect the new values.
 	 */
 	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
 	    &spa->spa_errlog_last, tx);
 	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
 	    &spa->spa_errlog_scrub, tx);
 
 	dmu_tx_commit(tx);
 
 	mutex_exit(&spa->spa_errlog_lock);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 }
 
 static void
 delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds,
     dmu_tx_t *tx)
 {
 	/* No log object, nothing to clean up. */
 	if (spa_err_obj == 0)
 		return;
 
 	zap_cursor_t cursor;
 	zap_attribute_t *attr = zap_attribute_alloc();
 
 	zap_cursor_init(&cursor, spa->spa_meta_objset, spa_err_obj);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		uint64_t entry_ds;
 
 		name_to_object(attr->za_name, &entry_ds);
 		if (entry_ds != ds) {
 			zap_cursor_advance(&cursor);
 			continue;
 		}
 
 		/*
 		 * Found the entry for 'ds': drop it from the top-level log
 		 * and free the object it refers to, then stop searching.
 		 */
 		(void) zap_remove(spa->spa_meta_objset, spa_err_obj,
 		    attr->za_name, tx);
 		VERIFY0(dmu_object_free(spa->spa_meta_objset,
 		    attr->za_first_integer, tx));
 		break;
 	}
 	zap_cursor_fini(&cursor);
 	zap_attribute_free(attr);
 }
 
 /*
  * Remove the error log entry of dataset 'ds' from both the scrub log and the
  * last log, as part of transaction 'tx'.  Both logs are processed while
  * holding the spa_errlog_lock.
  */
 void
 spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx)
 {
 	mutex_enter(&spa->spa_errlog_lock);
 	delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx);
 	delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx);
 	mutex_exit(&spa->spa_errlog_lock);
 }
 
 /*
  * Starting at dataset 'old_head', follow the ds_prev_snap_obj chain until a
  * dataset is reached whose dsl_dir reports 'new_head' as its head dataset.
  * On success '*txg' is set to the ds_prev_snap_txg that was recorded by the
  * dataset visited immediately before that one (i.e. the txg associated with
  * the link leading to the matching dataset) and 0 is returned.
  *
  * Returns the error from dsl_dataset_hold_obj_flags() if any dataset along
  * the chain cannot be held; '*txg' is left untouched in that case.  At most
  * one dataset hold is outstanding at any time and none is held on return.
  */
 static int
 find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head,
     uint64_t *txg)
 {
 	dsl_dataset_t *ds;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	int error = dsl_dataset_hold_obj_flags(dp, old_head,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
 	if (error != 0)
 		return (error);
 
 	uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 
 	while (prev_obj != 0) {
 		/* Swap the hold on the current dataset for one on prev_obj. */
 		dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 		if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj,
 		    DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 &&
 		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head)
 			break;
 
 		/* The hold failed, so there is nothing to release here. */
 		if (error != 0)
 			return (error);
 
 		prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 		prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	/* Reaching the end of the chain without a match is not expected. */
 	ASSERT(prev_obj != 0);
 	*txg = prev_obj_txg;
 	return (0);
 }
 
 static void
 swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head,
     uint64_t old_head, dmu_tx_t *tx)
 {
 	uint64_t old_log;
 	uint64_t new_log;
 	uint64_t txg;
 
 	if (spa_err_obj == 0)
 		return;
 
 	/* Nothing to move unless the old head already has an error log. */
 	if (zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, old_head,
 	    &old_log) != 0)
 		return;
 
 	if (find_txg_ancestor_snapshot(spa, new_head, old_head, &txg) != 0)
 		return;
 
 	/*
 	 * The file system being promoted gets an error log of its own if it
 	 * does not have one yet.
 	 */
 	if (zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head,
 	    &new_log) != 0) {
 		new_log = zap_create(spa->spa_meta_objset,
 		    DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx);
 
 		(void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj,
 		    new_head, new_log, tx);
 	}
 
 	/*
 	 * Move every entry whose block was born before 'txg' from the old
 	 * head's log to the new head's log.
 	 */
 	zap_cursor_t cursor;
 	zap_attribute_t *attr = zap_attribute_alloc();
 	zbookmark_err_phys_t zep;
 
 	zap_cursor_init(&cursor, spa->spa_meta_objset, old_log);
 	while (zap_cursor_retrieve(&cursor, attr) == 0) {
 		const char *value = "";
 
 		name_to_errphys(attr->za_name, &zep);
 		if (zep.zb_birth < txg) {
 			(void) zap_update(spa->spa_meta_objset, new_log,
 			    attr->za_name, 1, strlen(value) + 1, value, tx);
 
 			(void) zap_remove(spa->spa_meta_objset, old_log,
 			    attr->za_name, tx);
 		}
 		zap_cursor_advance(&cursor);
 	}
 	zap_cursor_fini(&cursor);
 	zap_attribute_free(attr);
 }
 
 /*
  * Apply swap_errlog() for the pair ('new_head_ds', 'old_head_ds') to both the
  * scrub log and the last log, as part of transaction 'tx'.  Both logs are
  * processed while holding the spa_errlog_lock.
  */
 void
 spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
     dmu_tx_t *tx)
 {
 	mutex_enter(&spa->spa_errlog_lock);
 	swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx);
 	swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx);
 	mutex_exit(&spa->spa_errlog_lock);
 }
 
 #if defined(_KERNEL)
 /* error handling */
 EXPORT_SYMBOL(spa_log_error);
 EXPORT_SYMBOL(spa_approx_errlog_size);
 EXPORT_SYMBOL(spa_get_last_errlog_size);
 EXPORT_SYMBOL(spa_get_errlog);
 EXPORT_SYMBOL(spa_errlog_rotate);
 EXPORT_SYMBOL(spa_errlog_drain);
 EXPORT_SYMBOL(spa_errlog_sync);
 EXPORT_SYMBOL(spa_get_errlists);
 EXPORT_SYMBOL(spa_delete_dataset_errlog);
 EXPORT_SYMBOL(spa_swap_errlog);
 EXPORT_SYMBOL(sync_error_list);
 EXPORT_SYMBOL(spa_upgrade_errlog);
 EXPORT_SYMBOL(find_top_affected_fs);
 EXPORT_SYMBOL(find_birth_txg);
 EXPORT_SYMBOL(zep_to_zb);
 EXPORT_SYMBOL(name_to_errphys);
 #endif
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, UINT, ZMOD_RW,
 	"Limit the number of errors which will be upgraded to the new "
 	"on-disk error log when enabling head_errlog");
-/* END CSTYLED */
diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c
index f55218e3579b..a95152608578 100644
--- a/module/zfs/spa_log_spacemap.c
+++ b/module/zfs/spa_log_spacemap.c
@@ -1,1408 +1,1406 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/spa_log_spacemap.h>
 #include <sys/vdev_impl.h>
 #include <sys/zap.h>
 
 /*
  * Log Space Maps
  *
  * Log space maps are an optimization in ZFS metadata allocations for pools
  * whose workloads are primarily random-writes. Random-write workloads are also
  * typically random-free, meaning that they are freeing from locations scattered
  * throughout the pool. This means that each TXG we will have to append some
  * FREE records to almost every metaslab. With log space maps, we hold their
  * changes in memory and log them altogether in one pool-wide space map on-disk
  * for persistence. As more blocks are accumulated in the log space maps and
  * more unflushed changes are accounted in memory, we flush a selected group
  * of metaslabs every TXG to relieve memory pressure and potential overheads
  * when loading the pool. Flushing a metaslab to disk relieves memory as we
  * flush any unflushed changes from memory to disk (i.e. the metaslab's space
  * map) and saves import time by making old log space maps obsolete and
  * eventually destroying them. [A log space map is said to be obsolete when all
  * its entries have made it to their corresponding metaslab space maps].
  *
  * == On disk data structures used ==
  *
  * - The pool has a new feature flag and a new entry in the MOS. The feature
  *   is activated when we create the first log space map and remains active
  *   for the lifetime of the pool. The new entry in the MOS Directory [refer
  *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
  *   pairs are of the form <key: txg, value: log space map object for that txg>.
  *   This entry is our on-disk reference of the log space maps that exist in
  *   the pool for each TXG and it is used during import to load all the
  *   metaslab unflushed changes in memory. To see how this structure is first
  *   created and later populated refer to spa_generate_syncing_log_sm(). To see
  *   how it is used during import time refer to spa_ld_log_sm_metadata().
  *
  * - Each vdev has a new entry in its vdev_top_zap (see field
  *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
  *   each metaslab in this vdev. This field is the on-disk counterpart of the
  *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
  *   the metaslab hasn't had its changes flushed. During import, we use this
  *   to ignore any entries in the space map log that are for this metaslab but
  *   from a TXG before msp_unflushed_txg. At that point, we also populate its
  *   in-memory counterpart and from there both fields are updated every time
  *   we flush that metaslab.
  *
  * - A space map is created every TXG and, during that TXG, it is used to log
  *   all incoming changes (the log space map). When created, the log space map
  *   is referenced in memory by spa_syncing_log_sm and its object ID is inserted
  *   to the space map ZAP mentioned above. The log space map is closed at the
  *   end of the TXG and will be destroyed when it becomes fully obsolete. We
  *   know when a log space map has become obsolete by looking at the oldest
  *   (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
  *   than the log space map's TXG, then it means that there is no metaslab who
  *   doesn't have the changes from that log and we can therefore destroy it.
  *   [see spa_cleanup_old_sm_logs()].
  *
  * == Important in-memory structures ==
  *
  * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
  *   the pool by their ms_unflushed_txg field. It is primarily used for three
  *   reasons. First of all, it is used during flushing where we try to flush
  *   metaslabs in-order from the oldest-flushed to the most recently flushed
  *   every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
  *   oldest flushed metaslab to distinguish which log space maps have become
  *   obsolete and which ones are still relevant. Finally it tells us which
  *   metaslabs have unflushed changes in a pool where this feature was just
  *   enabled, as we don't immediately add all of the pool's metaslabs but we
  *   add them over time as they go through metaslab_sync(). The reason that
  *   we do that is to ease these pools into the behavior of the flushing
  *   algorithm (described later on).
  *
  * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory
  *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
  *   nodes represent the log space maps in the pool. This in-memory
  *   representation of log space maps in the pool sorts the log space maps by
  *   the TXG that they were created (which is also the TXG of their unflushed
  *   changes). It also contains the following extra information for each
  *   space map:
  *   [1] The number of metaslabs that were last flushed on that TXG. This is
  *       important because if that counter is zero and this is the oldest
  *       log then it means that it is also obsolete.
  *   [2] The number of blocks of that space map. This field is used by the
  *       block heuristic of our flushing algorithm (described later on).
  *       It represents how many blocks of metadata changes ZFS had to write
  *       to disk for that TXG.
  *
  * - The per-spa field spa_log_summary is a list of entries that summarizes
  *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
  *   AVL tree mentioned above. The reason this exists is that our flushing
  *   algorithm (described later) tries to estimate how many metaslabs to flush
  *   in each TXG by iterating over all the log space maps and looking at their
  *   block counts. Summarizing that information means that we don't have to
  *   iterate through each space map, minimizing the runtime overhead of the
  *   flushing algorithm which would be induced in syncing context. In terms of
  *   implementation the log summary is used as a queue:
  *   * we modify or pop entries from its head when we flush metaslabs
  *   * we modify or append entries to its tail when we sync changes.
  *
  * - Each metaslab has two new range trees that hold its unflushed changes,
  *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
  *
  * == Flushing algorithm ==
  *
  * The decision of how many metaslabs to flush on a given TXG is guided by
  * two heuristics:
  *
  * [1] The memory heuristic -
  * We keep track of the memory used by the unflushed trees from all the
  * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
  * stays below a certain threshold which is determined by an arbitrary hard
  * limit and an arbitrary percentage of the system's memory [see
  * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
  * unflushed changes are passing that threshold, we flush metaslabs, which
  * empties their unflushed range trees, reducing the memory used.
  *
  * [2] The block heuristic -
  * We try to keep the total number of blocks in the log space maps in check
  * so the log doesn't grow indefinitely and we don't induce a lot of overhead
  * when loading the pool. At the same time we don't want to flush a lot of
  * metaslabs too often as this would defeat the purpose of the log space map.
  * As a result we set a limit in the amount of blocks that we think it's
  * acceptable for the log space maps to have and try not to cross it.
  * [see sus_blocklimit from spa_unflushed_stats].
  *
  * In order to stay below the block limit every TXG we have to estimate how
  * many metaslabs we need to flush based on the current rate of incoming blocks
  * and our history of log space map blocks. The main idea here is to answer
  * the question of how many metaslabs do we need to flush in order to get rid
  * of at least an X amount of log space map blocks. We can answer this question
  * by iterating backwards from the oldest log space map to the newest one
  * and looking at their metaslab and block counts. At this point the log summary
  * mentioned above comes handy as it reduces the amount of things that we have
  * to iterate (even though it may reduce the preciseness of our estimates due
  * to its aggregation of data). So with that in mind, we project the incoming
  * rate of the current TXG into the future and attempt to approximate how many
  * metaslabs would we need to flush from now in order to avoid exceeding our
  * block limit in different points in the future (granted that we would keep
  * flushing the same number of metaslabs for every TXG). Then we take the
  * maximum number from all these estimates to be on the safe side. For the
  * exact implementation details of algorithm refer to
  * spa_estimate_metaslabs_to_flush.
  */
 
 /*
  * This is used as the block size for the space maps used for the
  * log space map feature. These space maps benefit from a bigger
  * block size as we expect to be writing a lot of data to them at
  * once.
  */
 static const unsigned long zfs_log_sm_blksz = 1ULL << 17;
 
 /*
  * Percentage of the overall system's memory that ZFS allows to be
  * used for unflushed changes (e.g. the sum of size of all the nodes
  * in the unflushed trees).
  *
  * Note that this value is calculated over 1000000 for finer granularity
  * (thus the _ppm suffix; reads as "parts per million"). As an example,
  * the default of 1000 allows 0.1% of memory to be used.
  */
 static uint64_t zfs_unflushed_max_mem_ppm = 1000;
 
 /*
  * Specific hard-limit in memory that ZFS allows to be used for
  * unflushed changes.
  */
 static uint64_t zfs_unflushed_max_mem_amt = 1ULL << 30;
 
 /*
  * The following tunable determines the number of blocks that can be used for
  * the log space maps. It is expressed as a percentage of the total number of
  * metaslabs in the pool (i.e. the default of 400 means that the number of log
  * blocks is capped at 4 times the number of metaslabs).
  *
  * This value exists to tune our flushing algorithm, with higher values
  * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
  * flushing metaslabs more aggressively with the upside of saving overheads
  * when loading the pool. Another factor in this tradeoff is that flushing
  * less often can potentially lead to better utilization of the metaslab space
  * map's block size as we accumulate more changes per flush.
  *
  * This tunable indirectly controls the flush rate (metaslabs flushed per
  * txg), which is why expressing it as a percentage of the number of
  * metaslabs in the pool makes sense here.
  *
  * As a rule of thumb we default this tunable to 400% based on the following:
  *
  * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
  *    it is reasonable to expect that the amount of obsolete entries changes
  *    linearly from txg to txg (e.g. the oldest log should have the most
  *    obsolete entries, and the most recent one the least). With this we could
  *    say that, at any given time, about half of the entries in the whole space
  *    map log are obsolete. Thus for every two entries for a metaslab in the
  *    log space map, only one of them is valid and actually makes it to the
  *    metaslab's space map.
  *    [factor of 2]
  * 2] Each entry in the log space map is guaranteed to be two words while
  *    entries in metaslab space maps are generally single-word.
  *    [an extra factor of 2 - 400% overall]
  * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
  *    account any consolidation of segments from the log space map to the
  *    unflushed range trees nor their history (e.g. a segment being allocated,
  *    then freed, then allocated again means 3 log space map entries but 0
  *    metaslab space map entries). Depending on the workload, we've seen ~1.8
  *    non-obsolete log space map entries per metaslab entry, for a total of
  *    ~600%. Since most of these estimates though are workload dependent, we
  *    default on 400% to be conservative.
  *
  *    Thus we could say that even in the worst
  *    case of [1] and [2], the factor should end up being 4.
  *
  * That said, regardless of the number of metaslabs in the pool we need to
  * provide upper and lower bounds for the log block limit.
  * [see zfs_unflushed_log_block_{min,max}]
  */
 static uint_t zfs_unflushed_log_block_pct = 400;
 
 /*
  * If the number of metaslabs is small and our incoming rate is high, we could
  * get into a situation that we are flushing all our metaslabs every TXG. Thus
  * we always allow at least this many log blocks.
  */
 static uint64_t zfs_unflushed_log_block_min = 1000;
 
 /*
  * If the log becomes too big, the import time of the pool can take a hit in
  * terms of performance. Thus we have a hard limit in the size of the log in
  * terms of blocks.
  */
 static uint64_t zfs_unflushed_log_block_max = (1ULL << 17);
 
 /*
  * Also we have a hard limit in the size of the log in terms of dirty TXGs.
  */
 static uint64_t zfs_unflushed_log_txg_max = 1000;
 
 /*
  * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
  * stability of the flushing algorithm (longer summary) vs its runtime overhead
  * (smaller summary is faster to traverse).
  */
 static uint64_t zfs_max_logsm_summary_length = 10;
 
 /*
  * Tunable that sets the lower bound on the metaslabs to flush every TXG.
  *
  * Setting this to 0 has no effect since if the pool is idle we won't even be
  * creating log space maps and therefore we won't be flushing. On the other
  * hand if the pool has any incoming workload our block heuristic will start
  * flushing metaslabs anyway.
  *
  * The point of this tunable is to be used in extreme cases where we really
  * want to flush more metaslabs than our adaptable heuristic plans to flush.
  */
 static uint64_t zfs_min_metaslabs_to_flush = 1;
 
 /*
  * Tunable that specifies how far in the past do we want to look when trying to
  * estimate the incoming log blocks for the current TXG.
  *
  * Setting this too high may not only increase runtime but also minimize the
  * effect of the incoming rates from the most recent TXGs as we take the
  * average over all the blocks that we walk
  * [see spa_estimate_incoming_log_blocks].
  */
 static uint64_t zfs_max_log_walking = 5;
 
 /*
  * This tunable exists solely for testing purposes. It ensures that the log
  * spacemaps are not flushed and destroyed during export in order for the
  * relevant log spacemap import code paths to be tested (effectively simulating
  * a crash).
  */
 int zfs_keep_log_spacemaps_at_export = 0;
 
 static uint64_t
 spa_estimate_incoming_log_blocks(spa_t *spa)
 {
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 
 	/*
 	 * Walk the logs from the last one in the tree backwards, averaging
 	 * the block counts of at most zfs_max_log_walking of them.
 	 */
 	uint64_t walked = 0;
 	uint64_t total_blocks = 0;
 	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
 
 	while (sls != NULL && walked < zfs_max_log_walking) {
 		/*
 		 * The log created in this TXG is left out as it would
 		 * make the estimate inaccurate.
 		 */
 		if (sls->sls_txg != spa_syncing_txg(spa)) {
 			total_blocks += sls->sls_nblocks;
 			walked++;
 		}
 		sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls);
 	}
 
 	if (walked == 0)
 		return (0);
 	return (DIV_ROUND_UP(total_blocks, walked));
 }
 
 /*
  * Return the current log block limit, i.e. the sus_blocklimit field of the
  * pool's spa_unflushed_stats (see spa_log_sm_set_blocklimit()).
  */
 uint64_t
 spa_log_sm_blocklimit(spa_t *spa)
 {
 	return (spa->spa_unflushed_stats.sus_blocklimit);
 }
 
 void
 spa_log_sm_set_blocklimit(spa_t *spa)
 {
 	/* Without an active feature the limit is expected to be zero. */
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 		ASSERT0(spa_log_sm_blocklimit(spa));
 		return;
 	}
 
 	/* Total the lse_msdcount values recorded in the log summary. */
 	uint64_t dirty_ms = 0;
 	log_summary_entry_t *row = list_head(&spa->spa_log_summary);
 	while (row) {
 		dirty_ms += row->lse_msdcount;
 		row = list_next(&spa->spa_log_summary, row);
 	}
 
 	/*
 	 * Scale by the percentage tunable and clamp the result: raise it to
 	 * the minimum first, then cap it at the maximum.
 	 */
 	uint64_t blocks = dirty_ms * zfs_unflushed_log_block_pct / 100;
 	if (blocks < zfs_unflushed_log_block_min)
 		blocks = zfs_unflushed_log_block_min;
 	if (blocks > zfs_unflushed_log_block_max)
 		blocks = zfs_unflushed_log_block_max;
 
 	spa->spa_unflushed_stats.sus_blocklimit = blocks;
 }
 
 /*
  * Return the sus_nblocks field of the pool's spa_unflushed_stats.
  * spa_log_summary_verify_counts() checks it against the summed sls_nblocks
  * of all the in-memory log space maps.
  */
 uint64_t
 spa_log_sm_nblocks(spa_t *spa)
 {
 	return (spa->spa_unflushed_stats.sus_nblocks);
 }
 
 /*
  * Ensure that the in-memory log space map structures and the summary
  * have the same block and metaslab counts.
  */
 static void
 spa_log_summary_verify_counts(spa_t *spa)
 {
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	/* The cross-check only runs when the debug flag is set. */
 	if (!(zfs_flags & ZFS_DEBUG_LOG_SPACEMAP))
 		return;
 
 	uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
 
 	/* Totals according to the log summary. */
 	uint64_t ms_in_summary = 0;
 	uint64_t blk_in_summary = 0;
 	log_summary_entry_t *row = list_head(&spa->spa_log_summary);
 	while (row) {
 		ms_in_summary += row->lse_mscount;
 		blk_in_summary += row->lse_blkcount;
 		row = list_next(&spa->spa_log_summary, row);
 	}
 
 	/* Totals according to the in-memory log space maps. */
 	uint64_t ms_in_logs = 0;
 	uint64_t blk_in_logs = 0;
 	spa_log_sm_t *log = avl_first(&spa->spa_sm_logs_by_txg);
 	while (log) {
 		ms_in_logs += log->sls_mscount;
 		blk_in_logs += log->sls_nblocks;
 		log = AVL_NEXT(&spa->spa_sm_logs_by_txg, log);
 	}
 
 	VERIFY3U(ms_in_logs, ==, ms_in_summary);
 	VERIFY3U(ms_in_logs, ==, ms_in_avl);
 	VERIFY3U(blk_in_logs, ==, blk_in_summary);
 	VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
 }
 
 static boolean_t
 summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg)
 {
 	/* An entry that already ends at this txg is never reported full. */
 	if (e->lse_end == txg)
 		return (0);
 
 	/* Full once it covers its share of the maximum number of TXGs. */
 	uint64_t txgs_per_row = DIV_ROUND_UP(zfs_unflushed_log_txg_max,
 	    zfs_max_logsm_summary_length);
 	if (e->lse_txgcount >= txgs_per_row)
 		return (1);
 
 	/*
 	 * Otherwise it is full once it holds its share (at least one
 	 * block) of the block limit.
 	 */
 	uint64_t blocks_per_row = DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
 	    zfs_max_logsm_summary_length);
 	if (blocks_per_row < 1)
 		blocks_per_row = 1;
 
 	return (e->lse_blkcount >= blocks_per_row);
 }
 
 /*
  * Update the log summary information to reflect the fact that a metaslab
  * was flushed or destroyed (e.g due to device removal or pool export/destroy).
  *
  * We typically flush the oldest flushed metaslab so the first (and oldest)
  * entry of the summary is updated. However if that metaslab is getting loaded
  * we may flush the second oldest one which may be part of an entry later in
  * the summary. Moreover, if we call into this function from metaslab_fini()
  * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
  * for a txg as an argument so we can locate the appropriate summary entry for
  * the metaslab.
  */
 void
 spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty)
 {
 	/*
 	 * Summary data is not tracked for read-only pools, yet this function
 	 * can still be reached from metaslab_fini(); nothing to do then.
 	 */
 	if (!spa_writeable(spa))
 		return;
 
 	/* Pick the last summary entry that starts at or before 'txg'. */
 	log_summary_entry_t *target = NULL;
 	log_summary_entry_t *row = list_head(&spa->spa_log_summary);
 	while (row != NULL && row->lse_start <= txg) {
 		target = row;
 		row = list_next(&spa->spa_log_summary, row);
 	}
 
 	if (target == NULL || target->lse_mscount == 0) {
 		/*
 		 * No summary entry accounts for this metaslab. This must be
 		 * the teardown of a spa_load() attempt that got an error
 		 * while reading the log space maps.
 		 */
 		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
 		return;
 	}
 
 	target->lse_mscount--;
 	if (dirty)
 		target->lse_msdcount--;
 }
 
 /*
  * Update the log summary information to reflect the fact that we destroyed
  * old log space maps. Since we can only destroy the oldest log space maps,
  * we decrement the block count of the oldest summary entry and potentially
  * destroy it when that count hits 0.
  *
  * This function is called after a metaslab is flushed and typically that
  * metaslab is the oldest flushed, which means that this function will
  * typically decrement the block count of the first entry of the summary and
  * potentially free it if the block count gets to zero (its metaslab count
  * should be zero too at that point).
  *
  * There are certain scenarios though that don't work exactly like that so we
  * need to account for them:
  *
  * Scenario [1]: It is possible that after we flushed the oldest flushed
  * metaslab and we destroyed the oldest log space map, more recent logs had 0
  * metaslabs pointing to them so we got rid of them too. This can happen due
  * to metaslabs being destroyed through device removal, or because the oldest
  * flushed metaslab was loading but we kept flushing more recently flushed
  * metaslabs due to the memory pressure of unflushed changes. Because of that,
  * we always iterate from the beginning of the summary and if blocks_gone is
  * bigger than the block_count of the current entry we free that entry (we
  * expect its metaslab count to be zero), we decrement blocks_gone and on to
  * the next entry repeating this procedure until blocks_gone gets decremented
  * to 0. Doing this also works for the typical case mentioned above.
  *
  * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
  * the first (and oldest) entry in the summary. If the first few entries of
  * the summary were only accounting metaslabs from a device that was just
  * removed, then the current oldest flushed metaslab could be accounted by an
  * entry somewhere in the middle of the summary. Moreover flushing that
  * metaslab will destroy all the log space maps older than its ms_unflushed_txg
  * because they became obsolete after the removal. Thus, iterating as we did
  * for scenario [1] works out for this case too.
  *
  * Scenario [3]: At times we decide to flush all the metaslabs in the pool
  * in one TXG (either because we are exporting the pool or because our flushing
  * heuristics decided to do so). When that happens all the log space maps get
  * destroyed except the one created for the current TXG which doesn't have
  * any log blocks yet. As log space maps get destroyed with every metaslab that
  * we flush, entries in the summary are also destroyed. This brings a weird
  * corner-case when we flush the last metaslab and the log space map of the
  * current TXG is in the same summary entry with other log space maps that
  * are older. When that happens we are eventually left with this one last
  * summary entry whose blocks are gone (blocks_gone equals the entry's block
  * count) but its metaslab count is non-zero (because it accounts all the
  * metaslabs in the pool as they all got flushed). Under this scenario we can't
  * free this last summary entry as it's referencing all the metaslabs in the
  * pool and its block count will get incremented at the end of this sync (when
  * we close the syncing log space map). Thus we just decrement its current
  * block count and leave it alone. In the case that the pool gets exported,
  * its metaslab count will be decremented over time as we call metaslab_fini()
  * for all the metaslabs in the pool and the entry will be freed at
  * spa_unload_log_sm_metadata().
  */
 void
 spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
 {
 	/* Start at the head of the summary, which must not be empty. */
 	log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	ASSERT3P(e, !=, NULL);
 	/*
 	 * Each call removes one TXG from the head entry's TXG count,
 	 * without letting the counter drop below zero.
 	 */
 	if (e->lse_txgcount > 0)
 		e->lse_txgcount--;
 	/*
 	 * Consume blocks_gone starting from the head. The head is re-read
 	 * on every iteration because the previous head may have just been
 	 * removed from the list and freed.
 	 */
 	for (; e != NULL; e = list_head(&spa->spa_log_summary)) {
 		if (e->lse_blkcount > blocks_gone) {
 			/* This entry absorbs everything that is left. */
 			e->lse_blkcount -= blocks_gone;
 			blocks_gone = 0;
 			break;
 		} else if (e->lse_mscount == 0) {
 			/* remove obsolete entry */
 			blocks_gone -= e->lse_blkcount;
 			list_remove(&spa->spa_log_summary, e);
 			kmem_free(e, sizeof (log_summary_entry_t));
 		} else {
 			/* Verify that this is scenario [3] mentioned above. */
 			VERIFY3U(blocks_gone, ==, e->lse_blkcount);
 
 			/*
 			 * Assert that this is scenario [3] further by ensuring
 			 * that this is the only entry in the summary.
 			 */
 			VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
 			ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
 
 			/* Keep the entry; only zero out its block count. */
 			blocks_gone = e->lse_blkcount = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Ensure that there is no way we are trying to remove more blocks
 	 * than the # of blocks in the summary.
 	 */
 	ASSERT0(blocks_gone);
 }
 
 /*
  * Drop one metaslab reference from the log space map node recorded for
  * the given txg in spa_sm_logs_by_txg.
  */
 void
 spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
 {
 	spa_log_sm_t key = { .sls_txg = txg };
 	spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, &key, NULL);
 
 	if (sls != NULL) {
 		ASSERT(sls->sls_mscount > 0);
 		sls->sls_mscount--;
 		return;
 	}
 
 	/*
 	 * A missing node is only acceptable during the teardown of a
 	 * spa_load() attempt that failed while reading the log space maps.
 	 */
 	VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
 }
 
 /*
  * Add one metaslab reference to the last node of spa_sm_logs_by_txg,
  * which is asserted to belong to the currently syncing TXG.
  */
 void
 spa_log_sm_increment_current_mscount(spa_t *spa)
 {
 	spa_log_sm_t *last_sls;
 
 	last_sls = avl_last(&spa->spa_sm_logs_by_txg);
 	ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
 	last_sls->sls_mscount += 1;
 }
 
 /*
  * Account flushed metaslabs, dirty metaslabs and log blocks of the given
  * txg in the tail entry of the log summary. A fresh tail entry is
  * appended first when the summary is empty or the current tail is
  * reported full by summary_entry_is_full().
  */
 static void
 summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
     uint64_t metaslabs_dirty, uint64_t nblocks)
 {
 	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
 	boolean_t need_new_entry =
 	    (e == NULL) || summary_entry_is_full(spa, e, txg);
 
 	if (need_new_entry) {
 		e = kmem_zalloc(sizeof (*e), KM_SLEEP);
 		e->lse_start = txg;
 		e->lse_end = txg;
 		e->lse_txgcount = 1;
 		list_insert_tail(&spa->spa_log_summary, e);
 	}
 
 	ASSERT3U(e->lse_start, <=, txg);
 
 	/* Extend the entry when it sees a TXG newer than its current end. */
 	if (txg > e->lse_end) {
 		e->lse_end = txg;
 		e->lse_txgcount += 1;
 	}
 
 	e->lse_blkcount += nblocks;
 	e->lse_msdcount += metaslabs_dirty;
 	e->lse_mscount += metaslabs_flushed;
 }
 
 /*
  * Charge nblocks log blocks to the summary for the syncing TXG without
  * touching any metaslab counters.
  */
 static void
 spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
 {
 	uint64_t syncing_txg = spa_syncing_txg(spa);
 
 	summary_add_data(spa, syncing_txg, 0, 0, nblocks);
 }
 
 /*
  * Record one flushed metaslab in the summary for the syncing TXG; the
  * metaslab is additionally counted as dirty when the caller says so.
  */
 void
 spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty)
 {
 	uint64_t dirty_metaslabs = 0;
 
 	if (dirty)
 		dirty_metaslabs = 1;
 
 	summary_add_data(spa, spa_syncing_txg(spa), 1, dirty_metaslabs, 0);
 }
 
 /*
  * Bump the dirty-metaslab count of the summary entry that accounts for
  * the given txg, i.e. the last entry whose lse_start is not newer than
  * txg.
  */
 void
 spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg)
 {
 	log_summary_entry_t *target = NULL;
 	log_summary_entry_t *cursor = list_head(&spa->spa_log_summary);
 
 	while (cursor != NULL && cursor->lse_start <= txg) {
 		target = cursor;
 		cursor = list_next(&spa->spa_log_summary, cursor);
 	}
 
 	ASSERT3P(target, !=, NULL);
 	ASSERT3U(target->lse_mscount, !=, 0);
 	target->lse_msdcount += 1;
 }
 
 /*
  * This function attempts to estimate how many metaslabs we
  * should flush to satisfy our block heuristic for the log spacemap
  * for the upcoming TXGs.
  *
  * Specifically, it first tries to estimate the number of incoming
  * blocks in this TXG. Then by projecting that incoming rate to
  * future TXGs and using the log summary, it figures out how many
  * flushes we would need to do for future TXGs individually to
  * stay below our block limit and returns the maximum number of
  * flushes from those estimates.
  */
 static uint64_t
 spa_estimate_metaslabs_to_flush(spa_t *spa)
 {
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_log_sm_blocklimit(spa) != 0);
 
 	/*
 	 * This variable contains the incoming rate that will be projected
 	 * and used for our flushing estimates in the future.
 	 */
 	uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
 
 	/*
 	 * At any point in time this variable tells us how many
 	 * TXGs in the future we are so we can make our estimations.
 	 */
 	uint64_t txgs_in_future = 1;
 
 	/*
 	 * This variable tells us how much room we have until we hit
 	 * our limit. When it goes negative, it means that we've exceeded
 	 * our limit and we need to flush.
 	 *
 	 * Note that since we start at the first TXG in the future (i.e.
 	 * txgs_in_future starts from 1) we already decrement this
 	 * variable by the incoming rate.
 	 */
 	int64_t available_blocks =
 	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
 
 	/*
 	 * TXG budget: zfs_unflushed_log_txg_max minus the TXG counts of
 	 * all entries currently in the summary.
 	 */
 	int64_t available_txgs = zfs_unflushed_log_txg_max;
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e))
 		available_txgs -= e->lse_txgcount;
 
 	/*
 	 * This variable tells us the total number of flushes needed to
 	 * keep the log size within the limit when we reach txgs_in_future.
 	 */
 	uint64_t total_flushes = 0;
 
 	/* Holds the current maximum of our estimates so far. */
 	uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush;
 
 	/*
 	 * For our estimations we only look as far in the future
 	 * as the summary allows us.
 	 */
 	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
 	    e; e = list_next(&spa->spa_log_summary, e)) {
 
 		/*
 		 * If there is still room before we exceed our limit
 		 * then keep skipping TXGs accumulating more blocks
 		 * based on the incoming rate until we exceed it.
 		 *
 		 * With a zero incoming rate only the TXG budget bounds
 		 * the skip, which also avoids dividing by zero.
 		 */
 		if (available_blocks >= 0 && available_txgs >= 0) {
 			uint64_t skip_txgs = (incoming == 0) ?
 			    available_txgs + 1 : MIN(available_txgs + 1,
 			    (available_blocks / incoming) + 1);
 			available_blocks -= (skip_txgs * incoming);
 			available_txgs -= skip_txgs;
 			txgs_in_future += skip_txgs;
 			ASSERT3S(available_blocks, >=, -incoming);
 			ASSERT3S(available_txgs, >=, -1);
 		}
 
 		/*
 		 * At this point we're far enough into the future where
 		 * the limit was just exceeded and we flush metaslabs
 		 * based on the current entry in the summary, updating
 		 * our available_blocks.
 		 */
 		ASSERT(available_blocks < 0 || available_txgs < 0);
 		available_blocks += e->lse_blkcount;
 		available_txgs += e->lse_txgcount;
 		/* Only the entry's dirty metaslabs count as needed flushes. */
 		total_flushes += e->lse_msdcount;
 
 		/*
 		 * Keep the running maximum of the total_flushes that
 		 * we've done so far over the number of TXGs in the
 		 * future that we are. The idea here is to estimate
 		 * the average number of flushes that we should do
 		 * every TXG so that when we are that many TXGs in the
 		 * future we stay under the limit.
 		 */
 		max_flushes_pertxg = MAX(max_flushes_pertxg,
 		    DIV_ROUND_UP(total_flushes, txgs_in_future));
 	}
 	return (max_flushes_pertxg);
 }
 
 /*
  * Report the memory figure kept in the pool's unflushed statistics
  * (sus_memused).
  */
 uint64_t
 spa_log_sm_memused(spa_t *spa)
 {
 	uint64_t memused = spa->spa_unflushed_stats.sus_memused;
 
 	return (memused);
 }
 
 /*
  * Tell whether the memory used by unflushed changes is above either the
  * absolute limit (zfs_unflushed_max_mem_amt) or the limit derived from
  * system memory (zfs_unflushed_max_mem_ppm parts per million of it).
  */
 static boolean_t
 spa_log_exceeds_memlimit(spa_t *spa)
 {
 	uint64_t memused = spa_log_sm_memused(spa);
 	uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
 	    zfs_unflushed_max_mem_ppm) / 1000000;
 
 	if (memused > zfs_unflushed_max_mem_amt ||
 	    memused > system_mem_allowed)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * A "flush all" request is pending whenever spa_log_flushall_txg is
  * non-zero.
  */
 boolean_t
 spa_flush_all_logs_requested(spa_t *spa)
 {
 	if (spa->spa_log_flushall_txg == 0)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Flush metaslabs with unflushed changes for the TXG of tx. Work is only
  * done in the first sync pass and when the log spacemap feature is
  * active. The number of metaslabs flushed is driven by the block
  * heuristic (or is unbounded when a "flush all" was requested) and by
  * the memory limit on unflushed changes.
  */
 void
 spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	/*
 	 * If we don't have any metaslabs with unflushed changes
 	 * return immediately.
 	 */
 	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
 		return;
 
 	/*
 	 * During SPA export we leave a few empty TXGs to go by [see
 	 * spa_final_dirty_txg() to understand why]. For this specific
 	 * case, it is important to not flush any metaslabs as that
 	 * would dirty this TXG.
 	 *
 	 * That said, during one of these dirty TXGs that is less or
 	 * equal to spa_final_dirty_txg(), spa_unload() will request that
 	 * we try to flush all the metaslabs for that TXG before
 	 * exporting the pool, thus we ensure that we didn't get a
 	 * request of flushing everything before we attempt to return
 	 * immediately.
 	 */
 	if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
 	    !spa_flush_all_logs_requested(spa))
 		return;
 
 	/*
 	 * We need to generate a log space map before flushing because this
 	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
 	 * for this TXG's flushed metaslab count (aka sls_mscount which is
 	 * manipulated in many ways down the metaslab_flush() codepath).
 	 *
 	 * That is not to say that we may generate a log space map when we
 	 * don't need it. If we are flushing metaslabs, that means that we
 	 * were going to write changes to disk anyway, so even if we were
 	 * not flushing, a log space map would have been created anyway in
 	 * metaslab_sync().
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	/*
 	 * This variable tells us how many metaslabs we want to flush based
 	 * on the block-heuristic of our flushing algorithm (see block comment
 	 * of log space map feature). We also decrement this as we flush
 	 * metaslabs and attempt to destroy old log space maps.
 	 */
 	uint64_t want_to_flush;
 	if (spa_flush_all_logs_requested(spa)) {
 		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
 		want_to_flush = UINT64_MAX;
 	} else {
 		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
 	}
 
 	/* Used purely for verification purposes */
 	uint64_t visited = 0;
 
 	/*
 	 * Ideally we would only iterate through spa_metaslabs_by_flushed
 	 * using only one variable (curr). We can't do that because
 	 * metaslab_flush() mutates position of curr in the AVL when
 	 * it flushes that metaslab by moving it to the end of the tree.
 	 * Thus we always keep track of the original next node of the
 	 * current node (curr) in another variable (next).
 	 */
 	metaslab_t *next = NULL;
 	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
 	    curr != NULL; curr = next) {
 		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
 
 		/*
 		 * If this metaslab has been flushed this txg then we've done
 		 * a full circle over the metaslabs.
 		 */
 		if (metaslab_unflushed_txg(curr) == txg)
 			break;
 
 		/*
 		 * If we are done flushing for the block heuristic and the
 		 * unflushed changes don't exceed the memory limit just stop.
 		 */
 		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
 			break;
 
 		/*
 		 * Only dirty metaslabs are flushed and counted against
 		 * want_to_flush; ms_sync_lock is taken before ms_lock.
 		 * For the others metaslab_unflushed_bump() is called
 		 * instead of a flush.
 		 */
 		if (metaslab_unflushed_dirty(curr)) {
 			mutex_enter(&curr->ms_sync_lock);
 			mutex_enter(&curr->ms_lock);
 			metaslab_flush(curr, tx);
 			mutex_exit(&curr->ms_lock);
 			mutex_exit(&curr->ms_sync_lock);
 			if (want_to_flush > 0)
 				want_to_flush--;
 		} else
 			metaslab_unflushed_bump(curr, tx, B_FALSE);
 
 		visited++;
 	}
 	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
 
 	/* Refresh the block limit once this round of flushing is over. */
 	spa_log_sm_set_blocklimit(spa);
 }
 
 /*
  * Close the log space map for this TXG and update the block counts
  * for the log's in-memory structure and the summary.
  */
 void
 spa_sync_close_syncing_log_sm(spa_t *spa)
 {
 	/* Without an open syncing log space map there is nothing to do. */
 	if (spa_syncing_log_sm(spa) == NULL)
 		return;
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	/* The node for the syncing TXG is the last one in the tree. */
 	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
 	ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
 
 	/* Record the final size of this log and add it to the pool total. */
 	sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
 	spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
 
 	/*
 	 * sls_mscount is deliberately not asserted to be non-zero here:
 	 * when the first metaslab in spa_metaslabs_by_flushed is loading
 	 * we may not have been able to flush any metaslab this TXG.
 	 */
 	ASSERT(sls->sls_nblocks != 0);
 
 	spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
 	spa_log_summary_verify_counts(spa);
 
 	space_map_close(spa->spa_syncing_log_sm);
 	spa->spa_syncing_log_sm = NULL;
 
 	if (!spa_flush_all_logs_requested(spa))
 		return;
 
 	/*
 	 * The pool is being exported and we have flushed as many metaslabs
 	 * as we could. Clear the "flush all" request so that the last few
 	 * TXGs before the pool closes can stay empty (i.e. not dirty).
 	 */
 	ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
 	spa->spa_log_flushall_txg = 0;
 }
 
 /*
  * Destroy every log space map whose TXG is older than the unflushed TXG
  * of the first metaslab in spa_metaslabs_by_flushed, updating the on-disk
  * ZAP, the summary and the pool-wide block count along the way.
  */
 void
 spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa_meta_objset(spa);
 	uint64_t spacemap_zap;
 
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		/* No ZAP of log space maps, hence no logs to clean up. */
 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 		return;
 	}
 	VERIFY0(error);
 
 	uint64_t oldest_flushed_txg =
 	    metaslab_unflushed_txg(avl_first(&spa->spa_metaslabs_by_flushed));
 
 	/*
 	 * Keep removing the first (lowest TXG) log until the tree is empty
 	 * or its first log is not older than oldest_flushed_txg.
 	 */
 	spa_log_sm_t *sls;
 	while ((sls = avl_first(&spa->spa_sm_logs_by_txg)) != NULL &&
 	    sls->sls_txg < oldest_flushed_txg) {
 		ASSERT0(sls->sls_mscount);
 		avl_remove(&spa->spa_sm_logs_by_txg, sls);
 		space_map_free_obj(mos, sls->sls_sm_obj, tx);
 		VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
 		spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks);
 		spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
 		kmem_free(sls, sizeof (spa_log_sm_t));
 	}
 }
 
 static spa_log_sm_t *
 spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
 {
 	spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
 	sls->sls_sm_obj = sm_obj;
 	sls->sls_txg = txg;
 	return (sls);
 }
 
 /*
  * Create and open the log space map for the TXG of tx, unless one is
  * already open or the log spacemap feature is not enabled. The ZAP that
  * indexes log space maps by TXG is created on first use.
  */
 void
 spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 	objset_t *mos = spa_meta_objset(spa);
 
 	if (spa_syncing_log_sm(spa) != NULL ||
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
 		return;
 
 	uint64_t spacemap_zap;
 	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		/*
 		 * First log space map ever for this pool: create the ZAP,
 		 * link it from the pool directory and bump the feature
 		 * refcount.
 		 */
 		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 
 		spacemap_zap = zap_create(mos,
 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
 		    &spacemap_zap, tx));
 		spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
 	} else {
 		VERIFY0(error);
 	}
 
 	/* There must be no log space map registered for this TXG yet. */
 	uint64_t sm_obj;
 	ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
 	    ==, ENOENT);
 
 	sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
 	VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
 
 	spa_log_sm_t *new_log = spa_log_sm_alloc(sm_obj, txg);
 	avl_add(&spa->spa_sm_logs_by_txg, new_log);
 
 	/*
 	 * UINT64_MAX as the representation size together with
 	 * SPA_MINBLOCKSHIFT as the shift lets the space map accept any
 	 * sort of segment. Being more restrictive would bring no real
 	 * advantage given that 2-word entries are going to be used anyway.
 	 */
 	VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
 	    0, UINT64_MAX, SPA_MINBLOCKSHIFT));
 
 	spa_log_sm_set_blocklimit(spa);
 }
 
 /*
  * Find all the log space maps stored in the space map ZAP and sort
  * them by their TXG in spa_sm_logs_by_txg.
  */
 static int
 spa_ld_log_sm_metadata(spa_t *spa)
 {
 	uint64_t spacemap_zap;
 	int error;
 
 	ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
 
 	error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
 	if (error == ENOENT) {
 		/* the space map ZAP doesn't exist yet */
 		return (0);
 	}
 	if (error != 0) {
 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
 		    "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
 		    error);
 		return (error);
 	}
 
 	/*
 	 * Each ZAP entry maps a TXG (the entry name) to the object number
 	 * of that TXG's log space map (the entry's first integer).
 	 */
 	zap_cursor_t zc;
 	zap_attribute_t *za = zap_attribute_alloc();
 	zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
 	while ((error = zap_cursor_retrieve(&zc, za)) == 0) {
 		uint64_t log_txg = zfs_strtonum(za->za_name, NULL);
 
 		avl_add(&spa->spa_sm_logs_by_txg,
 		    spa_log_sm_alloc(za->za_first_integer, log_txg));
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 	zap_attribute_free(za);
 
 	/* The cursor is expected to stop with ENOENT; anything else fails. */
 	if (error != ENOENT) {
 		spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
 		    "zap_cursor_retrieve(spacemap_zap) [error %d]",
 		    error);
 		return (error);
 	}
 
 	/* Count, per log, the metaslabs whose unflushed TXG points at it. */
 	metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
 	while (m != NULL) {
 		spa_log_sm_t key = { .sls_txg = metaslab_unflushed_txg(m) };
 		spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
 		    &key, NULL);
 
 		/*
 		 * A NULL sls means that a bug occurred in ZFS the last time
 		 * the pool was open or earlier in the import code path.
 		 * Normally that would call for a VERIFY(), or for letting
 		 * the kernel panic on the NULL dereference below, but on the
 		 * import path we can be more lenient: DEBUG bits panic,
 		 * production logs the error and fails the import.
 		 */
 		ASSERT(sls != NULL);
 		if (sls == NULL) {
 			spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
 			    "encountered: could not find log spacemap for "
 			    "TXG %llu [error %d]",
 			    (u_longlong_t)metaslab_unflushed_txg(m), ENOENT);
 			return (ENOENT);
 		}
 		sls->sls_mscount++;
 
 		m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m);
 	}
 
 	return (0);
 }
 
 /*
  * Argument handed to spa_ld_log_sm_cb() through space_map_iterate() while
  * one log space map is being read.
  */
 typedef struct spa_ld_log_sm_arg {
 	spa_t *slls_spa;	/* pool whose log space maps are being read */
 	uint64_t slls_txg;	/* TXG of the log space map being iterated */
 } spa_ld_log_sm_arg_t;
 
 /*
  * space_map_iterate() callback: apply one log space map entry to the
  * unflushed alloc/free range trees of the metaslab it belongs to.
  * Always returns 0.
  */
 static int
 spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
 {
 	spa_ld_log_sm_arg_t *slls = arg;
 	spa_t *spa = slls->slls_spa;
 
 	uint32_t vdev_id = sme->sme_vdev;
 	uint64_t seg_start = sme->sme_offset;
 	uint64_t seg_end = seg_start + sme->sme_run;
 
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 
 	/*
 	 * Entries of a removed vdev (i.e. one that is indirect or a hole)
 	 * are skipped: the contents of that vdev have already moved
 	 * elsewhere.
 	 */
 	if (!vdev_is_concrete(vd))
 		return (0);
 
 	metaslab_t *ms = vd->vdev_ms[seg_start >> vd->vdev_ms_shift];
 	ASSERT(!ms->ms_loaded);
 
 	/*
 	 * Entries from a TXG older than the metaslab's unflushed TXG have
 	 * already been flushed to the metaslab's own space map, so they are
 	 * ignored. Flushing happens before any allocations/frees of a TXG
 	 * are processed, hence the metaslab's space map only holds entries
 	 * from *before* the unflushed TXG.
 	 */
 	if (slls->slls_txg < metaslab_unflushed_txg(ms))
 		return (0);
 
 	if (sme->sme_type == SM_ALLOC) {
 		range_tree_remove_xor_add_segment(seg_start, seg_end,
 		    ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
 	} else if (sme->sme_type == SM_FREE) {
 		range_tree_remove_xor_add_segment(seg_start, seg_end,
 		    ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
 	} else {
 		panic("invalid maptype_t");
 	}
 
 	/* The first applied entry marks the metaslab dirty in the summary. */
 	if (metaslab_unflushed_dirty(ms))
 		return (0);
 
 	metaslab_set_unflushed_dirty(ms, B_TRUE);
 	spa_log_summary_dirty_flushed_metaslab(spa,
 	    metaslab_unflushed_txg(ms));
 	return (0);
 }
 
 /*
  * Read every log space map listed in spa_sm_logs_by_txg, in tree order,
  * into the unflushed range trees of the metaslabs and build the log
  * summary. Log space maps are opened and prefetched ahead of the one
  * being read. Afterwards, on success and on error alike, per-metaslab
  * space accounting, weights and the memory usage of unflushed changes
  * are refreshed.
  *
  * Returns 0 on success, or immediately when the pool is not writeable;
  * otherwise the error from space_map_open() or space_map_iterate().
  */
 static int
 spa_ld_log_sm_data(spa_t *spa)
 {
 	spa_log_sm_t *sls, *psls;
 	int error = 0;
 
 	/*
 	 * If we are not going to do any writes there is no need
 	 * to read the log space maps.
 	 */
 	if (!spa_writeable(spa))
 		return (0);
 
 	ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
 	ASSERT0(spa->spa_unflushed_stats.sus_memused);
 
 	hrtime_t read_logs_starttime = gethrtime();
 
 	/* Prefetch log spacemaps dnodes. */
 	for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
 	    sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 		dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
 	 * sls is the log being read and psls the next log to open and
 	 * prefetch. pn and ps track how many logs, and how many bytes of
 	 * them, are currently opened ahead of sls; nsm counts the logs
 	 * that were fully read.
 	 */
 	uint_t pn = 0;
 	uint64_t ps = 0;
 	uint64_t nsm = 0;
 	psls = sls = avl_first(&spa->spa_sm_logs_by_txg);
 	while (sls != NULL) {
 		/* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */
 		if (psls != NULL && pn < 16 &&
 		    (pn < 2 || ps < 2 * dmu_prefetch_max)) {
 			error = space_map_open(&psls->sls_sm,
 			    spa_meta_objset(spa), psls->sls_sm_obj, 0,
 			    UINT64_MAX, SPA_MINBLOCKSHIFT);
 			if (error != 0) {
 				/*
 				 * Report the object that actually failed to
 				 * open: the prefetch cursor (psls) may be
 				 * ahead of the log being read (sls).
 				 */
 				spa_load_failed(spa, "spa_ld_log_sm_data(): "
 				    "failed at space_map_open(obj=%llu) "
 				    "[error %d]",
 				    (u_longlong_t)psls->sls_sm_obj, error);
 				goto out;
 			}
 			dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj,
 			    0, 0, space_map_length(psls->sls_sm),
 			    ZIO_PRIORITY_ASYNC_READ);
 			pn++;
 			ps += space_map_length(psls->sls_sm);
 			psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls);
 			continue;
 		}
 
 		/* Load TXG log spacemap into ms_unflushed_allocs/frees. */
 		kpreempt(KPREEMPT_SYNC);
 		ASSERT0(sls->sls_nblocks);
 		sls->sls_nblocks = space_map_nblocks(sls->sls_sm);
 		spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
 		summary_add_data(spa, sls->sls_txg,
 		    sls->sls_mscount, 0, sls->sls_nblocks);
 
 		spa_import_progress_set_notes_nolog(spa,
 		    "Read %llu of %lu log space maps", (u_longlong_t)nsm,
 		    avl_numnodes(&spa->spa_sm_logs_by_txg));
 
 		struct spa_ld_log_sm_arg vla = {
 			.slls_spa = spa,
 			.slls_txg = sls->sls_txg
 		};
 		error = space_map_iterate(sls->sls_sm,
 		    space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
 			    "at space_map_iterate(obj=%llu) [error %d]",
 			    (u_longlong_t)sls->sls_sm_obj, error);
 			goto out;
 		}
 
 		/* This log is done: drop it from the prefetch window. */
 		pn--;
 		ps -= space_map_length(sls->sls_sm);
 		nsm++;
 		space_map_close(sls->sls_sm);
 		sls->sls_sm = NULL;
 		sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls);
 
 		/* Update log block limits considering just loaded. */
 		spa_log_sm_set_blocklimit(spa);
 	}
 
 	hrtime_t read_logs_endtime = gethrtime();
 	spa_load_note(spa,
 	    "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) "
 	    "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg),
 	    (u_longlong_t)spa_log_sm_nblocks(spa),
 	    (u_longlong_t)zfs_log_sm_blksz,
 	    (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime));
 
 out:
 	if (error != 0) {
 		/*
 		 * Close whatever log space maps are still open. The
 		 * function-scope cursor is reused here rather than
 		 * declaring a second, shadowing sls.
 		 */
 		for (sls = avl_first(&spa->spa_sm_logs_by_txg);
 		    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
 			if (sls->sls_sm) {
 				space_map_close(sls->sls_sm);
 				sls->sls_sm = NULL;
 			}
 		}
 	} else {
 		ASSERT0(pn);
 		ASSERT0(ps);
 	}
 	/*
 	 * Now that the metaslabs contain their unflushed changes:
 	 * [1] recalculate their actual allocated space
 	 * [2] recalculate their weights
 	 * [3] sum up the memory usage of their unflushed range trees
 	 * [4] optionally load them, if debug_load is set
 	 *
 	 * Note that even in the case where we get here because of an
 	 * error (e.g. error != 0), we still want to update the fields
 	 * below in order to have a proper teardown in spa_unload().
 	 */
 	for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
 	    m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
 		mutex_enter(&m->ms_lock);
 		m->ms_allocated_space = space_map_allocated(m->ms_sm) +
 		    range_tree_space(m->ms_unflushed_allocs) -
 		    range_tree_space(m->ms_unflushed_frees);
 
 		vdev_t *vd = m->ms_group->mg_vd;
 		metaslab_space_update(vd, m->ms_group->mg_class,
 		    range_tree_space(m->ms_unflushed_allocs), 0, 0);
 		metaslab_space_update(vd, m->ms_group->mg_class,
 		    -range_tree_space(m->ms_unflushed_frees), 0, 0);
 
 		ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
 		metaslab_recalculate_weight_and_sort(m);
 
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(m);
 
 		if (metaslab_debug_load && m->ms_sm != NULL) {
 			VERIFY0(metaslab_load(m));
 			metaslab_set_selected_txg(m, 0);
 		}
 		mutex_exit(&m->ms_lock);
 	}
 
 	return (error);
 }
 
 /*
  * Load the unflushed TXG of every metaslab of top-level vdev vd from the
  * object referenced by VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, and add each
  * metaslab with a non-zero unflushed TXG to spa_metaslabs_by_flushed.
  * Returns 0 on success, or when the vdev has no such data, otherwise the
  * error from zap_lookup() or dmu_read().
  */
 static int
 spa_ld_unflushed_txgs(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 
 	if (vd->vdev_top_zap == 0)
 		return (0);
 
 	uint64_t object = 0;
 	int error = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
 	    sizeof (uint64_t), 1, &object);
 	if (error == ENOENT)
 		return (0);
 	if (error != 0) {
 		spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
 		    "zap_lookup(vdev_top_zap=%llu) [error %d]",
 		    (u_longlong_t)vd->vdev_top_zap, error);
 		return (error);
 	}
 
 	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *ms = vd->vdev_ms[m];
 		ASSERT(ms != NULL);
 
 		/* Entries are laid out back to back, indexed by ms_id. */
 		metaslab_unflushed_phys_t entry;
 		uint64_t len = sizeof (entry);
 		uint64_t off = ms->ms_id * len;
 
 		error = dmu_read(mos, object, off, len, &entry, 0);
 		if (error != 0) {
 			spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
 			    "failed at dmu_read(obj=%llu) [error %d]",
 			    (u_longlong_t)object, error);
 			return (error);
 		}
 
 		ms->ms_unflushed_dirty = B_FALSE;
 		ms->ms_unflushed_txg = entry.msp_unflushed_txg;
 		ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs));
 		ASSERT(range_tree_is_empty(ms->ms_unflushed_frees));
 
 		/* A zero unflushed TXG keeps the metaslab out of the tree. */
 		if (ms->ms_unflushed_txg == 0)
 			continue;
 
 		mutex_enter(&spa->spa_flushed_ms_lock);
 		avl_add(&spa->spa_metaslabs_by_flushed, ms);
 		mutex_exit(&spa->spa_flushed_ms_lock);
 	}
 
 	return (0);
 }
 
 /*
  * Read all the log space map entries into their respective
  * metaslab unflushed trees and keep them sorted by TXG in the
  * SPA's metadata. In addition, setup all the metadata for the
  * memory and the block heuristics.
  */
 int
 spa_ld_log_spacemaps(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	int error;
 
 	spa_log_sm_set_blocklimit(spa);
 
 	/* Step 1: per-metaslab unflushed TXGs of every top-level vdev. */
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		error = spa_ld_unflushed_txgs(rvd->vdev_child[c]);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Step 2: the set of log space maps, sorted by TXG. */
 	error = spa_ld_log_sm_metadata(spa);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Step 3: the log contents. Nothing is expected to change at this
 	 * point, but the config lock is held so that the assertions made
 	 * when using vdev_lookup_top() do not fail.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	error = spa_ld_log_sm_data(spa);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	return (error);
 }
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, U64, ZMOD_RW,
 	"Specific hard-limit in memory that ZFS allows to be used for "
 	"unflushed changes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, U64, ZMOD_RW,
 	"Percentage of the overall system memory that ZFS allows to be "
 	"used for unflushed changes (value is calculated over 1000000 for "
 	"finer granularity)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, U64, ZMOD_RW,
 	"Hard limit (upper-bound) in the size of the space map log "
 	"in terms of blocks.");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, U64, ZMOD_RW,
 	"Lower-bound limit for the maximum amount of blocks allowed in "
 	"log spacemap (see zfs_unflushed_log_block_max)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, U64, ZMOD_RW,
-    "Hard limit (upper-bound) in the size of the space map log "
-    "in terms of dirty TXGs.");
+	"Hard limit (upper-bound) in the size of the space map log "
+	"in terms of dirty TXGs.");
 
 ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, UINT, ZMOD_RW,
 	"Tunable used to determine the number of blocks that can be used for "
 	"the spacemap log, expressed as a percentage of the total number of "
 	"metaslabs in the pool (e.g. 400 means the number of log blocks is "
 	"capped at 4 times the number of metaslabs)");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, U64, ZMOD_RW,
 	"The number of past TXGs that the flushing algorithm of the log "
 	"spacemap feature uses to estimate incoming log blocks");
 
 ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
 	"Prevent the log spacemaps from being flushed and destroyed "
 	"during pool export/destroy");
-/* END CSTYLED */
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, U64, ZMOD_RW,
 	"Maximum number of rows allowed in the summary of the spacemap log");
 
 ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, U64, ZMOD_RW,
 	"Minimum number of metaslabs to flush per dirty TXG");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index a2b887962270..7fae51cc2c52 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -1,3179 +1,3177 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017 Datto Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_chksum.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_raidz.h>
 #include <sys/metaslab.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/fm/util.h>
 #include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/kstat.h>
 #include "zfs_prop.h"
 #include <sys/btree.h>
 #include <sys/zfeature.h>
 #include <sys/qat.h>
 #include <sys/zstd/zstd.h>
 
 /*
  * SPA locking
  *
  * There are three basic locks for managing spa_t structures:
  *
  * spa_namespace_lock (global mutex)
  *
  *	This lock must be acquired to do any of the following:
  *
  *		- Lookup a spa_t by name
  *		- Add or remove a spa_t from the namespace
  *		- Increase spa_refcount from non-zero
  *		- Check if spa_refcount is zero
  *		- Rename a spa_t
  *		- add/remove/attach/detach devices
  *		- Held for the duration of create/destroy
  *		- Held at the start and end of import and export
  *
  *	It does not need to handle recursion.  A create or destroy may
  *	reference objects (files or zvols) in other pools, but by
  *	definition they must have an existing reference, and will never need
  *	to lookup a spa_t by name.
  *
  * spa_refcount (per-spa zfs_refcount_t protected by mutex)
  *
  *	This reference count keeps track of any active users of the spa_t.  The
  *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
  *	the refcount is never really 'zero' - opening a pool implicitly keeps
  *	some references in the DMU.  Internally we check against spa_minref, but
  *	present the image of a zero/non-zero value to consumers.
  *
  * spa_config_lock[] (per-spa array of rwlocks)
  *
  *	This protects the spa_t from config changes, and must be held in
  *	the following circumstances:
  *
  *		- RW_READER to perform I/O to the spa
  *		- RW_WRITER to change the vdev config
  *
  * The locking order is fairly straightforward:
  *
  *		spa_namespace_lock	->	spa_refcount
  *
  *	The namespace lock must be acquired to increase the refcount from 0
  *	or to check if it is zero.
  *
  *		spa_refcount		->	spa_config_lock[]
  *
  *	There must be at least one valid reference on the spa_t to acquire
  *	the config lock.
  *
  *		spa_namespace_lock	->	spa_config_lock[]
  *
  *	The namespace lock must always be taken before the config lock.
  *
  *
  * The spa_namespace_lock can be acquired directly and is globally visible.
  *
  * The namespace is manipulated using the following functions, all of which
  * require the spa_namespace_lock to be held.
  *
  *	spa_lookup()		Lookup a spa_t by name.
  *
  *	spa_add()		Create a new spa_t in the namespace.
  *
  *	spa_remove()		Remove a spa_t from the namespace.  This also
  *				frees up any memory associated with the spa_t.
  *
  *	spa_next()		Returns the next spa_t in the system, or the
  *				first if NULL is passed.
  *
  *	spa_evict_all()		Shutdown and remove all spa_t structures in
  *				the system.
  *
  *	spa_guid_exists()	Determine whether a pool/device guid exists.
  *
  * The spa_refcount is manipulated using the following functions:
  *
  *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
  *				called with spa_namespace_lock held if the
  *				refcount is currently zero.
  *
  *	spa_close()		Remove a reference from the spa_t.  This will
  *				not free the spa_t or remove it from the
  *				namespace.  No locking is required.
  *
  *	spa_refcount_zero()	Returns true if the refcount is currently
  *				zero.  Must be called with spa_namespace_lock
  *				held.
  *
  * The spa_config_lock[] is an array of rwlocks, ordered as follows:
  * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
  * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
  *
  * To read the configuration, it suffices to hold one of these locks as reader.
  * To modify the configuration, you must hold all locks as writer.  To modify
  * vdev state without altering the vdev tree's topology (e.g. online/offline),
  * you must hold SCL_STATE and SCL_ZIO as writer.
  *
  * We use these distinct config locks to avoid recursive lock entry.
  * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
  * block allocations (SCL_ALLOC), which may require reading space maps
  * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
  *
  * The spa config locks cannot be normal rwlocks because we need the
  * ability to hand off ownership.  For example, SCL_ZIO is acquired
  * by the issuing thread and later released by an interrupt thread.
  * They do, however, obey the usual write-wanted semantics to prevent
  * writer (i.e. system administrator) starvation.
  *
  * The lock acquisition rules are as follows:
  *
  * SCL_CONFIG
  *	Protects changes to the vdev tree topology, such as vdev
  *	add/remove/attach/detach.  Protects the dirty config list
  *	(spa_config_dirty_list) and the set of spares and l2arc devices.
  *
  * SCL_STATE
  *	Protects changes to pool state and vdev state, such as vdev
  *	online/offline/fault/degrade/clear.  Protects the dirty state list
  *	(spa_state_dirty_list) and global pool state (spa_state).
  *
  * SCL_ALLOC
  *	Protects changes to metaslab groups and classes.
  *	Held as reader by metaslab_alloc() and metaslab_claim().
  *
  * SCL_ZIO
  *	Held by bp-level zios (those which have no io_vd upon entry)
  *	to prevent changes to the vdev tree.  The bp-level zio implicitly
  *	protects all of its vdev child zios, which do not hold SCL_ZIO.
  *
  * SCL_FREE
  *	Protects changes to metaslab groups and classes.
  *	Held as reader by metaslab_free().  SCL_FREE is distinct from
  *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
  *	blocks in zio_done() while another i/o that holds either
  *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
  *
  * SCL_VDEV
  *	Held as reader to prevent changes to the vdev tree during trivial
  *	inquiries such as bp_get_dsize().  SCL_VDEV is distinct from the
  *	other locks, and lower than all of them, to ensure that it's safe
  *	to acquire regardless of caller context.
  *
  * In addition, the following rules apply:
  *
  * (a)	spa_props_lock protects pool properties, spa_config and spa_config_list.
  *	The lock ordering is SCL_CONFIG > spa_props_lock.
  *
  * (b)	I/O operations on leaf vdevs.  For any zio operation that takes
  *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
  *	or zio_write_phys() -- the caller must ensure that the config cannot
  *	change in the interim, and that the vdev cannot be reopened.
  *	SCL_STATE as reader suffices for both.
  *
  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
  *
  *	spa_vdev_enter()	Acquire the namespace lock and the config lock
  *				for writing.
  *
  *	spa_vdev_exit()		Release the config lock, wait for all I/O
  *				to complete, sync the updated configs to the
  *				cache, and release the namespace lock.
  *
  * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  */
 
 /*
  * Global pool namespace state.  spa_namespace_avl is the tree that
  * spa_add() inserts into and spa_lookup() searches; both assert that
  * spa_namespace_lock is held.  spa_lookup() sleeps on spa_namespace_cv
  * while another thread is loading or exporting the pool it found.
  */
 avl_tree_t spa_namespace_avl;
 kmutex_t spa_namespace_lock;
 kcondvar_t spa_namespace_cv;
 /* NOTE(review): consumers are not visible in this part of the file. */
 static const int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 /*
  * Spare and l2cache tracking: one AVL tree and one mutex for each.
  * Presumably each lock guards the tree of the same name -- confirm against
  * the spare/l2cache tracking code.
  */
 static kmutex_t spa_spare_lock;
 static avl_tree_t spa_spare_avl;
 static kmutex_t spa_l2cache_lock;
 static avl_tree_t spa_l2cache_avl;
 
 /* Global SPA mode; SPA_MODE_UNINIT until it is assigned elsewhere. */
 spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
 
 #ifdef ZFS_DEBUG
 /*
  * Everything except dprintf, set_error, spa, and indirect_remap is on
  * by default in debug builds.
  */
 int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
     ZFS_DEBUG_INDIRECT_REMAP);
 #else
 int zfs_flags = 0;
 #endif
 
 /*
  * zfs_recover can be set to nonzero to attempt to recover from
  * otherwise-fatal errors, typically caused by on-disk corruption.  When
  * set, calls to zfs_panic_recover() will turn into warning messages.
  * This should only be used as a last resort, as it typically results
  * in leaked space, or worse.
  */
 int zfs_recover = B_FALSE;
 
 /*
  * If destroy encounters an EIO while reading metadata (e.g. indirect
  * blocks), space referenced by the missing metadata can not be freed.
  * Normally this causes the background destroy to become "stalled", as
  * it is unable to make forward progress.  While in this stalled state,
  * all remaining space to free from the error-encountering filesystem is
  * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
  * permanently leak the space from indirect blocks that can not be read,
  * and continue to free everything else that it can.
  *
  * The default, "stalling" behavior is useful if the storage partially
  * fails (i.e. some but not all i/os fail), and then later recovers.  In
  * this case, we will be able to continue pool operations while it is
  * partially failed, and when it recovers, we can continue to free the
  * space, with no leaks.  However, note that this case is actually
  * fairly rare.
  *
  * Typically pools either (a) fail completely (but perhaps temporarily,
  * e.g. a top-level vdev going offline), or (b) have localized,
  * permanent errors (e.g. disk returns the wrong data due to bit flip or
  * firmware bug).  In case (a), this setting does not matter because the
  * pool will be suspended and the sync thread will not be able to make
  * forward progress regardless.  In case (b), because the error is
  * permanent, the best we can do is leak the minimum amount of space,
  * which is what setting this flag will do.  Therefore, it is reasonable
  * for this flag to normally be set, but we chose the more conservative
  * approach of not setting it, so that there is no possibility of
  * leaking space in the "partial temporary" failure case.
  */
 int zfs_free_leak_on_eio = B_FALSE;
 
 /*
  * Expiration time in milliseconds. This value has two meanings. First it is
  * used to determine when the spa_deadman() logic should fire. By default the
  * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
  * Secondly, the value determines if an I/O is considered "hung". Any I/O that
  * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
  * in one of three behaviors controlled by zfs_deadman_failmode.
  */
 uint64_t zfs_deadman_synctime_ms = 600000UL;  /* 10 min. */
 
 /*
  * This value controls the maximum amount of time zio_wait() will block for an
  * outstanding IO.  By default this is 300 seconds at which point the "hung"
  * behavior will be applied as described for zfs_deadman_synctime_ms.
  */
 uint64_t zfs_deadman_ziotime_ms = 300000UL;  /* 5 min. */
 
 /*
  * Check time in milliseconds. This defines the frequency at which we check
  * for hung I/O.
  */
 uint64_t zfs_deadman_checktime_ms = 60000UL;  /* 1 min. */
 
 /*
  * By default the deadman is enabled.
  */
 int zfs_deadman_enabled = B_TRUE;
 
 /*
  * Controls the behavior of the deadman when it detects a "hung" I/O.
  * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
  *
  * wait     - Wait for the "hung" I/O (default)
  * continue - Attempt to recover from a "hung" I/O
  * panic    - Panic the system
  */
 const char *zfs_deadman_failmode = "wait";
 
 /*
  * The worst case is single-sector max-parity RAID-Z blocks, in which
  * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
  * times the size; so just assume that.  Add to this the fact that
  * we can have up to 3 DVAs per bp, and one more factor of 2 because
  * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
  * the worst case is:
  *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
  */
 uint_t spa_asize_inflation = 24;
 
 /*
  * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
  * the pool to be consumed (bounded by spa_max_slop).  This ensures that we
  * don't run the pool completely out of space, due to unaccounted changes (e.g.
  * to the MOS).  It also limits the worst-case time to allocate space.  If we
  * have less than this amount of free space, most ZPL operations (e.g.  write,
  * create) will return ENOSPC.  The ZIL metaslabs (spa_embedded_log_class) are
  * also part of this 3.2% of space which can't be consumed by normal writes;
  * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded
  * log space.
  *
  * Certain operations (e.g. file removal, most administrative actions) can
  * use half the slop space.  They will only return ENOSPC if less than half
  * the slop space is free.  Typically, once the pool has less than the slop
  * space free, the user will use these operations to free up space in the pool.
  * These are the operations that call dsl_pool_adjustedsize() with the netfree
  * argument set to TRUE.
  *
  * Operations that are almost guaranteed to free up space in the absence of
  * a pool checkpoint can use up to three quarters of the slop space
  * (e.g. zfs destroy).
  *
  * A very restricted set of operations are always permitted, regardless of
  * the amount of free space.  These are the operations that call
  * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
  * increase in the amount of space used, it is possible to run the pool
  * completely out of space, causing it to be permanently read-only.
  *
  * Note that on very small pools, the slop space will be larger than
  * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
  * but we never allow it to be more than half the pool size.
  *
  * Further, on very large pools, the slop space will be smaller than
  * 3.2%, to avoid reserving much more space than we actually need; bounded
  * by spa_max_slop (128GB).
  *
  * See also the comments in zfs_space_check_t.
  */
 uint_t spa_slop_shift = 5;
 static const uint64_t spa_min_slop = 128ULL * 1024 * 1024;
 static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024;
 
 /*
  * Allocator sizing, per spa instance.  spa_add() sets the allocator count
  * to MIN(spa_num_allocators, boot_ncpus / spa_cpus_per_allocator), treating
  * a spa_cpus_per_allocator below 1 as 1 and never going below one
  * allocator.
  */
 static int spa_num_allocators = 4;
 static int spa_cpus_per_allocator = 4;
 
 /*
  * Spa active allocator.
  * Valid values are zfs_active_allocator=<dynamic|cursor|new-dynamic>.
  */
 const char *zfs_active_allocator = "dynamic";
 
 /*
  * Format a printf-style failure message (truncated to fit a 256-byte
  * buffer) and record it in the debug log, tagged with the pool name and
  * whether the pool's config is currently trusted.
  */
 void
 spa_load_failed(spa_t *spa, const char *fmt, ...)
 {
 	char msg[256];
 	va_list args;
 	const char *trust;
 
 	va_start(args, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, args);
 	va_end(args);
 
 	trust = spa->spa_trust_config ? "trusted" : "untrusted";
 	zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
 	    trust, msg);
 }
 
 /*
  * Format a printf-style progress note (truncated to fit a 256-byte buffer),
  * record it in the debug log tagged with the pool name and config trust
  * state, and pass the same text to spa_import_progress_set_notes_nolog().
  */
 void
 spa_load_note(spa_t *spa, const char *fmt, ...)
 {
 	char msg[256];
 	va_list args;
 	const char *trust;
 
 	va_start(args, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, args);
 	va_end(args);
 
 	trust = spa->spa_trust_config ? "trusted" : "untrusted";
 	zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
 	    trust, msg);
 
 	/* Pass the text as an argument so it is never used as a format. */
 	spa_import_progress_set_notes_nolog(spa, "%s", msg);
 }
 
 /*
  * By default dedup and user data indirects land in the special class
  */
 static int zfs_ddt_data_is_special = B_TRUE;
 static int zfs_user_indirect_is_special = B_TRUE;
 
 /*
  * The percentage of special class final space reserved for metadata only.
  * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only
  * let metadata into the class.
  */
 static uint_t zfs_special_class_metadata_reserve_pct = 25;
 
 /*
  * ==========================================================================
  * SPA config locking
  * ==========================================================================
  */
 /*
  * Initialize every entry of spa->spa_config_lock[]: set up its mutex and
  * condition variable and mark it as having no writer, no waiting writers
  * and no holders.
  */
 static void
 spa_config_lock_init(spa_t *spa)
 {
 	int idx = 0;
 
 	while (idx < SCL_LOCKS) {
 		spa_config_lock_t *lock = &spa->spa_config_lock[idx];
 
 		mutex_init(&lock->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&lock->scl_cv, NULL, CV_DEFAULT, NULL);
 
 		lock->scl_count = 0;
 		lock->scl_write_wanted = 0;
 		lock->scl_writer = NULL;
 
 		idx++;
 	}
 }
 
 /*
  * Tear down every entry of spa->spa_config_lock[].  Each lock is expected
  * to be idle: no writer, no waiting writers and no holders.
  */
 static void
 spa_config_lock_destroy(spa_t *spa)
 {
 	int i = 0;
 
 	while (i < SCL_LOCKS) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 
 		mutex_destroy(&scl->scl_lock);
 		cv_destroy(&scl->scl_cv);
 
 		/* Only the bookkeeping fields are examined from here on. */
 		ASSERT(scl->scl_writer == NULL);
 		ASSERT(scl->scl_write_wanted == 0);
 		ASSERT(scl->scl_count == 0);
 
 		i++;
 	}
 }
 
 /*
  * Non-blocking variant of spa_config_enter().  Attempts to take every lock
  * selected by the 'locks' bitmask in mode 'rw', in ascending index order.
  * Returns 1 when all of them were acquired.  If any lock is unavailable,
  * the lower-indexed locks acquired so far are released and 0 is returned.
  *
  * A reader is refused while a writer holds the lock or is waiting for it;
  * a writer is refused while the lock has any holder.
  */
 int
 spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw)
 {
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		const int bit = 1 << i;
 		spa_config_lock_t *scl;
 		int busy;
 
 		if ((locks & bit) == 0)
 			continue;
 
 		scl = &spa->spa_config_lock[i];
 		mutex_enter(&scl->scl_lock);
 
 		if (rw == RW_READER) {
 			busy = (scl->scl_writer || scl->scl_write_wanted);
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			busy = (scl->scl_count != 0);
 		}
 
 		if (busy) {
 			mutex_exit(&scl->scl_lock);
 			/* Back out of the lower-numbered locks taken above. */
 			spa_config_exit(spa, locks & (bit - 1), tag);
 			return (0);
 		}
 
 		if (rw != RW_READER)
 			scl->scl_writer = curthread;
 		scl->scl_count++;
 		mutex_exit(&scl->scl_lock);
 	}
 
 	return (1);
 }
 
 /*
  * Common implementation of spa_config_enter() and spa_config_enter_mmp().
  * Blocks until every lock selected by the 'locks' bitmask is held in mode
  * 'rw', taking them in ascending index order.  When 'mmp_flag' is nonzero,
  * a reader does not wait behind writers that are only waiting
  * (scl_write_wanted); it still waits for an active writer.  'tag' is
  * unused.
  */
 static void
 spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw,
     int mmp_flag)
 {
 	(void) tag;
 	/* Bitmask of locks on which this thread is already the writer. */
 	int wlocks_held = 0;
 
 	/* Every lock index must fit in the wlocks_held bitmask. */
 	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
 
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		/*
 		 * Recorded for every lock, requested or not, and read
 		 * without taking scl_lock.
 		 */
 		if (scl->scl_writer == curthread)
 			wlocks_held |= (1 << i);
 		if (!(locks & (1 << i)))
 			continue;
 		mutex_enter(&scl->scl_lock);
 		if (rw == RW_READER) {
 			/*
 			 * Wait out an active writer and, unless mmp_flag is
 			 * set, any writers that are queued up.
 			 */
 			while (scl->scl_writer ||
 			    (!mmp_flag && scl->scl_write_wanted)) {
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			/*
 			 * Wait for all holders to leave; scl_write_wanted is
 			 * raised only for the duration of each wait.
 			 */
 			while (scl->scl_count != 0) {
 				scl->scl_write_wanted++;
 				cv_wait(&scl->scl_cv, &scl->scl_lock);
 				scl->scl_write_wanted--;
 			}
 			scl->scl_writer = curthread;
 		}
 		/* Readers and the writer alike count as one hold. */
 		scl->scl_count++;
 		mutex_exit(&scl->scl_lock);
 	}
 	/* Numeric comparison of the already-write-held mask and the request. */
 	ASSERT3U(wlocks_held, <=, locks);
 }
 
 /*
  * Acquire the config locks selected by 'locks' in mode 'rw', blocking as
  * needed.  This is spa_config_enter_impl() with mmp_flag set to 0.
  */
 void
 spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
 {
 	spa_config_enter_impl(spa, locks, tag, rw, 0);
 }
 
 /*
  * spa_config_enter_mmp() allows the mmp thread to cut in front of
  * outstanding write lock requests.  This is needed since the mmp updates
  * are time sensitive and failure to service them promptly will result in a
  * suspended pool.  This pool suspension has been seen in practice when
  * there is a single disk in a pool that is responding slowly and presumably
  * about to fail.
  *
  * This is spa_config_enter_impl() with mmp_flag set to 1.
  */
 
 void
 spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw)
 {
 	spa_config_enter_impl(spa, locks, tag, rw, 1);
 }
 
 /*
  * Drop one hold on each config lock selected by 'locks', walking the array
  * from the highest index down.  When the last hold on a lock goes away its
  * writer is cleared and every waiter is woken.  'tag' is unused.
  */
 void
 spa_config_exit(spa_t *spa, int locks, const void *tag)
 {
 	int idx = SCL_LOCKS;
 
 	(void) tag;
 
 	while (idx-- > 0) {
 		spa_config_lock_t *scl;
 
 		if ((locks & (1 << idx)) == 0)
 			continue;
 
 		scl = &spa->spa_config_lock[idx];
 		mutex_enter(&scl->scl_lock);
 		ASSERT(scl->scl_count > 0);
 		scl->scl_count--;
 		if (scl->scl_count == 0) {
 			ASSERT(scl->scl_writer == NULL ||
 			    scl->scl_writer == curthread);
 			/* Correct whether we held it as reader or writer. */
 			scl->scl_writer = NULL;
 			cv_broadcast(&scl->scl_cv);
 		}
 		mutex_exit(&scl->scl_lock);
 	}
 }
 
 /*
  * Return the subset of 'locks' that is currently held in mode 'rw'.
  * For RW_READER a lock counts as held whenever it has any holder at all;
  * for RW_WRITER it counts only when the calling thread is the writer.
  * No scl_lock is taken while inspecting the locks.
  */
 int
 spa_config_held(spa_t *spa, int locks, krw_t rw)
 {
 	int held = 0;
 
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		const int bit = 1 << i;
 		const spa_config_lock_t *scl;
 
 		if ((locks & bit) == 0)
 			continue;
 
 		scl = &spa->spa_config_lock[i];
 		if (rw == RW_READER) {
 			if (scl->scl_count != 0)
 				held |= bit;
 		} else if (rw == RW_WRITER) {
 			if (scl->scl_writer == curthread)
 				held |= bit;
 		}
 	}
 
 	return (held);
 }
 
 /*
  * ==========================================================================
  * SPA namespace functions
  * ==========================================================================
  */
 
 /*
  * Lookup the named spa_t in the AVL tree.  The spa_namespace_lock must be held.
  * Returns NULL if no matching spa_t is found.
  */
 spa_t *
 spa_lookup(const char *name)
 {
 	static spa_t search;	/* spa_t is large; don't allocate on stack */
 	avl_index_t where;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	for (;;) {
 		spa_t *found;
 		char *sep;
 		int loading, exporting;
 
 		(void) strlcpy(search.spa_name, name,
 		    sizeof (search.spa_name));
 
 		/*
 		 * A full dataset, snapshot or bookmark name is cut at its
 		 * first '/', '@' or '#' so that only the pool name is used
 		 * as the search key.
 		 */
 		sep = strpbrk(search.spa_name, "/@#");
 		if (sep != NULL)
 			*sep = '\0';
 
 		found = avl_find(&spa_namespace_avl, &search, &where);
 		if (found == NULL)
 			return (NULL);
 
 		/*
 		 * Import and export do not hold the namespace lock for
 		 * their entire duration.  If some other thread is in the
 		 * middle of either one, sleep on spa_namespace_cv and then
 		 * repeat the whole lookup.
 		 */
 		loading = (found->spa_load_thread != NULL &&
 		    found->spa_load_thread != curthread);
 		exporting = (found->spa_export_thread != NULL &&
 		    found->spa_export_thread != curthread);
 		if (!loading && !exporting)
 			return (found);
 
 		cv_wait(&spa_namespace_cv, &spa_namespace_lock);
 	}
 }
 
 /*
  * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
  * Logs how long the current sync has been running and how many times the
  * deadman has fired and, if the zfs_deadman_enabled flag is set, inspects
  * all vdev queues looking for potentially hung I/Os.  Unless the pool is
  * suspended, the callback then re-dispatches itself to run again after
  * zfs_deadman_checktime_ms.
  *
  * 'arg' is the spa_t being monitored.
  */
 void
 spa_deadman(void *arg)
 {
 	spa_t *spa = arg;
 
 	/* Disable the deadman if the pool is suspended. */
 	if (spa_suspended(spa))
 		return;
 
 	/*
 	 * Both values are printed with %llu, so both are cast to
 	 * u_longlong_t; the elapsed time is derived from gethrtime() and is
 	 * not otherwise guaranteed to have the type %llu expects.
 	 *
 	 * NOTE(review): spa_deadman_calls is incremented inside the
 	 * zfs_dbgmsg() argument list.  If zfs_dbgmsg() can skip evaluating
 	 * its arguments, the counter would not advance -- confirm.
 	 */
 	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
 	    (u_longlong_t)((gethrtime() - spa->spa_sync_starttime) / NANOSEC),
 	    (u_longlong_t)++spa->spa_deadman_calls);
 	if (zfs_deadman_enabled)
 		vdev_deadman(spa->spa_root_vdev, FTAG);
 
 	/* Re-arm: run again one check interval from now. */
 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
 	    MSEC_TO_TICK(zfs_deadman_checktime_ms));
 }
 
 /*
  * Comparator for the spa_sm_logs_by_txg tree: orders spa_log_sm_t entries
  * by their sls_txg field.
  */
 static int
 spa_log_sm_sort_by_txg(const void *va, const void *vb)
 {
 	const spa_log_sm_t *lhs = (const spa_log_sm_t *)va;
 	const spa_log_sm_t *rhs = (const spa_log_sm_t *)vb;
 
 	return (TREE_CMP(lhs->sls_txg, rhs->sls_txg));
 }
 
 /*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
  *
  * 'name' is copied into spa_name (truncated to fit).  'config' may be NULL;
  * when given, it is duplicated into spa_config and its
  * ZPOOL_CONFIG_FEATURES_FOR_READ list, if present, into spa_label_features.
  * 'altroot' may be NULL; when given, it is duplicated into spa_root and the
  * initial cachefile entry is created without a path.
  *
  * Returns the new spa_t, already inserted into spa_namespace_avl.
  */
 spa_t *
 spa_add(const char *name, nvlist_t *config, const char *altroot)
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* Zeroed allocation: fields not assigned below start as 0/NULL. */
 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One free bplist per TXG_SIZE slot. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_create(&spa->spa_free_bplist[t]);
 
 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 	spa->spa_freeze_txg = UINT64_MAX;
 	spa->spa_final_txg = UINT64_MAX;
 	spa->spa_load_max_txg = UINT64_MAX;
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 	spa->spa_trust_config = B_TRUE;
 	spa->spa_hostid = zone_get_hostid(NULL);
 
 	/*
 	 * Capture the deadman and allocator tunables as they are now; the
 	 * millisecond tunables are stored in nanoseconds.
 	 */
 	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
 	spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
 	spa_set_deadman_failmode(spa, zfs_deadman_failmode);
 	spa_set_allocator(spa, zfs_active_allocator);
 
 	zfs_refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
 	spa_stats_init(spa);
 
 	/* From here on the pool can be found through the namespace tree. */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	avl_add(&spa_namespace_avl, spa);
 
 	/*
 	 * Set the alternate root, if there is one.
 	 */
 	if (altroot)
 		spa->spa_root = spa_strdup(altroot);
 
 	/* Do not allow more allocators than fraction of CPUs. */
 	spa->spa_alloc_count = MAX(MIN(spa_num_allocators,
 	    boot_ncpus / MAX(spa_cpus_per_allocator, 1)), 1);
 
 	spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
 	    sizeof (spa_alloc_t), KM_SLEEP);
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
 	/*
 	 * The usage-tracking structure exists only with more than one
 	 * allocator; it is sized to hold spa_alloc_count sau_inuse[] entries.
 	 */
 	if (spa->spa_alloc_count > 1) {
 		spa->spa_allocs_use = kmem_zalloc(offsetof(spa_allocs_use_t,
 		    sau_inuse[spa->spa_alloc_count]), KM_SLEEP);
 		mutex_init(&spa->spa_allocs_use->sau_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 	}
 
 	/* Containers used for log spacemap bookkeeping. */
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
 	avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
 	    sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node));
 	list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t),
 	    offsetof(log_summary_entry_t, lse_node));
 
 	/*
 	 * Every pool starts with the default cachefile
 	 */
 	list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
 	    offsetof(spa_config_dirent_t, scd_link));
 
 	/* With an altroot the entry is created with a NULL path. */
 	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
 	list_insert_head(&spa->spa_config_list, dp);
 
 	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
 	    KM_SLEEP) == 0);
 
 	if (config != NULL) {
 		nvlist_t *features;
 
 		/* Keep private copies; the caller's nvlists are not retained. */
 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
 		    &features) == 0) {
 			VERIFY(nvlist_dup(features, &spa->spa_label_features,
 			    0) == 0);
 		}
 
 		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
 	}
 
 	/* No feature list came from the config: start with an empty one. */
 	if (spa->spa_label_features == NULL) {
 		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 	}
 
 	/*
 	 * Extreme starting values.
 	 * NOTE(review): presumably narrowed as vdevs are added -- confirm.
 	 */
 	spa->spa_min_ashift = INT_MAX;
 	spa->spa_max_ashift = 0;
 	spa->spa_min_alloc = INT_MAX;
 	spa->spa_gcd_alloc = INT_MAX;
 
 	/* Reset cached value */
 	spa->spa_dedup_dspace = ~0ULL;
 
 	/*
 	 * As a pool is being created, treat all features as disabled by
 	 * setting SPA_FEATURE_DISABLED for all entries in the feature
 	 * refcount cache.
 	 */
 	for (int i = 0; i < SPA_FEATURES; i++) {
 		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
 	}
 
 	list_create(&spa->spa_leaf_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_leaf_node));
 
 	return (spa);
 }
 
 /*
  * Removes a spa_t from the namespace, freeing up any memory used.  Requires
  * spa_namespace_lock.  This is called only after the spa_t has been closed and
  * deactivated: the pool state must be POOL_STATE_UNINITIALIZED, the
  * refcount must be zero and there must be no waiters.
  *
  * The spa_t itself is freed, so 'spa' must not be used after this returns.
  */
 void
 spa_remove(spa_t *spa)
 {
 	spa_config_dirent_t *dp;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED);
 	ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
 	ASSERT0(spa->spa_waiters);
 
 	nvlist_free(spa->spa_config_splitting);
 
 	/* After this the pool can no longer be found by spa_lookup(). */
 	avl_remove(&spa_namespace_avl, spa);
 
 	if (spa->spa_root)
 		spa_strfree(spa->spa_root);
 
 	/* Free every cachefile entry; scd_path may be NULL. */
 	while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path != NULL)
 			spa_strfree(dp->scd_path);
 		kmem_free(dp, sizeof (spa_config_dirent_t));
 	}
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		avl_destroy(&spa->spa_allocs[i].spaa_tree);
 		mutex_destroy(&spa->spa_allocs[i].spaa_lock);
 	}
 	kmem_free(spa->spa_allocs, spa->spa_alloc_count *
 	    sizeof (spa_alloc_t));
 	/* Same condition and size computation as the allocation in spa_add(). */
 	if (spa->spa_alloc_count > 1) {
 		mutex_destroy(&spa->spa_allocs_use->sau_lock);
 		kmem_free(spa->spa_allocs_use, offsetof(spa_allocs_use_t,
 		    sau_inuse[spa->spa_alloc_count]));
 	}
 
 	avl_destroy(&spa->spa_metaslabs_by_flushed);
 	avl_destroy(&spa->spa_sm_logs_by_txg);
 	list_destroy(&spa->spa_log_summary);
 	list_destroy(&spa->spa_config_list);
 	list_destroy(&spa->spa_leaf_list);
 
 	nvlist_free(spa->spa_label_features);
 	nvlist_free(spa->spa_load_info);
 	nvlist_free(spa->spa_feat_stats);
 	/* NOTE(review): presumably releases the cached spa_config -- confirm. */
 	spa_config_set(spa, NULL);
 
 	zfs_refcount_destroy(&spa->spa_refcount);
 
 	spa_stats_destroy(spa);
 	spa_config_lock_destroy(spa);
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		bplist_destroy(&spa->spa_free_bplist[t]);
 
 	zio_checksum_templates_free(spa);
 
 	/* Destroy the condition variables and mutexes set up by spa_add(). */
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_evicting_os_cv);
 	cv_destroy(&spa->spa_proc_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 	cv_destroy(&spa->spa_activities_cv);
 	cv_destroy(&spa->spa_waiters_cv);
 
 	mutex_destroy(&spa->spa_flushed_ms_lock);
 	mutex_destroy(&spa->spa_async_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
 	mutex_destroy(&spa->spa_evicting_os_lock);
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
 	mutex_destroy(&spa->spa_cksum_tmpls_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
 	mutex_destroy(&spa->spa_feat_stats_lock);
 	mutex_destroy(&spa->spa_activities_lock);
 
 	kmem_free(spa, sizeof (spa_t));
 }
 
 /*
  * Given a pool, return the next pool in the namespace, or NULL if there is
  * none.  If 'prev' is NULL, return the first pool.
  */
 spa_t *
 spa_next(spa_t *prev)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* No starting point: begin at the first pool in the tree. */
 	if (prev == NULL)
 		return (avl_first(&spa_namespace_avl));
 
 	return (AVL_NEXT(&spa_namespace_avl, prev));
 }
 
 /*
  * ==========================================================================
  * SPA refcount functions
  * ==========================================================================
  */
 
 /*
  * Take a reference on 'spa' on behalf of 'tag'.  The caller is expected to
  * hold the namespace lock, to be the thread loading the pool, or to be
  * calling on a pool whose refcount is already at least spa_minref.
  */
 void
 spa_open_ref(spa_t *spa, const void *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_load_thread == curthread ||
 	    zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref);
 
 	(void) zfs_refcount_add(&spa->spa_refcount, tag);
 }
 
 /*
  * Drop the reference on 'spa' held by 'tag'.  The caller is expected to
  * hold the namespace lock, to be the pool's load or export thread, or to be
  * calling on a pool whose refcount is above spa_minref.
  */
 void
 spa_close(spa_t *spa, const void *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_load_thread == curthread ||
 	    spa->spa_export_thread == curthread ||
 	    zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref);
 
 	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Remove a reference to the given spa_t held by a dsl dir that is
  * being asynchronously released.  Async releases occur from a taskq
  * performing eviction of dsl datasets and dirs.  The namespace lock
  * isn't held and the hold by the object being evicted may contribute to
  * spa_minref (e.g. dataset or directory released during pool export),
  * so the asserts in spa_close() do not apply.
  *
  * Apart from skipping those asserts, this performs the same refcount
  * removal as spa_close().
  */
 void
 spa_async_close(spa_t *spa, const void *tag)
 {
 	(void) zfs_refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Report whether the pool has no references beyond its baseline.  The
  * refcount is compared against spa_minref, the number of references
  * acquired when opening a pool, rather than against literal zero.  Must be
  * called with spa_namespace_lock held or from the spa export thread.
  */
 boolean_t
 spa_refcount_zero(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	if (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * ==========================================================================
  * SPA spare and l2cache tracking
  * ==========================================================================
  */
 
 /*
  * Hot spares and cache devices are tracked using the same code below,
  * for 'auxiliary' devices.
  */
 
 /*
  * One entry in a global auxiliary-device (spare or l2cache) AVL tree.
  * Entries are keyed by aux_guid only; see spa_aux_compare().
  */
 typedef struct spa_aux {
 	uint64_t	aux_guid;	/* vdev guid; the AVL key */
 	uint64_t	aux_pool;	/* guid of activating pool, else 0 */
 	avl_node_t	aux_avl;	/* AVL tree linkage */
 	int		aux_count;	/* number of spa_aux_add() holds */
 } spa_aux_t;
 
 /*
  * AVL comparator for spa_aux_t entries: orders them by aux_guid alone.
  */
 static inline int
 spa_aux_compare(const void *a, const void *b)
 {
 	uint64_t guid_a = ((const spa_aux_t *)a)->aux_guid;
 	uint64_t guid_b = ((const spa_aux_t *)b)->aux_guid;
 
 	return (TREE_CMP(guid_a, guid_b));
 }
 
 /*
  * Take a hold on the aux entry for vd's guid in 'avl'.  An existing entry
  * has its count bumped; otherwise a zeroed entry with a count of one is
  * allocated and inserted.
  */
 static void
 spa_aux_add(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t search;
 	spa_aux_t *aux;
 	avl_index_t where;
 
 	search.aux_guid = vd->vdev_guid;
 	aux = avl_find(avl, &search, &where);
 	if (aux != NULL) {
 		aux->aux_count++;
 		return;
 	}
 
 	/* First hold for this guid: create the entry at the found slot. */
 	aux = kmem_zalloc(sizeof (*aux), KM_SLEEP);
 	aux->aux_guid = vd->vdev_guid;
 	aux->aux_count = 1;
 	avl_insert(avl, aux, where);
 }
 
 /*
  * Drop one hold on the aux entry for vd's guid in 'avl'.  The entry must
  * exist.  When the last hold goes away the entry is removed and freed;
  * otherwise, if vd's pool is the one recorded as having activated the
  * device, that record is cleared.
  */
 static void
 spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t search;
 	spa_aux_t *aux;
 	avl_index_t where;
 
 	search.aux_guid = vd->vdev_guid;
 	aux = avl_find(avl, &search, &where);
 
 	ASSERT(aux != NULL);
 
 	aux->aux_count--;
 	if (aux->aux_count == 0) {
 		avl_remove(avl, aux);
 		kmem_free(aux, sizeof (spa_aux_t));
 		return;
 	}
 
 	if (aux->aux_pool == spa_guid(vd->vdev_spa))
 		aux->aux_pool = 0ULL;
 }
 
 /*
  * Look up 'guid' in 'avl'.  Returns B_TRUE if an entry exists.  When the
  * optional 'pool' and 'refcnt' out-parameters are supplied they receive the
  * entry's aux_pool and aux_count, or zero when no entry was found.
  */
 static boolean_t
 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
 {
 	spa_aux_t search, *found;
 
 	search.aux_guid = guid;
 	found = avl_find(avl, &search, NULL);
 
 	if (pool != NULL)
 		*pool = (found != NULL) ? found->aux_pool : 0ULL;
 
 	if (refcnt != NULL)
 		*refcnt = (found != NULL) ? found->aux_count : 0;
 
 	return (found != NULL);
 }
 
 /*
  * Record vd's pool as the one that has activated the aux device.  The
  * entry for vd's guid must already exist in 'avl' and must not currently
  * be recorded as active in any pool.
  */
 static void
 spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
 {
 	spa_aux_t search;
 	spa_aux_t *aux;
 
 	search.aux_guid = vd->vdev_guid;
 	/* The insertion point is not needed, so no 'where' is requested. */
 	aux = avl_find(avl, &search, NULL);
 
 	ASSERT(aux != NULL);
 	ASSERT(aux->aux_pool == 0ULL);
 
 	aux->aux_pool = spa_guid(vd->vdev_spa);
 }
 
 /*
  * Spares are tracked globally due to the following constraints:
  *
  *	- A spare may be part of multiple pools.
  *	- A spare may be added to a pool even if it's actively in use within
  *	  another pool.
  *	- A spare in use in any pool can only be the source of a replacement if
  *	  the target is a spare in the same pool.
  *
  * We keep track of all spares on the system through the use of a reference
  * counted AVL tree.  When a vdev is added as a spare, or used as a replacement
  * spare, then we bump the reference count in the AVL tree.  In addition, we set
  * the 'vdev_isspare' member to indicate that the device is a spare (active or
  * inactive).  When a spare is made active (used to replace a device in the
  * pool), we also keep track of which pool it's been made a part of.
  *
  * The 'spa_spare_lock' protects the AVL tree.  These functions are normally
  * called under the spa_namespace lock as part of vdev reconfiguration.  The
  * separate spare lock exists for the status query path, which does not need to
  * be completely consistent with respect to other vdev configuration changes.
  */
 
 /*
  * AVL comparator for the spare tree; defers to spa_aux_compare().
  */
 static int
 spa_spare_compare(const void *a, const void *b)
 {
 	int cmp = spa_aux_compare(a, b);
 
 	return (cmp);
 }
 
 /*
  * Take a hold on 'vd' in the global spare tree and flag it with
  * vdev_isspare.  The vdev must not already be flagged.  The whole update
  * runs under spa_spare_lock.
  */
 void
 spa_spare_add(vdev_t *vd)
 {
 	mutex_enter(&spa_spare_lock);
 	ASSERT(!vd->vdev_isspare);
 	spa_aux_add(vd, &spa_spare_avl);
 	vd->vdev_isspare = B_TRUE;
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Drop vd's hold in the global spare tree and clear its vdev_isspare flag.
  * The vdev must currently be flagged as a spare.  The whole update runs
  * under spa_spare_lock.
  */
 void
 spa_spare_remove(vdev_t *vd)
 {
 	mutex_enter(&spa_spare_lock);
 	ASSERT(vd->vdev_isspare);
 	spa_aux_remove(vd, &spa_spare_avl);
 	vd->vdev_isspare = B_FALSE;
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Query the global spare tree for 'guid' under spa_spare_lock.  Returns
  * B_TRUE if the guid is tracked; the optional 'pool' and 'refcnt'
  * out-parameters are filled in by spa_aux_exists().
  */
 boolean_t
 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
 {
 	boolean_t tracked;
 
 	mutex_enter(&spa_spare_lock);
 	tracked = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
 	mutex_exit(&spa_spare_lock);
 
 	return (tracked);
 }
 
 /*
  * Record vd's pool as the one actively using this spare.  The vdev must
  * already be flagged as a spare.  Runs under spa_spare_lock.
  */
 void
 spa_spare_activate(vdev_t *vd)
 {
 	mutex_enter(&spa_spare_lock);
 	ASSERT(vd->vdev_isspare);
 	spa_aux_activate(vd, &spa_spare_avl);
 	mutex_exit(&spa_spare_lock);
 }
 
 /*
  * Level 2 ARC devices are tracked globally for the same reasons as spares.
  * Cache devices currently only support one pool per cache device, and so
  * for these devices the aux reference count is currently unused beyond 1.
  */
 
 /*
  * AVL comparator for the l2cache tree; defers to spa_aux_compare().
  */
 static int
 spa_l2cache_compare(const void *a, const void *b)
 {
 	int cmp = spa_aux_compare(a, b);
 
 	return (cmp);
 }
 
 /*
  * Take a hold on 'vd' in the global l2cache tree and flag it with
  * vdev_isl2cache.  The vdev must not already be flagged.  The whole update
  * runs under spa_l2cache_lock.
  */
 void
 spa_l2cache_add(vdev_t *vd)
 {
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(!vd->vdev_isl2cache);
 	spa_aux_add(vd, &spa_l2cache_avl);
 	vd->vdev_isl2cache = B_TRUE;
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * Drop vd's hold in the global l2cache tree and clear its vdev_isl2cache
  * flag.  The vdev must currently be flagged as an l2cache device.  The
  * whole update runs under spa_l2cache_lock.
  */
 void
 spa_l2cache_remove(vdev_t *vd)
 {
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(vd->vdev_isl2cache);
 	spa_aux_remove(vd, &spa_l2cache_avl);
 	vd->vdev_isl2cache = B_FALSE;
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * Query the global l2cache tree for 'guid' under spa_l2cache_lock.
  * Returns B_TRUE if the guid is tracked; the optional 'pool' out-parameter
  * is filled in by spa_aux_exists().  No reference count is requested.
  */
 boolean_t
 spa_l2cache_exists(uint64_t guid, uint64_t *pool)
 {
 	boolean_t tracked;
 
 	mutex_enter(&spa_l2cache_lock);
 	tracked = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
 	mutex_exit(&spa_l2cache_lock);
 
 	return (tracked);
 }
 
 /*
  * Record vd's pool as the one actively using this cache device.  The vdev
  * must already be flagged as an l2cache device.  Runs under
  * spa_l2cache_lock.
  */
 void
 spa_l2cache_activate(vdev_t *vd)
 {
 	mutex_enter(&spa_l2cache_lock);
 	ASSERT(vd->vdev_isl2cache);
 	spa_aux_activate(vd, &spa_l2cache_avl);
 	mutex_exit(&spa_l2cache_lock);
 }
 
 /*
  * ==========================================================================
  * SPA vdev locking
  * ==========================================================================
  */
 
 /*
  * Lock the given spa_t for the purpose of adding or removing a vdev.
  * Grabs the pool's spa_vdev_top_lock, then the global spa_namespace_lock,
  * stops autotrim on the pool, and finally takes the spa config lock for
  * writing via spa_vdev_config_enter().  The pool must not be in the middle
  * of an export.
  * It returns the next transaction group for the spa_t.
  */
 uint64_t
 spa_vdev_enter(spa_t *spa)
 {
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	ASSERT0(spa->spa_export_thread);
 
 	vdev_autotrim_stop_all(spa);
 
 	return (spa_vdev_config_enter(spa));
 }
 
 /*
  * The same as spa_vdev_enter() above but additionally takes the guid of
  * the vdev being detached.  When there is a rebuild in progress it will be
  * suspended while the vdev tree is modified then resumed by spa_vdev_exit().
  * The rebuild is canceled if only a single child remains after the detach.
  *
  * A guid of zero, or one that does not resolve to a vdev, skips the
  * rebuild handling.
  */
 uint64_t
 spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
 {
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	ASSERT0(spa->spa_export_thread);
 
 	vdev_autotrim_stop_all(spa);
 
 	if (guid != 0) {
 		vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 		if (vd) {
 			/* Operate on the top-level vdev of the detach target. */
 			vdev_rebuild_stop_wait(vd->vdev_top);
 		}
 	}
 
 	return (spa_vdev_config_enter(spa));
 }
 
 /*
  * Back end of spa_vdev_enter().  Takes every spa config lock as writer and
  * returns the txg following the last synced one.  Callers that need several
  * syncs for one vdev operation (e.g. removing a device) use this directly
  * so that spa_namespace_lock, which must already be held, stays held
  * throughout.
  */
 uint64_t
 spa_vdev_config_enter(spa_t *spa)
 {
 	uint64_t next_txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	next_txg = spa_last_synced_txg(spa) + 1;
 
 	return (next_txg);
 }
 
 /*
  * Used in combination with spa_vdev_config_enter() to allow the syncing
  * of multiple transactions without releasing the spa_namespace_lock.
  *
  * Drops the SCL_ALL config lock taken by spa_vdev_config_enter(), waits for
  * 'txg' to sync when 'error' is zero, frees 'vd' if one is supplied, and
  * rewrites the config cache if the configuration was dirtied.  'tag' is
  * only passed through to the panic-injection hook.
  */
 void
 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
     const char *tag)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/* A flag, so typed as boolean_t like in spa_vdev_state_exit(). */
 	boolean_t config_changed = B_FALSE;
 
 	ASSERT(txg > spa_last_synced_txg(spa));
 
 	spa->spa_pending_vdev = NULL;
 
 	/*
 	 * Reassess the DTLs.
 	 */
 	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
 
 	/*
 	 * Only a successful operation that left something on the dirty
 	 * list counts as a configuration change.
 	 */
 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
 	/*
 	 * Verify the metaslab classes.
 	 */
 	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
 	ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
 
 	spa_config_exit(spa, SCL_ALL, spa);
 
 	/*
 	 * Panic the system if the specified tag requires it.  This
 	 * is useful for ensuring that configurations are updated
 	 * transactionally.
 	 */
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, tag, 0);
 
 	/*
 	 * Note: this txg_wait_synced() is important because it ensures
 	 * that there won't be more than one config change per txg.
 	 * This allows us to use the txg as the generation number.
 	 */
 	if (error == 0)
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	if (vd != NULL) {
 		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
 		if (vd->vdev_ops->vdev_op_leaf) {
 			/* Cancel any initialize or trim running on the leaf. */
 			mutex_enter(&vd->vdev_initialize_lock);
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
 			    NULL);
 			mutex_exit(&vd->vdev_initialize_lock);
 
 			mutex_enter(&vd->vdev_trim_lock);
 			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
 			mutex_exit(&vd->vdev_trim_lock);
 		}
 
 		/*
 		 * The vdev may be both a leaf and top-level device.
 		 */
 		vdev_autotrim_stop_wait(vd);
 
 		/* The vdev is freed under the state locks as writer. */
 		spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_STATE_ALL, spa);
 	}
 
 	/*
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed)
 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 }
 
 /*
  * Unlock the spa_t after adding or removing a vdev.  Besides undoing the
  * locking of spa_vdev_enter(), we also want to make sure the transactions
  * have synced to disk, and then update the global configuration cache with
  * the new information.
  *
  * Autotrim and rebuild are restarted before the config locks are dropped.
  * 'error' is returned unchanged.
  */
 int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
 	vdev_autotrim_restart(spa);
 	vdev_rebuild_restart(spa);
 
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
 	/* Release in the reverse of the order spa_vdev_enter() acquired. */
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
 
 /*
  * Lock the given spa_t for the purpose of changing vdev state.
  *
  * Takes SCL_STATE_ALL plus any extra config locks in 'oplocks' as writer,
  * and records the full set in spa_vdev_locks so spa_vdev_state_exit() can
  * release exactly what was taken.
  */
 void
 spa_vdev_state_enter(spa_t *spa, int oplocks)
 {
 	int locks = SCL_STATE_ALL | oplocks;
 
 	/*
 	 * Root pools may need to read from the underlying devfs filesystem
 	 * when opening up a vdev.  Unfortunately if we're holding the
 	 * SCL_ZIO lock it will result in a deadlock when we try to issue
 	 * the read from the root filesystem.  Instead we "prefetch"
 	 * the associated vnodes that we need prior to opening the
 	 * underlying devices and cache them so that we can prevent
 	 * any I/O when we are doing the actual open.
 	 */
 	if (spa_is_root(spa)) {
 		/*
 		 * Split the requested locks at SCL_ZIO: 'low' keeps the bits
 		 * at or above SCL_ZIO, 'high' keeps the bits below it.  The
 		 * 'high' set is taken first, then the vdevs are held, and
 		 * only then is the set containing SCL_ZIO taken.
 		 */
 		int low = locks & ~(SCL_ZIO - 1);
 		int high = locks & ~low;
 
 		spa_config_enter(spa, high, spa, RW_WRITER);
 		vdev_hold(spa->spa_root_vdev);
 		spa_config_enter(spa, low, spa, RW_WRITER);
 	} else {
 		spa_config_enter(spa, locks, spa, RW_WRITER);
 	}
 	spa->spa_vdev_locks = locks;
 }
 
 /*
  * Counterpart of spa_vdev_state_enter().  Reassesses DTLs, marks the
  * affected top-level vdev's state dirty when 'vd' is supplied, drops the
  * config locks recorded in spa_vdev_locks, waits for the change to sync,
  * and refreshes the config cache.  A non-NULL 'vd' is what signals that
  * state changed.  'error' is returned unchanged.
  */
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
 	boolean_t config_changed = B_FALSE;
 	vdev_t *vdev_top;
 
 	/* With no specific vdev, operate on the whole tree from the root. */
 	if (vd == NULL || vd == spa->spa_root_vdev) {
 		vdev_top = spa->spa_root_vdev;
 	} else {
 		vdev_top = vd->vdev_top;
 	}
 
 	if (vd != NULL || error == 0)
 		vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
 
 	if (vd != NULL) {
 		/* The root vdev itself is never marked state-dirty here. */
 		if (vd != spa->spa_root_vdev)
 			vdev_state_dirty(vdev_top);
 
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
 	/* Pairs with the vdev_hold() in spa_vdev_state_enter(). */
 	if (spa_is_root(spa))
 		vdev_rele(spa->spa_root_vdev);
 
 	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
 	spa_config_exit(spa, spa->spa_vdev_locks, spa);
 
 	/*
 	 * If anything changed, wait for it to sync.  This ensures that,
 	 * from the system administrator's perspective, zpool(8) commands
 	 * are synchronous.  This is important for things like zpool offline:
 	 * when the command completes, you expect no further I/O from ZFS.
 	 */
 	if (vd != NULL)
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	/*
 	 * If the config changed, update the config cache.
 	 */
 	if (config_changed) {
 		mutex_enter(&spa_namespace_lock);
 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous functions
  * ==========================================================================
  */
 
 /*
  * Add 'feature' to the pool's label features if it is not already there,
  * and dirty the vdev config so the change is written out.
  */
 void
 spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
 {
 	/* Already recorded: nothing to add and nothing to dirty. */
 	if (nvlist_exists(spa->spa_label_features, feature))
 		return;
 
 	fnvlist_add_boolean(spa->spa_label_features, feature);
 
 	/*
 	 * During pool creation (tx_txg == TXG_INITIAL) the SCL_CONFIG lock
 	 * is not held, so the vdev config must not be dirtied.  That is
 	 * fine: the config gets written out anyway once pool creation
 	 * finishes.
 	 */
 	if (tx->tx_txg == TXG_INITIAL)
 		return;
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Remove 'feature' from the pool's label features.  The vdev config is
  * dirtied only when the removal reports success.
  */
 void
 spa_deactivate_mos_feature(spa_t *spa, const char *feature)
 {
 	int err = nvlist_remove_all(spa->spa_label_features, feature);
 
 	if (err == 0)
 		vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Find the spa_t whose pool guid is 'pool_guid'.  With a non-zero
  * 'device_guid' the pool must additionally contain a vdev with that guid,
  * either in its vdev tree or among the vdevs currently being added.
  * Pools that are uninitialized or have no root vdev are skipped.  Returns
  * NULL when nothing matches.  The caller must hold spa_namespace_lock.
  */
 spa_t *
 spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
 {
 	avl_tree_t *t = &spa_namespace_avl;
 	spa_t *spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
 		if (spa->spa_state == POOL_STATE_UNINITIALIZED ||
 		    spa->spa_root_vdev == NULL)
 			continue;
 
 		if (spa_guid(spa) != pool_guid)
 			continue;
 
 		/* The pool guid alone is enough when no device is given. */
 		if (device_guid == 0)
 			break;
 
 		if (vdev_lookup_by_guid(spa->spa_root_vdev,
 		    device_guid) != NULL)
 			break;
 
 		/*
 		 * Check any devices we may be in the process of adding.
 		 */
 		if (spa->spa_pending_vdev != NULL &&
 		    vdev_lookup_by_guid(spa->spa_pending_vdev,
 		    device_guid) != NULL)
 			break;
 	}
 
 	return (spa);
 }
 
 /*
  * Determine whether a pool with the given pool_guid exists.
  */
 boolean_t
 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
 {
 	return (spa_by_guid(pool_guid, device_guid) != NULL);
 }
 
 /*
  * Return a kmem-allocated copy of the NUL-terminated string 's'.  The
  * allocation is exactly strlen(s) + 1 bytes, which is the size
  * spa_strfree() passes back to kmem_free().
  */
 char *
 spa_strdup(const char *s)
 {
 	size_t size = strlen(s) + 1;
 	char *copy = kmem_alloc(size, KM_SLEEP);
 
 	/* The size includes the terminator, so it is copied as well. */
 	memcpy(copy, s, size);
 
 	return (copy);
 }
 
 /*
  * Free a string with kmem_free(), using strlen(s) + 1 as the allocation
  * size.  That matches what spa_strdup() allocates.
  */
 void
 spa_strfree(char *s)
 {
 	size_t size = strlen(s) + 1;
 
 	kmem_free(s, size);
 }
 
 /*
  * Generate a random non-zero guid that is not already in use.  With a
  * non-NULL 'spa' the guid must not match a device guid within that pool;
  * with a NULL 'spa' it must not match any existing pool guid.
  */
 uint64_t
 spa_generate_guid(spa_t *spa)
 {
 	uint64_t guid;
 	boolean_t in_use;
 
 	do {
 		(void) random_get_pseudo_bytes((void *)&guid,
 		    sizeof (guid));
 
 		if (guid == 0) {
 			/* Zero is never acceptable; draw again. */
 			in_use = B_TRUE;
 		} else if (spa != NULL) {
 			in_use = spa_guid_exists(spa_guid(spa), guid);
 		} else {
 			in_use = spa_guid_exists(guid, 0);
 		}
 	} while (in_use);
 
 	return (guid);
 }
 
 /*
  * Report whether 'guid' is in use as a load guid: either some pool in the
  * namespace has it, or arc_async_flush_guid_inuse() says it is still in
  * use.  The caller must hold spa_namespace_lock.
  */
 static boolean_t
 spa_load_guid_exists(uint64_t guid)
 {
 	avl_tree_t *t = &spa_namespace_avl;
 	spa_t *spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa = avl_first(t);
 	while (spa != NULL) {
 		if (spa_load_guid(spa) == guid)
 			return (B_TRUE);
 		spa = AVL_NEXT(t, spa);
 	}
 
 	return (arc_async_flush_guid_inuse(guid));
 }
 
 /*
  * Generate a random non-zero load guid that spa_load_guid_exists() does
  * not report as in use.
  */
 uint64_t
 spa_generate_load_guid(void)
 {
 	uint64_t guid;
 
 	for (;;) {
 		(void) random_get_pseudo_bytes((void *)&guid,
 		    sizeof (guid));
 
 		if (guid != 0 && !spa_load_guid_exists(guid))
 			break;
 	}
 
 	return (guid);
 }
 
 /*
  * Format a description of block pointer 'bp' into 'buf' (at most 'buflen'
  * bytes) via the SNPRINTF_BLKPTR() macro, using kmem_scnprintf as the
  * formatter and a space as the separator.  The type, checksum and
  * compression names are looked up here and handed to the macro.
  */
 void
 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
 	char type[256];
 	const char *checksum = NULL;
 	const char *compress = NULL;
 
 	/*
 	 * NOTE(review): for a NULL bp, 'type' is left uninitialized and the
 	 * other two stay NULL; presumably SNPRINTF_BLKPTR() does not read
 	 * them in that case -- confirm against the macro definition.
 	 */
 	if (bp != NULL) {
 		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
 			/* New-style type: describe by byteswap class. */
 			dmu_object_byteswap_t bswap =
 			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			(void) snprintf(type, sizeof (type), "bswap %s %s",
 			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
 			    "metadata" : "data",
 			    dmu_ot_byteswap[bswap].ob_name);
 		} else {
 			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
 			    sizeof (type));
 		}
 		/* No checksum name is looked up for embedded block pointers. */
 		if (!BP_IS_EMBEDDED(bp)) {
 			checksum =
 			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
 		}
 		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
 	}
 
 	SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum,
 	    compress);
 }
 
 /*
  * Set the pool's freeze txg if it has not been set yet (it is still
  * UINT64_MAX), choosing TXG_SIZE txgs past the last synced txg, and then
  * wait for that txg to sync.  If a freeze txg was already set, this does
  * nothing beyond taking and dropping the config lock.
  */
 void
 spa_freeze(spa_t *spa)
 {
 	uint64_t freeze_txg = 0;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	if (spa->spa_freeze_txg == UINT64_MAX) {
 		freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
 		spa->spa_freeze_txg = freeze_txg;
 	}
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Zero means this call did not set the freeze txg. */
 	if (freeze_txg == 0)
 		return;
 
 	txg_wait_synced(spa_get_dsl(spa), freeze_txg);
 }
 
 /*
  * Report a printf-style message through vcmn_err().  The severity is
  * CE_WARN when zfs_recover is set and CE_PANIC otherwise.
  */
 void
 zfs_panic_recover(const char *fmt, ...)
 {
 	int level = zfs_recover ? CE_WARN : CE_PANIC;
 	va_list adx;
 
 	va_start(adx, fmt);
 	vcmn_err(level, fmt, adx);
 	va_end(adx);
 }
 
 /*
  * Minimal strtoull() replacement for lowercase hexadecimal input.  Digits
  * are consumed from 'str' until the first character outside [0-9a-f] or
  * the end of the string.  No overflow detection is performed, so the input
  * must fit in 64 bits.  If 'nptr' is non-NULL it is set to the first
  * unconsumed character.
  */
 uint64_t
 zfs_strtonum(const char *str, char **nptr)
 {
 	const char *cur;
 	uint64_t result = 0;
 
 	for (cur = str; *cur != '\0'; cur++) {
 		char ch = *cur;
 		int nibble;
 
 		if (ch >= '0' && ch <= '9')
 			nibble = ch - '0';
 		else if (ch >= 'a' && ch <= 'f')
 			nibble = 10 + ch - 'a';
 		else
 			break;
 
 		result = result * 16 + nibble;
 	}
 
 	if (nptr != NULL)
 		*nptr = (char *)cur;
 
 	return (result);
 }
 
 /*
  * Increment the refcount of the allocation-classes feature in 'tx'.  The
  * feature must already be enabled on the pool.
  */
 void
 spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
 {
 	/*
 	 * We bump the feature refcount for each special vdev added to the pool
 	 */
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
 	spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
 }
 
 /*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
  */
 
 /*
  * Return the pool's spa_async_suspended value.
  */
 boolean_t
 spa_shutting_down(spa_t *spa)
 {
 	boolean_t suspended = spa->spa_async_suspended;
 
 	return (suspended);
 }
 
 /*
  * Return the DSL pool attached to this spa (spa_dsl_pool).
  */
 dsl_pool_t *
 spa_get_dsl(spa_t *spa)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	return (dp);
 }
 
 /*
  * Return the pool's spa_is_initializing flag.
  */
 boolean_t
 spa_is_initializing(spa_t *spa)
 {
 	boolean_t initializing = spa->spa_is_initializing;
 
 	return (initializing);
 }
 
 /*
  * Return the pool's spa_indirect_vdevs_loaded flag.
  */
 boolean_t
 spa_indirect_vdevs_loaded(spa_t *spa)
 {
 	boolean_t loaded = spa->spa_indirect_vdevs_loaded;
 
 	return (loaded);
 }
 
 /*
  * Return a pointer to the root block pointer stored in spa_ubsync.  Note
  * that spa_set_rootblkptr() writes the one in spa_uberblock instead.
  */
 blkptr_t *
 spa_get_rootblkptr(spa_t *spa)
 {
 	blkptr_t *rootbp = &spa->spa_ubsync.ub_rootbp;
 
 	return (rootbp);
 }
 
 /*
  * Copy '*bp' into the root block pointer of spa_uberblock.  Note that
  * spa_get_rootblkptr() reads the one in spa_ubsync instead.
  */
 void
 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
 {
 	spa->spa_uberblock.ub_rootbp = *bp;
 }
 
 /*
  * Copy the pool's alternate root (spa_root) into 'buf', truncating to
  * 'buflen'.  When no alternate root is set, 'buf' becomes the empty string.
  */
 void
 spa_altroot(spa_t *spa, char *buf, size_t buflen)
 {
 	if (spa->spa_root != NULL) {
 		(void) strlcpy(buf, spa->spa_root, buflen);
 		return;
 	}
 
 	buf[0] = '\0';
 }
 
 /*
  * Return the pool's spa_sync_pass value.
  */
 uint32_t
 spa_sync_pass(spa_t *spa)
 {
 	uint32_t pass = spa->spa_sync_pass;
 
 	return (pass);
 }
 
 /*
  * Return the pool's name.  The pointer refers to storage inside the
  * spa_t; nothing is copied.
  */
 char *
 spa_name(spa_t *spa)
 {
 	char *name = spa->spa_name;
 
 	return (name);
 }
 
 /*
  * Return the pool guid.  In syncing context this is the root vdev's
  * current guid; otherwise it is the most recently synced guid, falling
  * back to the root vdev's guid when no synced guid has been recorded.
  */
 uint64_t
 spa_guid(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	/*
 	 * A failure to parse the config during spa_load() can take us down
 	 * the error path (which posts an ereport) and land here before a
 	 * root vdev exists.  The original pool guid is stashed in
 	 * 'spa_config_guid' for exactly that case.
 	 */
 	if (spa->spa_root_vdev == NULL)
 		return (spa->spa_config_guid);
 
 	/* Syncing context always sees the root vdev's guid. */
 	if (dp != NULL && dsl_pool_sync_context(dp))
 		return (spa->spa_root_vdev->vdev_guid);
 
 	/* Everyone else sees the last synced guid, when there is one. */
 	if (spa->spa_last_synced_guid != 0)
 		return (spa->spa_last_synced_guid);
 
 	return (spa->spa_root_vdev->vdev_guid);
 }
 
 /*
  * Return the pool's load guid.  This guid serves only as a reference for
  * the ARC: it is generated at load time and never written to persistent
  * storage.
  */
 uint64_t
 spa_load_guid(spa_t *spa)
 {
 	uint64_t load_guid = spa->spa_load_guid;
 
 	return (load_guid);
 }
 
 /*
  * Return the txg recorded in spa_ubsync (ub_txg).
  */
 uint64_t
 spa_last_synced_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_ubsync.ub_txg;
 
 	return (txg);
 }
 
 /*
  * Return the pool's spa_first_txg value.
  */
 uint64_t
 spa_first_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_first_txg;
 
 	return (txg);
 }
 
 /*
  * Return the pool's spa_syncing_txg value.
  */
 uint64_t
 spa_syncing_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_syncing_txg;
 
 	return (txg);
 }
 
 /*
  * Return the last txg in which data may be dirtied: spa_final_txg less
  * TXG_DEFER_SIZE.  The txgs after it are used only to clear out any
  * deferred frees that remain.
  */
 uint64_t
 spa_final_dirty_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_final_txg - TXG_DEFER_SIZE;
 
 	return (txg);
 }
 
 /*
  * Return the pool's spa_state value.
  */
 pool_state_t
 spa_state(spa_t *spa)
 {
 	pool_state_t state = spa->spa_state;
 
 	return (state);
 }
 
 /*
  * Return the pool's spa_load_state value.
  */
 spa_load_state_t
 spa_load_state(spa_t *spa)
 {
 	spa_load_state_t state = spa->spa_load_state;
 
 	return (state);
 }
 
 /*
  * Return the pool's freeze txg.  spa_freeze() treats UINT64_MAX as
  * "not set yet".
  */
 uint64_t
 spa_freeze_txg(spa_t *spa)
 {
 	uint64_t txg = spa->spa_freeze_txg;
 
 	return (txg);
 }
 
 /*
  * Return the inflated asize for a logical write in bytes. This is used by the
  * DMU to calculate the space a logical write will require on disk.
  * If lsize is smaller than the largest physical block size allocatable on this
  * pool we use its value instead, since the write will end up using the whole
  * block anyway.
  *
  * A zero lsize needs no space and returns 0.  Otherwise the result is
  * MAX(lsize, 1 << spa_max_ashift) multiplied by spa_asize_inflation.
  */
 uint64_t
 spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
 {
 	if (lsize == 0)
 		return (0);	/* No inflation needed */
 
 	/*
 	 * Do the shift in 64 bits.  A plain "1 << ashift" is evaluated as
 	 * an int, which is undefined once the shift count reaches the width
 	 * of int and overflows a signed int at 31.
 	 */
 	uint64_t max_block = 1ULL << spa->spa_max_ashift;
 
 	return (MAX(lsize, max_block) * spa_asize_inflation);
 }
 
 /*
  * Return the amount of slop space in bytes.  It is typically 1/32 of the pool
  * (3.2%), minus the embedded log space.  On very small pools, it may be
  * slightly larger than this.  On very large pools, it will be capped to
  * the value of spa_max_slop.  The embedded log space is not included in
  * spa_dspace.  By subtracting it, the usable space (per "zfs list") is a
  * constant 97% of the total space, regardless of metaslab size (assuming the
  * default spa_slop_shift=5 and a non-tiny pool).
  *
  * See the comment above spa_slop_shift for more details.
  */
 uint64_t
 spa_get_slop_space(spa_t *spa)
 {
 	uint64_t space;
 	uint64_t slop;
 	uint64_t embedded_log;
 	uint64_t min_slop;
 
 	/*
 	 * spa_dedup_dspace still holding ~0ULL means the dspace values have
 	 * not been computed yet, so compute them now.
 	 */
 	if (spa->spa_dedup_dspace == ~0ULL)
 		spa_update_dspace(spa);
 
 	space = spa->spa_rdspace;
 	slop = MIN(space >> spa_slop_shift, spa_max_slop);
 
 	/*
 	 * Take the embedded log space out of the slop, but never more than
 	 * half of the (3.2%) unusable space.  The "half" cap only matters
 	 * when zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not
 	 * the case by default.
 	 */
 	embedded_log =
 	    metaslab_class_get_dspace(spa_embedded_log_class(spa));
 	slop -= MIN(embedded_log, slop >> 1);
 
 	/*
 	 * The slop must be at least spa_min_slop, except that this floor
 	 * itself never exceeds half the entire pool.
 	 */
 	min_slop = MIN(space >> 1, spa_min_slop);
 
 	return (MAX(slop, min_slop));
 }
 
 /*
  * Return the pool's spa_dspace value, as last computed by
  * spa_update_dspace().
  */
 uint64_t
 spa_get_dspace(spa_t *spa)
 {
 	uint64_t dspace = spa->spa_dspace;
 
 	return (dspace);
 }
 
 /*
  * Return the dspace recorded in the pool's checkpoint info (sci_dspace).
  */
 uint64_t
 spa_get_checkpoint_space(spa_t *spa)
 {
 	uint64_t dspace = spa->spa_checkpoint_info.sci_dspace;
 
 	return (dspace);
 }
 
 /*
  * Recompute spa_rdspace and spa_dspace.  spa_rdspace is the normal class's
  * dspace less the space on non-allocating vdevs; spa_dspace adds the dedup
  * and BRT dspace on top of that.
  */
 void
 spa_update_dspace(spa_t *spa)
 {
 	spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa));
 	if (spa->spa_nonallocating_dspace > 0) {
 		/*
 		 * Subtract the space provided by all non-allocating vdevs that
 		 * contribute to dspace.  If a file is overwritten, its old
 		 * blocks are freed and new blocks are allocated.  If there are
 		 * no snapshots of the file, the available space should remain
 		 * the same.  The old blocks could be freed from the
 		 * non-allocating vdev, but the new blocks must be allocated on
 		 * other (allocating) vdevs.  By reserving the entire size of
 		 * the non-allocating vdevs (including allocated space), we
 		 * ensure that there will be enough space on the allocating
 		 * vdevs for this file overwrite to succeed.
 		 *
 		 * Note that the DMU/DSL doesn't actually know or care
 		 * how much space is allocated (it does its own tracking
 		 * of how much space has been logically used).  So it
 		 * doesn't matter that the data we are moving may be
 		 * allocated twice (on the old device and the new device).
 		 */
 		ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace);
 		spa->spa_rdspace -= spa->spa_nonallocating_dspace;
 	}
 	spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) +
 	    brt_get_dspace(spa);
 }
 
 /*
  * Return the failure mode that has been set for this pool.  The default
  * behavior is to block all I/Os when a complete failure occurs.
  */
 uint64_t
 spa_get_failmode(spa_t *spa)
 {
 	uint64_t failmode = spa->spa_failmode;
 
 	return (failmode);
 }
 
 /*
  * Report whether the pool is suspended, i.e. spa_suspended is anything
  * other than ZIO_SUSPEND_NONE.
  */
 boolean_t
 spa_suspended(spa_t *spa)
 {
 	if (spa->spa_suspended == ZIO_SUSPEND_NONE)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Return the version recorded in spa_ubsync (ub_version).
  */
 uint64_t
 spa_version(spa_t *spa)
 {
 	uint64_t version = spa->spa_ubsync.ub_version;
 
 	return (version);
 }
 
 /*
  * Return the pool's spa_deflate flag.
  */
 boolean_t
 spa_deflate(spa_t *spa)
 {
 	boolean_t deflate = spa->spa_deflate;
 
 	return (deflate);
 }
 
 /*
  * Return the pool's normal metaslab class.
  */
 metaslab_class_t *
 spa_normal_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_normal_class;
 
 	return (mc);
 }
 
 /*
  * Return the pool's log metaslab class.
  */
 metaslab_class_t *
 spa_log_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_log_class;
 
 	return (mc);
 }
 
 /*
  * Return the pool's embedded-log metaslab class.
  */
 metaslab_class_t *
 spa_embedded_log_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_embedded_log_class;
 
 	return (mc);
 }
 
 /*
  * Return the pool's special metaslab class.
  */
 metaslab_class_t *
 spa_special_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_special_class;
 
 	return (mc);
 }
 
 /*
  * Return the pool's dedup metaslab class.
  */
 metaslab_class_t *
 spa_dedup_class(spa_t *spa)
 {
 	metaslab_class_t *mc = spa->spa_dedup_class;
 
 	return (mc);
 }
 
 /*
  * Report whether DDT data may be placed in the special class: the
  * zfs_ddt_data_is_special tunable must be set and the special class must
  * have at least one metaslab group.
  */
 boolean_t
 spa_special_has_ddt(spa_t *spa)
 {
 	if (!zfs_ddt_data_is_special)
 		return (B_FALSE);
 
 	return (spa->spa_special_class->mc_groups != 0);
 }
 
 /*
  * Choose the metaslab class that the given write zio should allocate from,
  * based on its object type, block level and size, and on which optional
  * classes (dedup, special) have metaslab groups in this pool.
  */
 metaslab_class_t *
 spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
 	const zio_prop_t *zp = &zio->io_prop;
 
 	/*
 	 * Override object type for the purposes of selecting a storage class.
 	 * Primarily for DMU_OTN_ types where we can't explicitly control their
 	 * storage class; instead, choose a static type that most closely
 	 * matches what we want.
 	 */
 	dmu_object_type_t objtype = zp->zp_storage_type;
 	if (objtype == DMU_OT_NONE)
 		objtype = zp->zp_type;
 
 	/*
 	 * ZIL allocations determine their class in zio_alloc_zil().
 	 */
 	ASSERT(objtype != DMU_OT_INTENT_LOG);
 
 	boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
 
 	/* DDT blocks: dedup class first, then special if allowed. */
 	if (DMU_OT_IS_DDT(objtype)) {
 		if (spa->spa_dedup_class->mc_groups != 0)
 			return (spa_dedup_class(spa));
 		if (has_special_class && zfs_ddt_data_is_special)
 			return (spa_special_class(spa));
 		return (spa_normal_class(spa));
 	}
 
 	/* Indirect blocks for user data can land in special if allowed */
 	if (zp->zp_level > 0 &&
 	    (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
 		if (has_special_class && zfs_user_indirect_is_special)
 			return (spa_special_class(spa));
 		return (spa_normal_class(spa));
 	}
 
 	/* Remaining metadata and indirect blocks prefer special. */
 	if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) {
 		if (has_special_class)
 			return (spa_special_class(spa));
 		return (spa_normal_class(spa));
 	}
 
 	/*
 	 * Allow small file blocks in special class in some cases (like
 	 * for the dRAID vdev feature). But always leave a reserve of
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	if (DMU_OT_IS_FILE(objtype) &&
 	    has_special_class && zio->io_size <= zp->zp_zpl_smallblk) {
 		metaslab_class_t *special = spa_special_class(spa);
 		uint64_t alloc = metaslab_class_get_alloc(special);
 		uint64_t space = metaslab_class_get_space(special);
 		uint64_t limit =
 		    (space * (100 - zfs_special_class_metadata_reserve_pct))
 		    / 100;
 
 		/* Still below the non-reserved share of the class. */
 		if (alloc < limit)
 			return (special);
 	}
 
 	return (spa_normal_class(spa));
 }
 
 /*
  * Put 'os' at the head of the pool's list of objsets being evicted, under
  * spa_evicting_os_lock.
  */
 void
 spa_evicting_os_register(spa_t *spa, objset_t *os)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	list_insert_head(&spa->spa_evicting_os_list, os);
 	mutex_exit(&spa->spa_evicting_os_lock);
 }
 
 /*
  * Take 'os' off the pool's list of objsets being evicted and wake every
  * thread waiting in spa_evicting_os_wait().
  */
 void
 spa_evicting_os_deregister(spa_t *spa, objset_t *os)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	list_remove(&spa->spa_evicting_os_list, os);
 	/* Waiters re-check the list themselves, so always broadcast. */
 	cv_broadcast(&spa->spa_evicting_os_cv);
 	mutex_exit(&spa->spa_evicting_os_lock);
 }
 
 /*
  * Block until the pool's list of objsets being evicted is empty, then call
  * dmu_buf_user_evict_wait().
  */
 void
 spa_evicting_os_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_evicting_os_lock);
 	for (;;) {
 		if (list_is_empty(&spa->spa_evicting_os_list))
 			break;
 		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
 	}
 	mutex_exit(&spa->spa_evicting_os_lock);
 
 	dmu_buf_user_evict_wait();
 }
 
 /*
  * Return the maximum number of DVAs (copies) a block pointer may carry in
  * this pool.
  */
 int
 spa_max_replication(spa_t *spa)
 {
 	int max_copies = 1;
 
 	/*
 	 * From SPA_VERSION_DITTO_BLOCKS onward, BPs with more than one DVA
 	 * allocated can be handled, subject to the override tunable.
 	 * Older pools are limited to a single copy.
 	 */
 	if (spa_version(spa) >= SPA_VERSION_DITTO_BLOCKS) {
 		max_copies =
 		    MIN(SPA_DVAS_PER_BP, spa_max_replication_override);
 	}
 
 	return (max_copies);
 }
 
 /*
  * Return the pool's spa_prev_software_version value.
  */
 int
 spa_prev_software_version(spa_t *spa)
 {
 	int version = spa->spa_prev_software_version;
 
 	return (version);
 }
 
 /*
  * Return the pool's spa_deadman_synctime value, as set by
  * spa_set_deadman_synctime().
  */
 uint64_t
 spa_deadman_synctime(spa_t *spa)
 {
 	uint64_t synctime = spa->spa_deadman_synctime;
 
 	return (synctime);
 }
 
 /*
  * Return the pool's spa_autotrim setting.
  */
 spa_autotrim_t
 spa_get_autotrim(spa_t *spa)
 {
 	spa_autotrim_t autotrim = spa->spa_autotrim;
 
 	return (autotrim);
 }
 
 /*
  * Return the pool's spa_deadman_ziotime value, as set by
  * spa_set_deadman_ziotime().
  */
 uint64_t
 spa_deadman_ziotime(spa_t *spa)
 {
 	uint64_t ziotime = spa->spa_deadman_ziotime;
 
 	return (ziotime);
 }
 
 /*
  * Return the pool's deadman failure mode, as set by
  * spa_set_deadman_failmode().
  */
 uint64_t
 spa_get_deadman_failmode(spa_t *spa)
 {
 	uint64_t failmode = spa->spa_deadman_failmode;
 
 	return (failmode);
 }
 
 /*
  * Set the pool's deadman failure mode from its string name: "wait",
  * "continue" or "panic".  Any other string is treated as "wait".
  */
 void
 spa_set_deadman_failmode(spa_t *spa, const char *failmode)
 {
 	uint64_t mode;
 
 	if (strcmp(failmode, "wait") == 0)
 		mode = ZIO_FAILURE_MODE_WAIT;
 	else if (strcmp(failmode, "continue") == 0)
 		mode = ZIO_FAILURE_MODE_CONTINUE;
 	else if (strcmp(failmode, "panic") == 0)
 		mode = ZIO_FAILURE_MODE_PANIC;
 	else
 		mode = ZIO_FAILURE_MODE_WAIT;
 
 	spa->spa_deadman_failmode = mode;
 }
 
 /*
  * Apply 'ns' as the deadman zio time of every pool in the namespace.
  * Nothing is done while spa_mode_global is still SPA_MODE_UNINIT.
  */
 void
 spa_set_deadman_ziotime(hrtime_t ns)
 {
 	spa_t *spa;
 
 	if (spa_mode_global == SPA_MODE_UNINIT)
 		return;
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		spa->spa_deadman_ziotime = ns;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Apply 'ns' as the deadman sync time of every pool in the namespace.
  * Nothing is done while spa_mode_global is still SPA_MODE_UNINIT.
  */
 void
 spa_set_deadman_synctime(hrtime_t ns)
 {
 	spa_t *spa;
 
 	if (spa_mode_global == SPA_MODE_UNINIT)
 		return;
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
 		spa->spa_deadman_synctime = ns;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Return the dsize of a single DVA.  This is the DVA's asize, unless the
  * pool has spa_deflate set and the DVA's top-level vdev can be found, in
  * which case the asize is scaled by that vdev's deflate ratio.  The caller
  * must hold at least one spa config lock as reader.
  */
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
 	uint64_t asize = DVA_GET_ASIZE(dva);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	/* Nothing to deflate, or deflation is not in effect. */
 	if (asize == 0 || !spa->spa_deflate)
 		return (asize);
 
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 	if (vd == NULL)
 		return (asize);
 
 	return ((asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
 }
 
 /*
  * Return the sum of dva_get_dsize_sync() over every DVA in 'bp'.  No
  * config lock is taken here; dva_get_dsize_sync() asserts that the caller
  * already holds one.
  */
 uint64_t
 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t total = 0;
 	int i = 0;
 
 	while (i < BP_GET_NDVAS(bp)) {
 		total += dva_get_dsize_sync(spa, &bp->blk_dva[i]);
 		i++;
 	}
 
 	return (total);
 }
 
 /*
  * Same computation as bp_get_dsize_sync(), but takes SCL_VDEV as reader
  * around it so callers do not need to hold a config lock themselves.
  */
 uint64_t
 bp_get_dsize(spa_t *spa, const blkptr_t *bp)
 {
 	uint64_t total;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	total = bp_get_dsize_sync(spa, bp);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (total);
 }
 
 /*
  * Return dp_dirty_total from the pool's dsl_pool.  The value is read
  * without taking any lock here.
  */
 uint64_t
 spa_dirty_data(spa_t *spa)
 {
 	return (spa->spa_dsl_pool->dp_dirty_total);
 }
 
 /*
  * ==========================================================================
  * SPA Import Progress Routines
  * ==========================================================================
  */
 
 /*
  * One row of the import progress list.  Rows are located by pool_guid when
  * the spa_import_progress_set_*() routines update them.
  */
 typedef struct spa_import_progress {
 	uint64_t		pool_guid;	/* unique id for updates */
 	char			*pool_name;	/* spa_strdup()ed name */
 	spa_load_state_t	spa_load_state;	/* last recorded state */
 	char			*spa_load_notes; /* formatted note or NULL */
 	uint64_t		mmp_sec_remaining;	/* MMP activity check */
 	uint64_t		spa_load_max_txg;	/* rewind txg */
 	procfs_list_node_t	smh_node;	/* procfs list linkage */
 } spa_import_progress_t;
 
 spa_history_list_t *spa_import_progress_list = NULL;
 
 /*
  * Print the column headings for the import progress list.  The column
  * widths match the row format used by spa_import_progress_show().
  * Always returns 0.
  */
 static int
 spa_import_progress_show_header(struct seq_file *f)
 {
 	seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid",
 	    "load_state", "multihost_secs", "max_txg",
 	    "pool_name", "notes");
 	return (0);
 }
 
 /*
  * Print one import progress entry ('data' is a spa_import_progress_t) as a
  * single row.  A missing pool name or note is shown as "-".
  * Always returns 0.
  */
 static int
 spa_import_progress_show(struct seq_file *f, void *data)
 {
 	spa_import_progress_t *sip = (spa_import_progress_t *)data;
 
 	seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n",
 	    (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
 	    (u_longlong_t)sip->mmp_sec_remaining,
 	    (u_longlong_t)sip->spa_load_max_txg,
 	    (sip->pool_name ? sip->pool_name : "-"),
 	    (sip->spa_load_notes ? sip->spa_load_notes : "-"));
 
 	return (0);
 }
 
 /*
  * Remove entries from the head of the list (the oldest ones) until no more
  * than 'size' remain, freeing each entry's strings and the entry itself.
  */
 static void
 spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
 {
 	for (; shl->size > size; shl->size--) {
 		spa_import_progress_t *victim =
 		    list_remove_head(&shl->procfs_list.pl_list);
 
 		if (victim->pool_name != NULL)
 			spa_strfree(victim->pool_name);
 		if (victim->spa_load_notes != NULL)
 			kmem_strfree(victim->spa_load_notes);
 		kmem_free(victim, sizeof (spa_import_progress_t));
 	}
 
 	IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list));
 }
 
 /*
  * Allocate the global import progress list and install it as the procfs
  * list "import_progress" in module "zfs" (mode 0644), using the show and
  * show_header callbacks defined above.
  */
 static void
 spa_import_progress_init(void)
 {
 	spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t),
 	    KM_SLEEP);
 
 	spa_import_progress_list->size = 0;
 
 	/* Let procfs callbacks find their way back to the list wrapper. */
 	spa_import_progress_list->procfs_list.pl_private =
 	    spa_import_progress_list;
 
 	procfs_list_install("zfs",
 	    NULL,
 	    "import_progress",
 	    0644,
 	    &spa_import_progress_list->procfs_list,
 	    spa_import_progress_show,
 	    spa_import_progress_show_header,
 	    NULL,
 	    offsetof(spa_import_progress_t, smh_node));
 }
 
 /*
  * Tear down the global import progress list: uninstall the procfs entry,
  * free every remaining entry, destroy the procfs list and free the wrapper.
  */
 static void
 spa_import_progress_destroy(void)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	procfs_list_uninstall(&shl->procfs_list);
 	/* Truncating to zero frees all entries still on the list. */
 	spa_import_progress_truncate(shl, 0);
 	procfs_list_destroy(&shl->procfs_list);
 	kmem_free(shl, sizeof (spa_history_list_t));
 }
 
 /*
  * Record a new load state for the most recently added entry matching
  * 'pool_guid' and discard any note attached to it.
  *
  * Returns 0 on success or when the list is empty, ENOENT when the list is
  * non-empty but holds no entry for 'pool_guid'.
  */
 int
 spa_import_progress_set_state(uint64_t pool_guid,
     spa_load_state_t load_state)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	int error = ENOENT;
 
 	if (shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so the newest matching entry wins. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		sip->spa_load_state = load_state;
 		if (sip->spa_load_notes != NULL) {
 			kmem_strfree(sip->spa_load_notes);
 			sip->spa_load_notes = NULL;
 		}
 		error = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * Format a note and attach it to the most recently added entry for this
  * pool's guid, replacing any earlier note.  When 'log_dbgmsg' is set the
  * note is also emitted through zfs_dbgmsg().  Nothing is done when the list
  * is empty; if no entry matches, the formatted string is freed again.
  */
 static void
 spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg,
     const char *fmt, va_list adx)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	uint64_t pool_guid = spa_guid(spa);
 	char *notes;
 
 	if (shl->size == 0)
 		return;
 
 	notes = kmem_vasprintf(fmt, adx);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so the newest matching entry wins. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		if (sip->spa_load_notes != NULL)
 			kmem_strfree(sip->spa_load_notes);
 		sip->spa_load_notes = notes;
 		if (log_dbgmsg)
 			zfs_dbgmsg("'%s' %s", sip->pool_name, notes);
 		/* The entry owns the string now. */
 		notes = NULL;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	if (notes != NULL)
 		kmem_strfree(notes);
 }
 
 /*
  * printf-style wrapper around spa_import_progress_set_notes_impl() that
  * also logs the note (log_dbgmsg == B_TRUE).
  */
 void
 spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...)
 {
 	va_list adx;
 
 	va_start(adx, fmt);
 	spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx);
 	va_end(adx);
 }
 
 /*
  * printf-style wrapper around spa_import_progress_set_notes_impl() that
  * records the note without logging it (log_dbgmsg == B_FALSE).
  */
 void
 spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...)
 {
 	va_list adx;
 
 	va_start(adx, fmt);
 	spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx);
 	va_end(adx);
 }
 
 /*
  * Record 'load_max_txg' in the most recently added entry matching
  * 'pool_guid'.
  *
  * Returns 0 on success or when the list is empty, ENOENT when the list is
  * non-empty but holds no entry for 'pool_guid'.
  */
 int
 spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	int error = ENOENT;
 
 	/* The size check is made before pl_lock is taken. */
 	if (shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 	/* Walk from the tail so the newest matching entry is updated. */
 	for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
 	    sip = list_prev(&shl->procfs_list.pl_list, sip)) {
 		if (sip->pool_guid == pool_guid) {
 			sip->spa_load_max_txg = load_max_txg;
 			error = 0;
 			break;
 		}
 	}
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * Record the remaining MMP activity-check seconds in the most recently
  * added entry matching 'pool_guid'.
  *
  * Returns 0 on success or when the list is empty, ENOENT when the list is
  * non-empty but holds no entry for 'pool_guid'.
  */
 int
 spa_import_progress_set_mmp_check(uint64_t pool_guid,
     uint64_t mmp_sec_remaining)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	int error = ENOENT;
 
 	if (shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	/* Search from the tail so the newest matching entry wins. */
 	sip = list_tail(&shl->procfs_list.pl_list);
 	while (sip != NULL && sip->pool_guid != pool_guid)
 		sip = list_prev(&shl->procfs_list.pl_list, sip);
 
 	if (sip != NULL) {
 		sip->mmp_sec_remaining = mmp_sec_remaining;
 		error = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * A new import is in progress, add an entry.
  *
  * The entry records the pool guid, a private copy of the pool name and the
  * current load state; it starts out with no notes.
  */
 void
 spa_import_progress_add(spa_t *spa)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 	const char *poolname = NULL;
 
 	sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
 	sip->pool_guid = spa_guid(spa);
 
 	/*
 	 * Prefer the name stored in the pool config; the lookup result is
 	 * ignored and spa_name() is used when no name was found.
 	 */
 	(void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME,
 	    &poolname);
 	if (poolname == NULL)
 		poolname = spa_name(spa);
 	sip->pool_name = spa_strdup(poolname);
 	sip->spa_load_state = spa_load_state(spa);
 	sip->spa_load_notes = NULL;
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 	procfs_list_add(&shl->procfs_list, sip);
 	shl->size++;
 	mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
  * Remove and free the most recently added import progress entry matching
  * 'pool_guid'.  Does nothing when no entry matches.
  */
 void
 spa_import_progress_remove(uint64_t pool_guid)
 {
 	spa_history_list_t *shl = spa_import_progress_list;
 	spa_import_progress_t *sip;
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 	for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
 	    sip = list_prev(&shl->procfs_list.pl_list, sip)) {
 		if (sip->pool_guid == pool_guid) {
 			if (sip->pool_name)
 				spa_strfree(sip->pool_name);
 			/*
 			 * Notes come from kmem_vasprintf(), so release them
 			 * with kmem_strfree() as the other import progress
 			 * routines do, not with spa_strfree().
 			 */
 			if (sip->spa_load_notes)
 				kmem_strfree(sip->spa_load_notes);
 			list_remove(&shl->procfs_list.pl_list, sip);
 			shl->size--;
 			kmem_free(sip, sizeof (spa_import_progress_t));
 			break;
 		}
 	}
 	mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
  * ==========================================================================
  * Initialization and Termination
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_namespace_avl: order two spa_t's by strcmp() of
  * their names, with the result passed through TREE_ISIGN().
  */
 static int
 spa_name_compare(const void *a1, const void *a2)
 {
 	const spa_t *s1 = a1;
 	const spa_t *s2 = a2;
 	int s;
 
 	s = strcmp(s1->spa_name, s2->spa_name);
 
 	return (TREE_ISIGN(s));
 }
 
 /*
  * Boot-time entry point; all it does is call spa_config_load().
  */
 void
 spa_boot_init(void)
 {
 	spa_config_load();
 }
 
 /*
  * Global SPA initialization.  Sets up the namespace, spare and l2cache
  * locks and AVL trees, records 'mode' in spa_mode_global, and then runs the
  * init routines of the subsystems listed below.  spa_fini() is the
  * counterpart.
  */
 void
 spa_init(spa_mode_t mode)
 {
 	mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
 
 	avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
 	    offsetof(spa_t, spa_avl));
 
 	avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
 	avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
 	    offsetof(spa_aux_t, aux_avl));
 
 	spa_mode_global = mode;
 
 #ifndef _KERNEL
 	/*
 	 * Userland only: when not read-only and the "watch" debug string is
 	 * set, install arc_buf_sigsegv() as the SIGSEGV handler and enable
 	 * arc_watch if that succeeds.
 	 */
 	if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) {
 		struct sigaction sa;
 
 		sa.sa_flags = SA_SIGINFO;
 		sigemptyset(&sa.sa_mask);
 		sa.sa_sigaction = arc_buf_sigsegv;
 
 		if (sigaction(SIGSEGV, &sa, NULL) == -1) {
 			perror("could not enable watchpoints: "
 			    "sigaction(SIGSEGV, ...) = ");
 		} else {
 			arc_watch = B_TRUE;
 		}
 	}
 #endif
 
 	/* Subsystem initialization; the call order is kept as-is. */
 	fm_init();
 	zfs_refcount_init();
 	unique_init();
 	zfs_btree_init();
 	metaslab_stat_init();
 	brt_init();
 	ddt_init();
 	zio_init();
 	dmu_init();
 	zil_init();
 	vdev_mirror_stat_init();
 	vdev_raidz_math_init();
 	vdev_file_init();
 	zfs_prop_init();
 	chksum_init();
 	zpool_prop_init();
 	zpool_feature_init();
 	spa_config_load();
 	vdev_prop_init();
 	l2arc_start();
 	scan_init();
 	qat_init();
 	spa_import_progress_init();
 	zap_init();
 }
 
 /*
  * Global SPA teardown, the counterpart of spa_init(): stop the L2ARC,
  * evict all pools, run the subsystem fini routines, then destroy the AVL
  * trees, condition variable and mutexes created by spa_init().
  */
 void
 spa_fini(void)
 {
 	l2arc_stop();
 
 	spa_evict_all();
 
 	/* Subsystem teardown; the call order is kept as-is. */
 	vdev_file_fini();
 	vdev_mirror_stat_fini();
 	vdev_raidz_math_fini();
 	chksum_fini();
 	zil_fini();
 	dmu_fini();
 	zio_fini();
 	ddt_fini();
 	brt_fini();
 	metaslab_stat_fini();
 	zfs_btree_fini();
 	unique_fini();
 	zfs_refcount_fini();
 	fm_fini();
 	scan_fini();
 	qat_fini();
 	spa_import_progress_destroy();
 	zap_fini();
 
 	avl_destroy(&spa_namespace_avl);
 	avl_destroy(&spa_spare_avl);
 	avl_destroy(&spa_l2cache_avl);
 
 	cv_destroy(&spa_namespace_cv);
 	mutex_destroy(&spa_namespace_lock);
 	mutex_destroy(&spa_spare_lock);
 	mutex_destroy(&spa_l2cache_lock);
 }
 
 /*
  * Return whether this pool has a dedicated slog device, judged by whether
  * the log metaslab class has any groups (mc_groups != 0). No locking needed.
  * It's not a problem if the wrong answer is returned as it's only for
  * performance and not correctness.
  */
 boolean_t
 spa_has_slogs(spa_t *spa)
 {
 	return (spa->spa_log_class->mc_groups != 0);
 }
 
 /*
  * Return the pool's current log state (spa_log_state).
  */
 spa_log_state_t
 spa_get_log_state(spa_t *spa)
 {
 	return (spa->spa_log_state);
 }
 
 /*
  * Set the pool's log state (spa_log_state) to 'state'.
  */
 void
 spa_set_log_state(spa_t *spa, spa_log_state_t state)
 {
 	spa->spa_log_state = state;
 }
 
 /*
  * Return the pool's spa_is_root flag.
  */
 boolean_t
 spa_is_root(spa_t *spa)
 {
 	return (spa->spa_is_root);
 }
 
 /*
  * A pool is writeable only when it was opened with SPA_MODE_WRITE and its
  * config is trusted (spa_trust_config is set).
  */
 boolean_t
 spa_writeable(spa_t *spa)
 {
 	return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config);
 }
 
 /*
  * Returns true if there is a pending sync task in any of the current
  * syncing txg, the current quiescing txg, or the current open txg.
  * Both the regular (dp_sync_tasks) and the early (dp_early_sync_tasks)
  * task lists are checked.
  */
 boolean_t
 spa_has_pending_synctask(spa_t *spa)
 {
 	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
 	    !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
 }
 
 /*
  * Return the mode the pool was opened with (spa_mode).
  */
 spa_mode_t
 spa_mode(spa_t *spa)
 {
 	return (spa->spa_mode);
 }
 
 /*
  * Return the pool's spa_scrubbed_last_txg value.
  */
 uint64_t
 spa_get_last_scrubbed_txg(spa_t *spa)
 {
 	return (spa->spa_scrubbed_last_txg);
 }
 
 /*
  * Return the pool's spa_bootfs value.
  */
 uint64_t
 spa_bootfs(spa_t *spa)
 {
 	return (spa->spa_bootfs);
 }
 
 /*
  * Return the pool's spa_delegation value.
  */
 uint64_t
 spa_delegation(spa_t *spa)
 {
 	return (spa->spa_delegation);
 }
 
 /*
  * Return the pool's meta objset pointer (spa_meta_objset).
  */
 objset_t *
 spa_meta_objset(spa_t *spa)
 {
 	return (spa->spa_meta_objset);
 }
 
 /*
  * Return the checksum stored in the pool's spa_dedup_checksum field.
  */
 enum zio_checksum
 spa_dedup_checksum(spa_t *spa)
 {
 	return (spa->spa_dedup_checksum);
 }
 
 /*
  * Reset the in-core (not stored on disk) per-pass scan statistics.  Run at
  * the start of each scan pass (or after reboot).
  */
 void
 spa_scan_stat_init(spa_t *spa)
 {
 	/* The pass starts now. */
 	spa->spa_scan_pass_start = gethrestime_sec();
 
 	/* A scan that is already paused counts as paused since pass start. */
 	spa->spa_scan_pass_scrub_pause =
 	    dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan) ?
 	    spa->spa_scan_pass_start : 0;
 	spa->spa_scan_pass_errorscrub_pause =
 	    dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan) ?
 	    spa->spa_scan_pass_start : 0;
 
 	/* Scrub counters for this pass. */
 	spa->spa_scan_pass_scrub_spent_paused = 0;
 	spa->spa_scan_pass_exam = 0;
 	spa->spa_scan_pass_issued = 0;
 
 	/* Error scrub counters for this pass. */
 	spa->spa_scan_pass_errorscrub_spent_paused = 0;
 }
 
 /*
  * Get scan stats for zpool status reports
  *
  * Fills '*ps' from the pool's dsl_scan state and the in-core per-pass
  * counters.  Returns ENOENT, leaving '*ps' untouched, when the pool has no
  * dsl_pool/scan object or when neither a scan nor an error scrub function
  * is recorded; returns 0 otherwise.
  */
 int
 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 {
 	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
 
 	if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE &&
 	    scn->errorscrub_phys.dep_func == POOL_SCAN_NONE))
 		return (SET_ERROR(ENOENT));
 
 	/* Start from all zeroes so fields not set below read as 0. */
 	memset(ps, 0, sizeof (pool_scan_stat_t));
 
 	/* data stored on disk */
 	ps->pss_func = scn->scn_phys.scn_func;
 	ps->pss_state = scn->scn_phys.scn_state;
 	ps->pss_start_time = scn->scn_phys.scn_start_time;
 	ps->pss_end_time = scn->scn_phys.scn_end_time;
 	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
 	ps->pss_examined = scn->scn_phys.scn_examined;
 	ps->pss_skipped = scn->scn_phys.scn_skipped;
 	ps->pss_processed = scn->scn_phys.scn_processed;
 	ps->pss_errors = scn->scn_phys.scn_errors;
 
 	/* data not stored on disk */
 	ps->pss_pass_exam = spa->spa_scan_pass_exam;
 	ps->pss_pass_start = spa->spa_scan_pass_start;
 	ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
 	ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
 	ps->pss_pass_issued = spa->spa_scan_pass_issued;
 	/* Total issued = issued before this pass + issued during it. */
 	ps->pss_issued =
 	    scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
 	/* error scrub data stored on disk */
 	ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func;
 	ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state;
 	ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time;
 	ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time;
 	ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined;
 	ps->pss_error_scrub_to_be_examined =
 	    scn->errorscrub_phys.dep_to_examine;
 
 	/* error scrub data not stored on disk */
 	ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause;
 
 	return (0);
 }
 
 /*
  * Return the largest block size the pool supports: SPA_MAXBLOCKSIZE when
  * the large_blocks feature is enabled, SPA_OLD_MAXBLOCKSIZE otherwise.
  */
 int
 spa_maxblocksize(spa_t *spa)
 {
 	return (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS) ?
 	    SPA_MAXBLOCKSIZE : SPA_OLD_MAXBLOCKSIZE);
 }
 
 
 /*
  * Returns the txg that the last device removal completed. No indirect mappings
  * have been added since this txg.
  *
  * Returns -1ULL when the chain of previous indirect vdevs contains no vdev
  * whose indirect births object has entries.
  */
 uint64_t
 spa_get_last_removal_txg(spa_t *spa)
 {
 	uint64_t vdevid;
 	uint64_t ret = -1ULL;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	/*
 	 * sr_prev_indirect_vdev is only modified while holding all the
 	 * config locks, so it is sufficient to hold SCL_VDEV as reader when
 	 * examining it.
 	 */
 	vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
 
 	/* Follow the chain of indirect vdevs, most recent removal first. */
 	while (vdevid != -1ULL) {
 		vdev_t *vd = vdev_lookup_top(spa, vdevid);
 		vdev_indirect_births_t *vib = vd->vdev_indirect_births;
 
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		/*
 		 * If the removal did not remap any data, we don't care.
 		 */
 		if (vdev_indirect_births_count(vib) != 0) {
 			ret = vdev_indirect_births_last_entry_txg(vib);
 			break;
 		}
 
 		vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	/* A real txg implies the device_removal feature is active. */
 	IMPLY(ret != -1ULL,
 	    spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
 
 	return (ret);
 }
 
 /*
  * Return the largest dnode size the pool supports: DNODE_MAX_SIZE when the
  * large_dnode feature is enabled, DNODE_MIN_SIZE otherwise.
  */
 int
 spa_maxdnodesize(spa_t *spa)
 {
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
 		return (DNODE_MIN_SIZE);
 
 	return (DNODE_MAX_SIZE);
 }
 
 /*
  * Return B_TRUE when the pool's spa_multihost field is non-zero, B_FALSE
  * otherwise.
  */
 boolean_t
 spa_multihost(spa_t *spa)
 {
 	return (spa->spa_multihost ? B_TRUE : B_FALSE);
 }
 
 /*
  * Return the host id recorded in the pool (spa_hostid).
  */
 uint32_t
 spa_get_hostid(spa_t *spa)
 {
 	return (spa->spa_hostid);
 }
 
 /*
  * Return the pool's spa_trust_config flag.
  */
 boolean_t
 spa_trust_config(spa_t *spa)
 {
 	return (spa->spa_trust_config);
 }
 
 /*
  * Return the pool's spa_missing_tvds_allowed value.
  */
 uint64_t
 spa_missing_tvds_allowed(spa_t *spa)
 {
 	return (spa->spa_missing_tvds_allowed);
 }
 
 /*
  * Return the pool's spa_syncing_log_sm space map pointer.
  */
 space_map_t *
 spa_syncing_log_sm(spa_t *spa)
 {
 	return (spa->spa_syncing_log_sm);
 }
 
 /*
  * Set the pool's spa_missing_tvds field to 'missing'.
  */
 void
 spa_set_missing_tvds(spa_t *spa, uint64_t missing)
 {
 	spa->spa_missing_tvds = missing;
 }
 
 /*
  * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc).
  *
  * The result is a string literal and must not be freed.  "TRANSITIONING"
  * is returned while the pool has no root vdev; "SUSPENDED" takes precedence
  * over the root vdev state; "UNKNOWN" is returned for a state not handled
  * by the switch below.
  */
 const char *
 spa_state_to_name(spa_t *spa)
 {
 	ASSERT3P(spa, !=, NULL);
 
 	/*
 	 * it is possible for the spa to exist, without root vdev
 	 * as the spa transitions during import/export
 	 */
 	vdev_t *rvd = spa->spa_root_vdev;
 	if (rvd == NULL) {
 		return ("TRANSITIONING");
 	}
 	vdev_state_t state = rvd->vdev_state;
 	vdev_aux_t aux = rvd->vdev_stat.vs_aux;
 
 	if (spa_suspended(spa))
 		return ("SUSPENDED");
 
 	switch (state) {
 	case VDEV_STATE_CLOSED:
 	case VDEV_STATE_OFFLINE:
 		return ("OFFLINE");
 	case VDEV_STATE_REMOVED:
 		return ("REMOVED");
 	case VDEV_STATE_CANT_OPEN:
 		/* The aux reason decides how a can't-open root is reported. */
 		if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
 			return ("FAULTED");
 		else if (aux == VDEV_AUX_SPLIT_POOL)
 			return ("SPLIT");
 		else
 			return ("UNAVAIL");
 	case VDEV_STATE_FAULTED:
 		return ("FAULTED");
 	case VDEV_STATE_DEGRADED:
 		return ("DEGRADED");
 	case VDEV_STATE_HEALTHY:
 		return ("ONLINE");
 	default:
 		break;
 	}
 
 	return ("UNKNOWN");
 }
 
 /*
  * Return B_TRUE only if every child of the root vdev passes
  * vdev_is_spacemap_addressable(); stop at the first one that does not.
  */
 boolean_t
 spa_top_vdevs_spacemap_addressable(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t c = 0;
 
 	while (c < rvd->vdev_children) {
 		if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
 			return (B_FALSE);
 		c++;
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * A pool has a checkpoint when its spa_checkpoint_txg is non-zero.
  */
 boolean_t
 spa_has_checkpoint(spa_t *spa)
 {
 	return (spa->spa_checkpoint_txg != 0);
 }
 
 /*
  * Return true when the pool is being imported with ZFS_IMPORT_CHECKPOINT
  * set in its import flags and its mode is exactly SPA_MODE_READ.
  */
 boolean_t
 spa_importing_readonly_checkpoint(spa_t *spa)
 {
 	return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
 	    spa->spa_mode == SPA_MODE_READ);
 }
 
 /*
  * Return the txg following the uberblock's checkpoint txg when a
  * checkpoint is recorded there, otherwise the pool's first txg.
  */
 uint64_t
 spa_min_claim_txg(spa_t *spa)
 {
 	uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
 
 	return (checkpoint_txg != 0 ? checkpoint_txg + 1 : spa->spa_first_txg);
 }
 
 /*
  * While a checkpoint exists, async destroys may consume pool space rather
  * than free it.  To keep the pool from being suspended for lack of space,
  * report that async destroys should stop once a checkpointed pool has no
  * unreserved space left.
  */
 boolean_t
 spa_suspend_async_destroy(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	uint64_t unreserved = dsl_pool_unreserved_space(dp,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED);
 	uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
 
 	/* Nothing is available once usage reaches the unreserved amount. */
 	boolean_t out_of_space = (unreserved <= used);
 
 	return ((spa_has_checkpoint(spa) && out_of_space) ? B_TRUE : B_FALSE);
 }
 
 #if defined(_KERNEL)
 
 int
 param_set_deadman_failmode_common(const char *val)
 {
 	spa_t *spa = NULL;
 	char *p;
 
 	if (val == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((p = strchr(val, '\n')) != NULL)
 		*p = '\0';
 
 	if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
 	    strcmp(val, "panic"))
 		return (SET_ERROR(EINVAL));
 
 	if (spa_mode_global != SPA_MODE_UNINIT) {
 		mutex_enter(&spa_namespace_lock);
 		while ((spa = spa_next(spa)) != NULL)
 			spa_set_deadman_failmode(spa, val);
 		mutex_exit(&spa_namespace_lock);
 	}
 
 	return (0);
 }
 #endif
 
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);
 EXPORT_SYMBOL(spa_add);
 EXPORT_SYMBOL(spa_remove);
 EXPORT_SYMBOL(spa_next);
 
 /* Refcount functions */
 EXPORT_SYMBOL(spa_open_ref);
 EXPORT_SYMBOL(spa_close);
 EXPORT_SYMBOL(spa_refcount_zero);
 
 /* Pool configuration lock */
 EXPORT_SYMBOL(spa_config_tryenter);
 EXPORT_SYMBOL(spa_config_enter);
 EXPORT_SYMBOL(spa_config_exit);
 EXPORT_SYMBOL(spa_config_held);
 
 /* Pool vdev add/remove lock */
 EXPORT_SYMBOL(spa_vdev_enter);
 EXPORT_SYMBOL(spa_vdev_exit);
 
 /* Pool vdev state change lock */
 EXPORT_SYMBOL(spa_vdev_state_enter);
 EXPORT_SYMBOL(spa_vdev_state_exit);
 
 /* Accessor functions */
 EXPORT_SYMBOL(spa_shutting_down);
 EXPORT_SYMBOL(spa_get_dsl);
 EXPORT_SYMBOL(spa_get_rootblkptr);
 EXPORT_SYMBOL(spa_set_rootblkptr);
 EXPORT_SYMBOL(spa_altroot);
 EXPORT_SYMBOL(spa_sync_pass);
 EXPORT_SYMBOL(spa_name);
 EXPORT_SYMBOL(spa_guid);
 EXPORT_SYMBOL(spa_last_synced_txg);
 EXPORT_SYMBOL(spa_first_txg);
 EXPORT_SYMBOL(spa_syncing_txg);
 EXPORT_SYMBOL(spa_version);
 EXPORT_SYMBOL(spa_state);
 EXPORT_SYMBOL(spa_load_state);
 EXPORT_SYMBOL(spa_freeze_txg);
 EXPORT_SYMBOL(spa_get_dspace);
 EXPORT_SYMBOL(spa_update_dspace);
 EXPORT_SYMBOL(spa_deflate);
 EXPORT_SYMBOL(spa_normal_class);
 EXPORT_SYMBOL(spa_log_class);
 EXPORT_SYMBOL(spa_special_class);
 EXPORT_SYMBOL(spa_preferred_class);
 EXPORT_SYMBOL(spa_max_replication);
 EXPORT_SYMBOL(spa_prev_software_version);
 EXPORT_SYMBOL(spa_get_failmode);
 EXPORT_SYMBOL(spa_suspended);
 EXPORT_SYMBOL(spa_bootfs);
 EXPORT_SYMBOL(spa_delegation);
 EXPORT_SYMBOL(spa_meta_objset);
 EXPORT_SYMBOL(spa_maxblocksize);
 EXPORT_SYMBOL(spa_maxdnodesize);
 
 /* Miscellaneous support routines */
 EXPORT_SYMBOL(spa_guid_exists);
 EXPORT_SYMBOL(spa_strdup);
 EXPORT_SYMBOL(spa_strfree);
 EXPORT_SYMBOL(spa_generate_guid);
 EXPORT_SYMBOL(snprintf_blkptr);
 EXPORT_SYMBOL(spa_freeze);
 EXPORT_SYMBOL(spa_upgrade);
 EXPORT_SYMBOL(spa_evict_all);
 EXPORT_SYMBOL(spa_lookup_by_guid);
 EXPORT_SYMBOL(spa_has_spare);
 EXPORT_SYMBOL(dva_get_dsize_sync);
 EXPORT_SYMBOL(bp_get_dsize_sync);
 EXPORT_SYMBOL(bp_get_dsize);
 EXPORT_SYMBOL(spa_has_slogs);
 EXPORT_SYMBOL(spa_is_root);
 EXPORT_SYMBOL(spa_writeable);
 EXPORT_SYMBOL(spa_mode);
 EXPORT_SYMBOL(spa_namespace_lock);
 EXPORT_SYMBOL(spa_trust_config);
 EXPORT_SYMBOL(spa_missing_tvds_allowed);
 EXPORT_SYMBOL(spa_set_missing_tvds);
 EXPORT_SYMBOL(spa_state_to_name);
 EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
 EXPORT_SYMBOL(spa_min_claim_txg);
 EXPORT_SYMBOL(spa_suspend_async_destroy);
 EXPORT_SYMBOL(spa_has_checkpoint);
 EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
 
 ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW,
 	"Set additional debugging flags");
 
 ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
 	"Set to attempt to recover from fatal errors");
 
 ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
 	"Set to ignore IO errors during free and permanently leak the space");
 
 ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW,
 	"Dead I/O check interval in milliseconds");
 
 ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW,
 	"Enable deadman timer");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW,
 	"SPA size estimate multiplication factor");
 
 ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
 	"Place DDT data into the special class");
 
 ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW,
 	"Place user data indirect blocks into the special class");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
 	param_set_deadman_failmode, param_get_charp, ZMOD_RW,
 	"Failmode for deadman timer");
 
 ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms,
 	param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW,
 	"Pool sync expiration time in milliseconds");
 
 ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms,
 	param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW,
 	"IO expiration time in milliseconds");
 
 ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW,
 	"Small file blocks in special vdevs depends on this much "
 	"free space available");
-/* END CSTYLED */
 
 ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
 	param_get_uint, ZMOD_RW, "Reserved free space in pool");
 
 ZFS_MODULE_PARAM(zfs, spa_, num_allocators, INT, ZMOD_RW,
 	"Number of allocators per spa");
 
 ZFS_MODULE_PARAM(zfs, spa_, cpus_per_allocator, INT, ZMOD_RW,
 	"Minimum number of CPUs per allocators");
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 1a930f1e4f1a..250590f062ea 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1,6585 +1,6581 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2021, Klara Inc.
  * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 #include "zfs_prop.h"
 
 /*
  * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
  * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
  * part of the spa_embedded_log_class.  The metaslab with the most free space
  * in each vdev is selected for this purpose when the pool is opened (or a
  * vdev is added).  See vdev_metaslab_init().
  *
  * Log blocks can be allocated from the following locations.  Each one is tried
  * in order until the allocation succeeds:
  * 1. dedicated log vdevs, aka "slog" (spa_log_class)
  * 2. embedded slog metaslabs (spa_embedded_log_class)
  * 3. other metaslabs in normal vdevs (spa_normal_class)
  *
  * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
  * than this number of metaslabs in the vdev.  This ensures that we don't set
  * aside an unreasonable amount of space for the ZIL.  If set to less than
  * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
  * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
  */
 static uint_t zfs_embedded_slog_min_ms = 64;
 
 /* default target for number of metaslabs per top-level vdev */
 static uint_t zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
 static uint_t zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
 static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
 static uint_t zfs_vdev_default_ms_shift = 29;
 
 /* upper limit for metaslab size (16G) */
 static uint_t zfs_vdev_max_ms_shift = 34;
 
 int vdev_validate_skip = B_FALSE;
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
  */
 int zfs_vdev_dtl_sm_blksz = (1 << 12);
 
 /*
  * Rate limit slow IO (delay) events to this many per second.
  */
 static unsigned int zfs_slow_io_events_per_second = 20;
 
 /*
  * Rate limit deadman "hung IO" events to this many per second.
  */
 static unsigned int zfs_deadman_events_per_second = 1;
 
 /*
  * Rate limit direct write IO verify failures to this many per second.
  */
 static unsigned int zfs_dio_write_verify_events_per_second = 20;
 
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
 static unsigned int zfs_checksum_events_per_second = 20;
 
 /*
  * Ignore errors during scrub/resilver.  Allows to work around resilver
  * upon import when there are pool errors.
  */
 static int zfs_scan_ignore_errors = 0;
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
  * the end of each transaction can benefit from a higher I/O bandwidth
  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
  */
 int zfs_vdev_standard_sm_blksz = (1 << 17);
 
 /*
  * Tunable parameter for debugging or performance analysis. Setting this
  * will cause pool corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 int zfs_nocacheflush = 0;
 
 /*
  * Maximum and minimum ashift values that can be automatically set based on
  * vdev's physical ashift (disk's physical sector size).  While ASHIFT_MAX
  * is higher than the maximum value, it is intentionally limited here to not
  * excessively impact pool space efficiency.  Higher ashift values may still
  * be forced by vdev logical ashift or by user via ashift property, but won't
  * be set automatically as a performance optimization.
  */
 uint_t zfs_vdev_max_auto_ashift = 14;
 uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 
 /*
  * VDEV checksum verification for Direct I/O writes. This is necessary for
  * Linux, because anonymous pages cannot be placed under write protection
  * during Direct I/O writes.
  */
 #if !defined(__FreeBSD__)
 uint_t zfs_vdev_direct_write_verify = 1;
 #else
 uint_t zfs_vdev_direct_write_verify = 0;
 #endif
 
 /*
  * Emit a printf-style debug message prefixed with the identity of 'vd':
  * its type and path when a path is set, otherwise its type, id and guid.
  * The formatted message is limited to the 256-byte local buffer.
  */
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 {
 	va_list adx;
 	char buf[256];
 
 	va_start(adx, fmt);
 	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
 	va_end(adx);
 
 	if (vd->vdev_path != NULL) {
 		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
 		    vd->vdev_path, buf);
 	} else {
 		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vd->vdev_guid, buf);
 	}
 }
 
 /*
  * Log one debug line for 'vd', indented by 'indent' columns, then recurse
  * into its children with two more columns of indentation.  Hole and
  * missing vdevs get a short line (id and type only) and are not descended
  * into.
  */
 void
 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 {
 	char state[20];
 
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
 		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
 		    (u_longlong_t)vd->vdev_id,
 		    vd->vdev_ops->vdev_op_type);
 		return;
 	}
 
 	switch (vd->vdev_state) {
 	case VDEV_STATE_UNKNOWN:
 		(void) snprintf(state, sizeof (state), "unknown");
 		break;
 	case VDEV_STATE_CLOSED:
 		(void) snprintf(state, sizeof (state), "closed");
 		break;
 	case VDEV_STATE_OFFLINE:
 		(void) snprintf(state, sizeof (state), "offline");
 		break;
 	case VDEV_STATE_REMOVED:
 		(void) snprintf(state, sizeof (state), "removed");
 		break;
 	case VDEV_STATE_CANT_OPEN:
 		(void) snprintf(state, sizeof (state), "can't open");
 		break;
 	case VDEV_STATE_FAULTED:
 		(void) snprintf(state, sizeof (state), "faulted");
 		break;
 	case VDEV_STATE_DEGRADED:
 		(void) snprintf(state, sizeof (state), "degraded");
 		break;
 	case VDEV_STATE_HEALTHY:
 		(void) snprintf(state, sizeof (state), "healthy");
 		break;
 	default:
 		(void) snprintf(state, sizeof (state), "<state %u>",
 		    (uint_t)vd->vdev_state);
 		break;
 	}
 
 	/*
 	 * Print the id as a full 64-bit unsigned value, matching the
 	 * hole/missing branch above, rather than truncating it to int.
 	 */
 	zfs_dbgmsg("%*svdev %llu: %s%s, guid: %llu, path: %s, %s", indent,
 	    "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type,
 	    vd->vdev_islog ? " (log)" : "",
 	    (u_longlong_t)vd->vdev_guid,
 	    vd->vdev_path ? vd->vdev_path : "N/A", state);
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }
 
 /*
  * Virtual device management.
  */
 
 /*
  * NULL-terminated table of the vdev ops vectors known to this file.
  * vdev_getops() walks it in order, comparing each entry's vdev_op_type
  * string against the requested type.
  */
 static vdev_ops_t *const vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_draid_ops,
 	&vdev_draid_spare_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 	&vdev_disk_ops,
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	&vdev_indirect_ops,
 	NULL	/* sentinel: terminates the scan in vdev_getops() */
 };
 
 /*
  * Look up the ops vector whose vdev_op_type string equals 'type'.
  * Returns NULL when no entry of vdev_ops_table matches.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	for (vdev_ops_t *const *entry = vdev_ops_table; *entry != NULL;
 	    entry++) {
 		vdev_ops_t *candidate = *entry;
 
 		if (strcmp(candidate->vdev_op_type, type) == 0)
 			return (candidate);
 	}
 
 	return (NULL);
 }
 
 /*
  * Pick the metaslab group of 'vd' that serves metaslab class 'mc'.
  *
  * A vdev can carry two groups.  The separate log group (vdev_log_mg) is
  * returned only when 'mc' is the pool's embedded log class and the vdev
  * actually has such a group; in every other case, including dedicated
  * slog devices, the primary group (vdev_mg) is returned.
  */
 metaslab_group_t *
 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
 {
 	metaslab_class_t *embedded_log = spa_embedded_log_class(vd->vdev_spa);
 
 	if (mc != embedded_log || vd->vdev_log_mg == NULL)
 		return (vd->vdev_mg);
 
 	return (vd->vdev_log_mg);
 }
 
 /*
  * Default translation: the physical range is the logical range, unchanged.
  * 'vd' and 'remain_rs' are accepted for interface compatibility and are
  * neither read nor written.
  */
 void
 vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	(void) vd;
 	(void) remain_rs;
 
 	uint64_t first = logical_rs->rs_start;
 	uint64_t last = logical_rs->rs_end;
 
 	physical_rs->rs_start = first;
 	physical_rs->rs_end = last;
 }
 
 /*
  * Map an allocation-bias string onto its vdev_alloc_bias_t value.
  * The string comes from either the per-vdev zap or zpool(8).  Any string
  * that is not one of the three recognized names yields VDEV_BIAS_NONE.
  */
 static vdev_alloc_bias_t
 vdev_derive_alloc_bias(const char *bias)
 {
 	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
 		return (VDEV_BIAS_LOG);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
 		return (VDEV_BIAS_SPECIAL);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
 		return (VDEV_BIAS_DEDUP);
 
 	return (VDEV_BIAS_NONE);
 }
 
 /*
  * Default asize callback, used by everything other than RAID-Z.
  *
  * Starts from psize rounded up to the top-level vdev's ashift and returns
  * the largest of that value and each child's vdev_psize_to_asize_txg()
  * result for the same psize and txg.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		uint64_t child_asize =
 		    vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
 
 		if (child_asize > largest)
 			largest = child_asize;
 	}
 
 	return (largest);
 }
 
 /*
  * Default min_asize callback: report the value cached in the vdev itself.
  */
 uint64_t
 vdev_default_min_asize(vdev_t *vd)
 {
 	uint64_t cached = vd->vdev_min_asize;
 
 	return (cached);
 }
 
 /*
  * Get the minimum allocatable size for a vdev.  Working in whole metaslabs
  * lets devices of different physical size be replaced or attached as long
  * as they can satisfy the same number of allocations.
  *
  *  - No parent (inactive spare or cache): the vdev's own asize.
  *  - Top-level vdev: its asize aligned with P2ALIGN_TYPED to the metaslab
  *    size (1 << vdev_ms_shift).
  *  - Any other vdev: whatever the parent's vdev_op_min_asize callback
  *    reports for the parent.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	if (parent == NULL)
 		return (vd->vdev_asize);
 
 	if (vd != vd->vdev_top)
 		return (parent->vdev_ops->vdev_op_min_asize(parent));
 
 	return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
 	    uint64_t));
 }
 
 /*
  * Recompute and cache vdev_min_asize for 'vd' and then, recursively, for
  * every vdev beneath it.  The parent is updated before its children.
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	int c = 0;
 
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	while (c < vd->vdev_children) {
 		vdev_set_min_asize(vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Get the minimal allocation size for the top-level vdev: the result of
  * the type's vdev_op_min_alloc callback when it has one, otherwise
  * 1 << vdev_ashift.
  */
 uint64_t
 vdev_get_min_alloc(vdev_t *vd)
 {
 	uint64_t ashift_size = 1ULL << vd->vdev_ashift;
 
 	if (vd->vdev_ops->vdev_op_min_alloc == NULL)
 		return (ashift_size);
 
 	return (vd->vdev_ops->vdev_op_min_alloc(vd));
 }
 
 /*
  * Get the parity level for a top-level vdev.  Types without a
  * vdev_op_nparity callback report 0.
  */
 uint64_t
 vdev_get_nparity(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_nparity == NULL)
 		return (0);
 
 	return (vd->vdev_ops->vdev_op_nparity(vd));
 }
 
 /*
  * Read a numeric vdev property from the vdev's ZAP object in the MOS.
  *
  * The ZAP object is chosen in order of preference: vdev_root_zap, then
  * vdev_top_zap, then vdev_leaf_zap.
  *
  * vd     vdev whose property is wanted
  * prop   property to look up
  * value  out: receives the property value
  *
  * Returns 0 with *value set on success.  Returns EINVAL, without writing
  * *value, if the vdev has none of the three ZAP objects.  If zap_lookup()
  * reports ENOENT, *value is set to the property's numeric default and
  * ENOENT is still returned, so callers that accept defaults must treat
  * ENOENT as non-fatal.  Any other zap_lookup() error is returned as-is.
  */
 static int
 vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t objid;
 	int err;
 
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		/*
 		 * This error originates here, so route it through
 		 * SET_ERROR() like the other locally generated errors in
 		 * this file.
 		 */
 		return (SET_ERROR(EINVAL));
 	}
 
 	err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
 	    sizeof (uint64_t), 1, value);
 
 	if (err == ENOENT)
 		*value = vdev_prop_default_numeric(prop);
 
 	return (err);
 }
 
 /*
  * Get the number of data disks for a top-level vdev.  Types without a
  * vdev_op_ndisks callback report 1.
  */
 uint64_t
 vdev_get_ndisks(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_ndisks == NULL)
 		return (1);
 
 	return (vd->vdev_ops->vdev_op_ndisks(vd));
 }
 
 /*
  * Return the top-level vdev at index 'vdev' under the pool's root vdev,
  * or NULL when the index is not below the root's child count.  The caller
  * is expected to hold at least one config lock as reader.
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= root->vdev_children)
 		return (NULL);
 
 	ASSERT(root->vdev_child[vdev] != NULL);
 	return (root->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the subtree rooted at 'vd' for a vdev whose guid
  * equals 'guid'.  'vd' itself is checked first, then each child subtree in
  * index order.  Returns the first match, or NULL if there is none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[c], guid);
 
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Count the leaf vdevs in the subtree rooted at 'vd'.  A leaf counts as
  * one; an interior vdev counts as the sum over its children.
  */
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		int total = 0;
 
 		for (int c = 0; c < vd->vdev_children; c++)
 			total += vdev_count_leaves_impl(vd->vdev_child[c]);
 
 		return (total);
 	}
 
 	return (1);
 }
 
 /*
  * Count every leaf vdev under the pool's root vdev.  SCL_VDEV is held as
  * reader for the duration of the walk.
  */
 int
 vdev_count_leaves(spa_t *spa)
 {
 	int leaves;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	leaves = vdev_count_leaves_impl(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (leaves);
 }
 
 /*
  * Link 'cvd' into 'pvd's child array at index cvd->vdev_id, growing the
  * array when needed, and add cvd's guid sum to every ancestor.
  *
  * pvd  new parent; may be NULL, in which case cvd->vdev_parent is simply
  *      set to NULL and nothing else happens
  * cvd  child to add; must not currently have a parent
  *
  * The caller must hold SCL_ALL as writer.  Leaf children are additionally
  * inserted at the head of the spa's leaf list and the list generation
  * counter is bumped.
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	/* The target slot must be past the end or currently empty. */
 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 
 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 	newsize = pvd->vdev_children * sizeof (vdev_t *);
 
 	/*
 	 * Zero the new array.  When 'id' lies beyond the old end, the slots
 	 * between the old end and 'id' are not written below, and both
 	 * vdev_remove_child() and vdev_compact_children() test child slots
 	 * against NULL, so those gaps must not contain stale memory.
 	 */
 	newchild = kmem_zalloc(newsize, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		memcpy(newchild, pvd->vdev_child, oldsize);
 		kmem_free(pvd->vdev_child, oldsize);
 	}
 
 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
 
 	/* A child of the root (no top on the parent) is its own top. */
 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
 		cvd->vdev_spa->spa_leaf_list_gen++;
 	}
 }
 
 /*
  * Unlink 'cvd' from 'pvd's child array.  The slot is set to NULL rather
  * than compacted; the array itself is freed only once every slot is empty.
  * Leaf children are taken off the spa's leaf list, and cvd's guid sum is
  * subtracted from every former ancestor.  A NULL 'pvd' is a no-op.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	uint_t slot = cvd->vdev_id;
 	int scan = 0;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(slot < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[slot] == cvd);
 
 	pvd->vdev_child[slot] = NULL;
 	cvd->vdev_parent = NULL;
 
 	/* Find the first slot that is still occupied, if any. */
 	while (scan < pvd->vdev_children && !pvd->vdev_child[scan])
 		scan++;
 
 	/* No child left: release the now-empty array. */
 	if (scan == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, scan * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		spa_t *pool = cvd->vdev_spa;
 
 		list_remove(&pool->spa_leaf_list, cvd);
 		pool->spa_leaf_list_gen++;
 	}
 
 	/*
 	 * Every former ancestor loses the removed child's guid sum.
 	 */
 	for (vdev_t *anc = pvd; anc != NULL; anc = anc->vdev_parent)
 		anc->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Squeeze the NULL slots out of pvd's child array.
  *
  * The surviving children keep their relative order and are renumbered
  * (vdev_id) to their new indices.  When no child survives the array is
  * freed and replaced by NULL.  The caller must hold SCL_ALL as writer.
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	int before = pvd->vdev_children;
 	int live = 0;
 	vdev_t **packed = NULL;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (before == 0)
 		return;
 
 	/* First pass: count the occupied slots. */
 	for (int c = 0; c < before; c++) {
 		if (pvd->vdev_child[c])
 			live++;
 	}
 
 	/* Second pass: copy survivors into a dense array and renumber. */
 	if (live > 0) {
 		int next = 0;
 
 		packed = kmem_zalloc(live * sizeof (vdev_t *), KM_SLEEP);
 
 		for (int c = 0; c < before; c++) {
 			vdev_t *child = pvd->vdev_child[c];
 
 			if (child == NULL)
 				continue;
 
 			packed[next] = child;
 			child->vdev_id = next;
 			next++;
 		}
 	}
 
 	kmem_free(pvd->vdev_child, before * sizeof (vdev_t *));
 	pvd->vdev_child = packed;
 	pvd->vdev_children = live;
 }
 
 /*
  * Allocate and minimally initialize a vdev_t.
  *
  * spa   pool the vdev belongs to
  * id    value stored in vdev_id
  * guid  guid to use; 0 requests a freshly generated one, except for hole
  *       vdevs, which keep guid 0
  * ops   ops vector for the new vdev
  *
  * The structure comes from kmem_zalloc(), so every field not assigned
  * below starts out as zero/NULL.  If the spa has no root vdev yet, the new
  * vdev is installed as spa_root_vdev (and is asserted to use
  * vdev_root_ops).  The returned vdev is in state VDEV_STATE_CLOSED and is
  * not linked to any parent.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 
 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 	vic = &vd->vdev_indirect_config;
 
 	/* The first vdev allocated for a spa becomes its root. */
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_load_guid();
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		if (spa->spa_root_vdev == vd) {
 			/*
 			 * The root vdev's guid will also be the pool guid,
 			 * which must be unique among all pools.
 			 */
 			guid = spa_generate_guid(NULL);
 		} else {
 			/*
 			 * Any other vdev's guid must be unique within the pool.
 			 */
 			guid = spa_generate_guid(spa);
 		}
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	/* With no children yet, the guid sum is just our own guid. */
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 	/* UINT64_MAX marks "no previous indirect vdev". */
 	vic->vic_prev_indirect_vdev = UINT64_MAX;
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 	vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
 	    0, 0);
 
 	/*
 	 * Initialize rate limit structs for events.  We rate limit ZIO delay
 	 * and checksum events so that we don't overwhelm ZED with thousands
 	 * of events when a disk is acting up.
 	 */
 	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
 	    &zfs_dio_write_verify_events_per_second, 1);
 	zfs_ratelimit_init(&vd->vdev_checksum_rl,
 	    &zfs_checksum_events_per_second, 1);
 
 	/*
 	 * Default Thresholds for tuning ZED
 	 */
 	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
 	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
 	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
 	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
 	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
 	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
 
 	/* List linkage: the vdev starts out on none of these lists. */
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
 	list_link_init(&vd->vdev_trim_node);
 
 	/*
 	 * Locks and condition variables.  Everything initialized here is
 	 * torn down again in vdev_free().
 	 */
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One empty range tree per DTL type. */
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
 		    0);
 	}
 
 	txg_list_create(&vd->vdev_ms_list, spa,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list, spa,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * spa        pool the vdev belongs to
  * vdp        out: receives the new vdev on success
  * nv         config nvlist describing the vdev
  * parent     vdev to attach the new vdev under; may be NULL
  * id         child index to assign to the new vdev
  * alloctype  one of the VDEV_ALLOC_* values
  *
  * Returns 0 on success.  Returns EINVAL when the type string is missing or
  * unknown, when a required id/guid is missing or mismatched, or when a
  * non-root vdev is requested before the root exists; ENOTSUP when the pool
  * version or enabled features do not permit the request; or the error
  * returned by the type's vdev_op_init callback.  All failure returns occur
  * before the vdev_t is allocated, and *vdp is written only on success.
  *
  * The caller must hold SCL_ALL as writer.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	const char *type;
 	uint64_t guid = 0, islog;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	const char *tmp = NULL;
 	int rc;
 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 	/* Top-level means: has a parent, and that parent has no parent. */
 	boolean_t top_level = (parent && !parent->vdev_parent);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		/* The id recorded in the config must match the requested id. */
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		const char *bias;
 
 		/*
 		 * If creating a top-level vdev, check for allocation
 		 * classes input.
 		 */
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
 
 			/* spa_vdev_add() expects feature to be enabled */
 			if (spa->spa_load_state != SPA_LOAD_CREATE &&
 			    !spa_feature_is_enabled(spa,
 			    SPA_FEATURE_ALLOCATION_CLASSES)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
 
 		/* spa_vdev_add() expects feature to be enabled */
 		if (ops == &vdev_draid_ops &&
 		    spa->spa_load_state != SPA_LOAD_CREATE &&
 		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	/*
 	 * Initialize the vdev specific data.  This is done before calling
 	 * vdev_alloc_common() since it may fail and this simplifies the
 	 * error reporting and cleanup code paths.
 	 */
 	void *tsd = NULL;
 	if (ops->vdev_op_init != NULL) {
 		rc = ops->vdev_op_init(spa, nv, &tsd);
 		if (rc != 0) {
 			return (rc);
 		}
 	}
 
 	/* No failure returns past this point. */
 	vd = vdev_alloc_common(spa, id, guid, ops);
 	vd->vdev_tsd = tsd;
 	vd->vdev_islog = islog;
 
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;
 
 	/* The vdev keeps its own copy of each string it takes from nv. */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
 		vd->vdev_path = spa_strdup(tmp);
 
 	/*
 	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
 	 * fault on a vdev and want it to persist across imports (like with
 	 * zpool offline -f).
 	 */
 	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
 	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_faulted = 1;
 		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 	}
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
 		vd->vdev_devid = spa_strdup(tmp);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
 		vd->vdev_physpath = spa_strdup(tmp);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 	    &tmp) == 0)
 		vd->vdev_enc_sysfs_path = spa_strdup(tmp);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
 		vd->vdev_fru = spa_strdup(tmp);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	vic = &vd->vdev_indirect_config;
 
 	/*
 	 * Indirect-vdev configuration.  The asserts check that the fields
 	 * still hold the values vdev_alloc_common() left in them; each
 	 * lookup overwrites its field only if the key is present in nv.
 	 */
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
 	ASSERT0(vic->vic_births_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 	    &vic->vic_births_object);
 	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 	    &vic->vic_prev_indirect_vdev);
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement. Ignore pool ashift for vdev
 	 * attach case.
 	 */
 	if (alloctype != VDEV_ALLOC_ATTACH) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
 		    &vd->vdev_ashift);
 	} else {
 		vd->vdev_attaching = B_TRUE;
 	}
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/* Only the root vdev carries a root ZAP object id. */
 	if (vd->vdev_ops == &vdev_root_ops &&
 	    (alloctype == VDEV_ALLOC_LOAD ||
 	    alloctype == VDEV_ALLOC_SPLIT ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
 		    &vd->vdev_root_zap);
 	}
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 */
 	if (top_level &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
 		    &vd->vdev_noalloc);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    &vd->vdev_top_zap);
 		/* Presence of the key alone marks an expansion in progress. */
 		vd->vdev_rz_expanding = nvlist_exists(nv,
 		    ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	} else {
 		ASSERT0(vd->vdev_top_zap);
 	}
 
 	/* Sanity check only: no state is changed in this block. */
 	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		/* Note: metaslab_group_create() is now deferred */
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv,
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 	} else {
 		ASSERT0(vd->vdev_leaf_zap);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
 		    &vd->vdev_rebuild_txg);
 
 		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
 			vdev_defer_resilver(vd);
 
 		/*
 		 * In general, when importing a pool we want to ignore the
 		 * persistent fault state, as the diagnosis made on another
 		 * system may not be valid in the current context.  The only
 		 * exception is if we forced a vdev to a persistently faulted
 		 * state with 'zpool offline -f'.  The persistent fault will
 		 * remain across imports until cleared.
 		 *
 		 * Local vdevs will remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
 		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				const char *aux;
 
 				/*
 				 * Default to "error threshold exceeded";
 				 * an "external" aux state overrides it,
 				 * and anything else clears the fault flag.
 				 */
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 				else
 					vd->vdev_faulted = 0ULL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Close and destroy a vdev together with all of its descendants.
  *
  * The vdev is closed, its children are freed recursively, it is unlinked
  * from its parent, and the locks, condition variables, range trees, txg
  * lists, rate limiters and strings owned by the vdev_t are released before
  * the structure itself is freed.  If the vdev is the spa's root vdev,
  * spa_root_vdev is reset to NULL.
  *
  * The initialize, trim, autotrim and rebuild threads must already have
  * exited (asserted below).
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 
 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
 	 * queue exists here, that implies the vdev is being removed while
 	 * the scan is still running.
 	 */
 	if (vd->vdev_scan_io_queue != NULL) {
 		mutex_enter(&vd->vdev_scan_io_queue_lock);
 		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
 		vd->vdev_scan_io_queue = NULL;
 		mutex_exit(&vd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 *
 	 * Each recursive call unlinks that child through
 	 * vdev_remove_child(), which NULLs its slot and, once the last child
 	 * is gone, frees the array and resets vdev_children to 0.  That is
 	 * what the vdev_child == NULL assertion after the loop relies on.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
 	/* Let the vdev type release its private state, if it has any. */
 	if (vd->vdev_ops->vdev_op_fini != NULL)
 		vd->vdev_ops->vdev_op_fini(vd);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		metaslab_group_destroy(vd->vdev_log_mg);
 		vd->vdev_log_mg = NULL;
 	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 	ASSERT(!list_link_active(&vd->vdev_leaf_node));
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 
 	/* Free the string copies made in vdev_alloc(). */
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 
 	if (vd->vdev_enc_sysfs_path)
 		spa_strfree(vd->vdev_enc_sysfs_path);
 
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/* Empty and destroy every DTL range tree under the DTL lock. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* The births and mapping objects are either both set or both NULL. */
 	EQUIV(vd->vdev_indirect_births != NULL,
 	    vd->vdev_indirect_mapping != NULL);
 	if (vd->vdev_indirect_births != NULL) {
 		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 		vdev_indirect_births_close(vd->vdev_indirect_births);
 	}
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 	}
 	range_tree_destroy(vd->vdev_obsolete_segments);
 	rw_destroy(&vd->vdev_indirect_rwlock);
 	mutex_destroy(&vd->vdev_obsolete_lock);
 
 	/* Tear down the locks and cvs set up by vdev_alloc_common(). */
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 	mutex_destroy(&vd->vdev_scan_io_queue_lock);
 
 	mutex_destroy(&vd->vdev_initialize_lock);
 	mutex_destroy(&vd->vdev_initialize_io_lock);
 	cv_destroy(&vd->vdev_initialize_io_cv);
 	cv_destroy(&vd->vdev_initialize_cv);
 
 	mutex_destroy(&vd->vdev_trim_lock);
 	mutex_destroy(&vd->vdev_autotrim_lock);
 	mutex_destroy(&vd->vdev_trim_io_lock);
 	cv_destroy(&vd->vdev_trim_cv);
 	cv_destroy(&vd->vdev_autotrim_cv);
 	cv_destroy(&vd->vdev_autotrim_kick_cv);
 	cv_destroy(&vd->vdev_trim_io_cv);
 
 	mutex_destroy(&vd->vdev_rebuild_lock);
 	cv_destroy(&vd->vdev_rebuild_cv);
 
 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_deadman_rl);
 	zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
 
 	/* Do not leave the spa pointing at freed memory. */
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Transfer top-level vdev state from svd to tvd.
  *
  * svd  source vdev, which currently holds the top-level state
  * tvd  target vdev; must already be its own vdev_top
  *
  * Each piece of state is copied to tvd and then cleared on svd, so that
  * afterwards only tvd owns the metaslabs, metaslab groups, space maps,
  * statistics, removal/indirect state, txg list membership and dirty-list
  * membership.
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 	metaslab_t *msp;
 	vdev_t *vd;
 	int t;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	/* Metaslab bookkeeping and the top-level ZAP object. */
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 	tvd->vdev_top_zap = svd->vdev_top_zap;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
 	svd->vdev_top_zap = 0;
 
 	/* If tvd already has a group, it must be the very one svd holds. */
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	if (tvd->vdev_log_mg)
 		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_log_mg = svd->vdev_log_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_log_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	/* Re-point the groups' back references at their new owner. */
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 	if (tvd->vdev_log_mg != NULL)
 		tvd->vdev_log_mg->mg_vd = tvd;
 
 	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
 	svd->vdev_checkpoint_sm = NULL;
 
 	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
 	svd->vdev_alloc_bias = VDEV_BIAS_NONE;
 
 	/* Space accounting moves with the metaslabs. */
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 
 	svd->vdev_stat.vs_alloc = 0;
 	svd->vdev_stat.vs_space = 0;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/*
 	 * State which may be set on a top-level vdev that's in the
 	 * process of being removed.
 	 */
 	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
 	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
 	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
 	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
 	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
 	ASSERT0(tvd->vdev_noalloc);
 	ASSERT0(tvd->vdev_removing);
 	ASSERT0(tvd->vdev_rebuilding);
 	tvd->vdev_noalloc = svd->vdev_noalloc;
 	tvd->vdev_removing = svd->vdev_removing;
 	tvd->vdev_rebuilding = svd->vdev_rebuilding;
 	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
 	tvd->vdev_indirect_config = svd->vdev_indirect_config;
 	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
 	tvd->vdev_indirect_births = svd->vdev_indirect_births;
 	/* Swapped, not copied: each vdev still ends up owning one tree. */
 	range_tree_swap(&svd->vdev_obsolete_segments,
 	    &tvd->vdev_obsolete_segments);
 	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
 	svd->vdev_indirect_config.vic_mapping_object = 0;
 	svd->vdev_indirect_config.vic_births_object = 0;
 	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
 	svd->vdev_indirect_mapping = NULL;
 	svd->vdev_indirect_births = NULL;
 	svd->vdev_obsolete_sm = NULL;
 	svd->vdev_noalloc = 0;
 	svd->vdev_removing = 0;
 	svd->vdev_rebuilding = 0;
 
 	/*
 	 * For every txg slot, move svd's pending metaslabs and DTL vdevs
 	 * over to tvd, and replace svd with tvd on the spa's vdev txg list.
 	 */
 	for (t = 0; t < TXG_SIZE; t++) {
 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	/* Carry config-dirty and state-dirty membership across as well. */
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 
 	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 /*
  * Set vdev_top to 'tvd' on 'vd' and on every vdev beneath it.  A NULL 'vd'
  * is ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		int c;
 
 		vd->vdev_top = tvd;
 
 		for (c = 0; c < vd->vdev_children; c++)
 			vdev_top_update(tvd, vd->vdev_child[c]);
 	}
 }
 
 /*
  * Insert a new interior vdev (mirror/replacing) between 'cvd' and its
  * current parent.  The new vdev takes over cvd's slot in the old parent
  * and cvd becomes its child.  No .vdev_op_init() call is needed because
  * mirror/replacing vdevs carry no private state.
  *
  * Returns the newly created interior vdev.  The caller must hold SCL_ALL
  * as writer.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *pool = cvd->vdev_spa;
 	vdev_t *old_parent = cvd->vdev_parent;
 	vdev_t *interior;
 
 	ASSERT(spa_config_held(pool, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	interior = vdev_alloc_common(pool, cvd->vdev_id, 0, ops);
 
 	/* The new vdev starts out with the child's geometry and state. */
 	interior->vdev_psize = cvd->vdev_psize;
 	interior->vdev_asize = cvd->vdev_asize;
 	interior->vdev_min_asize = cvd->vdev_min_asize;
 	interior->vdev_max_asize = cvd->vdev_max_asize;
 	interior->vdev_ashift = cvd->vdev_ashift;
 	interior->vdev_logical_ashift = cvd->vdev_logical_ashift;
 	interior->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	interior->vdev_crtxg = cvd->vdev_crtxg;
 	interior->vdev_state = cvd->vdev_state;
 
 	/* Splice: old parent -> interior -> cvd. */
 	vdev_remove_child(old_parent, cvd);
 	vdev_add_child(old_parent, interior);
 	cvd->vdev_id = interior->vdev_children;
 	vdev_add_child(interior, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	/* If the new vdev became the top-level, it inherits that state. */
 	if (interior->vdev_top == interior)
 		vdev_top_transfer(cvd, interior);
 
 	return (interior);
 }
 
 /*
  * Remove a 1-way mirror/replacing/spare vdev from the tree, promoting its
  * only child 'cvd' into the slot the interior vdev occupied.  The interior
  * vdev is freed.  The caller must hold SCL_ALL as writer.
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *interior = cvd->vdev_parent;
 	vdev_t *upper = interior->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	ASSERT(interior->vdev_children == 1);
 	ASSERT(interior->vdev_ops == &vdev_mirror_ops ||
 	    interior->vdev_ops == &vdev_replacing_ops ||
 	    interior->vdev_ops == &vdev_spare_ops);
 
 	/* The child inherits the alignment settings of the vdev it replaces. */
 	cvd->vdev_ashift = interior->vdev_ashift;
 	cvd->vdev_logical_ashift = interior->vdev_logical_ashift;
 	cvd->vdev_physical_ashift = interior->vdev_physical_ashift;
 
 	vdev_remove_child(interior, cvd);
 	vdev_remove_child(upper, interior);
 
 	/*
 	 * When cvd takes over as a top-level vdev it must keep the interior
 	 * vdev's guid.  Otherwise, we could have detached an offline device,
 	 * and on a later import the pool would appear to have two top-level
 	 * vdevs instead of two versions of the same one.
 	 */
 	if (interior->vdev_top == interior) {
 		uint64_t delta = interior->vdev_guid - cvd->vdev_guid;
 
 		cvd->vdev_orig_guid = cvd->vdev_guid;
 		cvd->vdev_guid += delta;
 		cvd->vdev_guid_sum += delta;
 
 		/*
 		 * Without autoexpand, also keep the interior vdev's asize so
 		 * that cvd does not grow automatically.  Attaching and
 		 * detaching children of non-uniform sizes could otherwise
 		 * expand the mirror and unexpectedly require larger devices
 		 * to re-establish it.
 		 */
 		if (!cvd->vdev_spa->spa_autoexpand)
 			cvd->vdev_asize = interior->vdev_asize;
 	}
 
 	cvd->vdev_id = interior->vdev_id;
 	vdev_add_child(upper, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (cvd->vdev_top == cvd)
 		vdev_top_transfer(interior, cvd);
 
 	ASSERT(interior->vdev_children == 0);
 	vdev_free(interior);
 }
 
 /*
  * Return the greatest common divisor of a and b, computed with Euclid's
  * algorithm.  Used to maintain spa_gcd_alloc.  If either argument is zero
  * the other argument is returned.
  */
 static uint64_t
 vdev_gcd(uint64_t a, uint64_t b)
 {
 	for (;;) {
 		uint64_t rem;
 
 		if (b == 0)
 			return (a);
 
 		rem = a % b;
 		a = b;
 		b = rem;
 	}
 }
 
 /*
  * Fold min_alloc into the pool-wide allocation size tracking:
  * spa_min_alloc is lowered to min_alloc if min_alloc is smaller, and
  * spa_gcd_alloc becomes the GCD of its previous value and min_alloc.
  * A spa_gcd_alloc of INT_MAX is treated as "not set yet" and is simply
  * replaced by min_alloc.
  */
 static void
 vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
 {
 	if (spa->spa_min_alloc > min_alloc)
 		spa->spa_min_alloc = min_alloc;
 
 	spa->spa_gcd_alloc = (spa->spa_gcd_alloc == INT_MAX) ?
 	    min_alloc : vdev_gcd(min_alloc, spa->spa_gcd_alloc);
 }
 
 /*
  * Create the metaslab group(s) for vd if they do not exist yet.
  *
  * The metaslab class is picked from vd->vdev_alloc_bias (log, special,
  * dedup, otherwise normal).  Non-log vdevs additionally get a group in the
  * embedded log class.  For top-level, non-aux vdevs in the normal class
  * the pool's min/max ashift and allocation size tracking are updated.
  */
 void
 vdev_metaslab_group_create(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	metaslab_class_t *mc;
 
 	/*
 	 * metaslab_group_create was delayed until allocation bias was
 	 * available; if the group already exists there is nothing to do.
 	 */
 	if (vd->vdev_mg != NULL)
 		return;
 
 	/* A log vdev without an explicit bias is given the log bias. */
 	if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = VDEV_BIAS_LOG;
 
 	ASSERT3U(vd->vdev_islog, ==,
 	    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
 
 	if (vd->vdev_alloc_bias == VDEV_BIAS_LOG)
 		mc = spa_log_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
 		mc = spa_special_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		mc = spa_dedup_class(spa);
 	else
 		mc = spa_normal_class(spa);
 
 	vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count);
 
 	if (!vd->vdev_islog) {
 		vd->vdev_log_mg = metaslab_group_create(
 		    spa_embedded_log_class(spa), vd, 1);
 	}
 
 	/*
 	 * The spa ashift min/max only apply for the normal metaslab class.
 	 * Class destination is late binding, so the ashift boundaries could
 	 * not be set any earlier than this.
 	 */
 	if (vd->vdev_top != vd || vd->vdev_ashift == 0 ||
 	    mc != spa_normal_class(spa) || vd->vdev_aux != NULL)
 		return;
 
 	if (vd->vdev_ashift > spa->spa_max_ashift)
 		spa->spa_max_ashift = vd->vdev_ashift;
 	if (vd->vdev_ashift < spa->spa_min_ashift)
 		spa->spa_min_ashift = vd->vdev_ashift;
 
 	vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
 }
 
 /*
  * Create (or grow) the vd->vdev_ms array so that it holds
  * vdev_asize >> vdev_ms_shift metaslabs, initializing every metaslab that
  * did not exist before.
  *
  * txg == 0 selects the path that reads each metaslab's space map object
  * from vd->vdev_ms_array (when that is non-zero) and that takes SCL_ALLOC
  * as writer around group activation.  For a non-zero txg the caller is
  * expected to hold SCL_ALLOC as writer already (asserted below).
  *
  * Returns 0 on success (including the no-op case vdev_ms_shift == 0), or
  * the error returned by dmu_read() / metaslab_init().
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 	/* A non-zero old count means metaslabs already exist: we are growing */
 	boolean_t expanding = (oldc != 0);
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	ASSERT(oldc <= newc);
 
 	mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	/* Carry the existing metaslab pointers over into the larger array. */
 	if (expanding) {
 		memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
 		vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 
 	/*
 	 * Initialize only the new metaslabs [oldc, newc).
 	 * NOTE(review): on the error returns inside this loop vd->vdev_ms and
 	 * vd->vdev_ms_count already describe the new array; no cleanup is
 	 * performed here.
 	 */
 	for (uint64_t m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 		/*
 		 * vdev_ms_array may be 0 if we are creating the "fake"
 		 * metaslabs for an indirect vdev for zdb's leak detection.
 		 * See zdb_leak_init().
 		 */
 		if (txg == 0 && vd->vdev_ms_array != 0) {
 			error = dmu_read(spa->spa_meta_objset,
 			    vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "unable to read the metaslab "
 				    "array [error=%d]", error);
 				return (error);
 			}
 		}
 
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &(vd->vdev_ms[m]));
 		if (error != 0) {
 			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
 			    error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Find the emptiest metaslab on the vdev and mark it for use for
 	 * embedded slog by moving it from the regular to the log metaslab
 	 * group.
 	 */
 	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
 	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
 	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
 		uint64_t slog_msid = 0;
 		uint64_t smallest = UINT64_MAX;
 
 		/*
 		 * Note, we only search the new metaslabs, because the old
 		 * (pre-existing) ones may be active (e.g. have non-empty
 		 * range_tree's), and we don't move them to the new
 		 * metaslab_t.
 		 */
 		for (uint64_t m = oldc; m < newc; m++) {
 			uint64_t alloc =
 			    space_map_allocated(vd->vdev_ms[m]->ms_sm);
 			if (alloc < smallest) {
 				slog_msid = m;
 				smallest = alloc;
 			}
 		}
 		metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
 		/*
 		 * The metaslab was marked as dirty at the end of
 		 * metaslab_init(). Remove it from the dirty list so that we
 		 * can uninitialize and reinitialize it to the new class.
 		 */
 		if (txg != 0) {
 			(void) txg_list_remove_this(&vd->vdev_ms_list,
 			    slog_ms, txg);
 		}
 		/* Keep the space map object so the reinit reuses it. */
 		uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
 		metaslab_fini(slog_ms);
 		VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
 		    &vd->vdev_ms[slog_msid]));
 	}
 
 	/* With txg == 0 SCL_ALLOC is taken here, around the activation. */
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is marked as non-allocating then don't
 	 * activate the metaslabs since we want to ensure that
 	 * no allocations are performed on this device.
 	 */
 	if (vd->vdev_noalloc) {
 		/* track non-allocating vdev space */
 		spa->spa_nonallocating_dspace += spa_deflate(spa) ?
 		    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 	} else if (!expanding) {
 		/* Groups are only activated on the initial (non-grow) call. */
 		metaslab_group_activate(vd->vdev_mg);
 		if (vd->vdev_log_mg != NULL)
 			metaslab_group_activate(vd->vdev_log_mg);
 	}
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	return (0);
 }
 
 /*
  * Release the metaslab state of a vdev: close the checkpoint space map if
  * one is open, passivate the metaslab group(s), destroy every metaslab and
  * free the vdev_ms array.  Each piece of state is reset to NULL/0 after it
  * is released, so calling this function again is a no-op.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	if (vd->vdev_checkpoint_sm != NULL) {
 		ASSERT(spa_feature_is_active(vd->vdev_spa,
 		    SPA_FEATURE_POOL_CHECKPOINT));
 		space_map_close(vd->vdev_checkpoint_sm);
 		/*
 		 * Closing the space map is not enough, the pointer has to
 		 * be cleared too: vdev_metaslab_fini() may be called multiple
 		 * times for certain operations (i.e. when destroying a pool)
 		 * and this clause must never execute twice.  The vdev_ms
 		 * clause below follows the same logic.
 		 */
 		vd->vdev_checkpoint_sm = NULL;
 	}
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *normal_mg = vd->vdev_mg;
 
 		metaslab_group_passivate(normal_mg);
 		if (vd->vdev_log_mg != NULL) {
 			ASSERT(!vd->vdev_islog);
 			metaslab_group_passivate(vd->vdev_log_mg);
 		}
 
 		/* Slots may be NULL; only populated ones are destroyed. */
 		uint64_t nms = vd->vdev_ms_count;
 		for (uint64_t idx = 0; idx < nms; idx++) {
 			metaslab_t *ms = vd->vdev_ms[idx];
 
 			if (ms == NULL)
 				continue;
 			metaslab_fini(ms);
 		}
 		vmem_free(vd->vdev_ms, nms * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 		vd->vdev_ms_count = 0;
 
 		/* With all metaslabs gone the histograms must be empty. */
 		for (int bucket = 0; bucket < RANGE_TREE_HISTOGRAM_SIZE;
 		    bucket++) {
 			ASSERT0(normal_mg->mg_histogram[bucket]);
 			if (vd->vdev_log_mg != NULL)
 				ASSERT0(vd->vdev_log_mg->mg_histogram[bucket]);
 		}
 	}
 	ASSERT0(vd->vdev_ms_count);
 }
 
 /*
  * State shared by all of the zios that make up one vdev probe.  It is
  * allocated (zeroed) in vdev_probe() and freed by vdev_probe_done() when
  * the probe's null zio completes.
  */
 typedef struct vdev_probe_stats {
 	/* set once any probe read completes without error */
 	boolean_t	vps_readable;
 	/* set once any probe write completes without error */
 	boolean_t	vps_writeable;
 	/* B_TRUE if vdev_probe() was called with a non-NULL zio */
 	boolean_t	vps_zio_done_probe;
 	/* zio flags used for the probe's read, write and null zios */
 	int		vps_flags;
 } vdev_probe_stats_t;
 
 /*
  * Completion callback for every zio that belongs to a vdev probe.  The
  * same function is registered for the probe's read zios, the write zios
  * issued from here, and the parent null zio; it dispatches on io_type:
  *
  *  READ  - record readability; on success (and if the pool is writeable)
  *          write the same buffer back to the same offset, otherwise free
  *          the buffer.
  *  WRITE - record writeability and free the buffer.
  *  NULL  - all children are done: update vdev_cant_read/vdev_cant_write,
  *          set the probe's final io_error, detach the probe from the vdev,
  *          propagate ENXIO to waiting parent zios and free the stats.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			/* The abd is handed on to the write and freed there. */
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_abd,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
 			abd_free(zio->io_abd);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		abd_free(zio->io_abd);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 		zio_link_t *zl;
 
 		/* The flags are only ever set here, never cleared. */
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 		vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
 		    vd->vdev_cant_read, vd->vdev_cant_write);
 
 		/* Writeability only matters when the pool itself is writeable. */
 		if (vdev_readable(vd) &&
 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			vdev_dbgmsg(vd, "failed probe");
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, NULL, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 
 			/*
 			 * If this probe was initiated from zio pipeline, then
 			 * change the state in a spa_async_request. Probes that
 			 * were initiated from a vdev_open can change the state
 			 * as part of the open call.
 			 */
 			if (vps->vps_zio_done_probe) {
 				vd->vdev_fault_wanted = B_TRUE;
 				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
 			}
 		}
 
 		/* Detach the finished probe so vdev_probe() can start a new one. */
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		/* Fail every parent zio that can no longer access the vdev. */
 		zl = NULL;
 		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  * vd must be a leaf vdev.  zio, if not NULL, is the zio on whose behalf
  * the probe is issued; it is made a parent of the probe zio.
  *
  * Return value:
  *  - zio == NULL: the probe's null zio, not yet issued; the caller is
  *    expected to issue/wait on it (vdev_open() uses zio_wait()).
  *  - zio != NULL: always NULL.  The probe is either issued here with
  *    zio_nowait(), or zio was attached to a probe already in progress,
  *    or zio is itself a probe zio and nothing is done.
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		/* No probe in flight: this call creates it (vps != NULL). */
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
 		vps->vps_zio_done_probe = (zio != NULL);
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 	}
 
 	/* Attach the caller's zio to the new or already running probe. */
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/* vps == NULL means an existing probe was joined; nothing to issue. */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/* Labels 1 .. VDEV_LABELS-1 are read; label 0 is skipped (see above). */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
 		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * void * adapter around vdev_load(): loads the vdev passed as arg and
  * stores the result in its vdev_load_error field.
  */
 static void
 vdev_load_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_load_error = vdev_load(child);
 }
 
 /*
  * Taskq callback used to open one child vdev.  The opening thread is
  * published in vdev_open_thread for the duration of vdev_open(), and the
  * result is stored in vdev_open_error.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_open_thread = curthread;
 	child->vdev_open_error = vdev_open(child);
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Return B_TRUE if vd, or any vdev beneath it, is backed by a ZFS volume.
  * The zvol check on vd itself is only compiled into the kernel build.
  */
 static boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 	boolean_t found = B_FALSE;
 
 #ifdef _KERNEL
 	if (zvol_is_zvol(vd->vdev_path))
 		return (B_TRUE);
 #endif
 
 	/* Depth-first search of the children, stopping at the first hit. */
 	for (int i = 0; !found && i < vd->vdev_children; i++)
 		found = vdev_uses_zvols(vd->vdev_child[i]);
 
 	return (found);
 }
 
 /*
  * Default child filter used by vdev_open_children(): every child is
  * selected for opening, so this always returns B_TRUE.
  */
 static boolean_t
 vdev_default_open_children_func(vdev_t *vd)
 {
 	/* The decision does not depend on the child. */
 	(void) vd;
 
 	return (B_TRUE);
 }
 
 /*
  * Open the requested child vdevs.  If any of the leaf vdevs are using
  * a ZFS volume then do the opens in a single thread.  This avoids a
  * deadlock when the current thread is holding the spa_namespace_lock.
  *
  * open_func is called for each child and returns B_TRUE for the children
  * that should be opened.  The result of each open is left in the child's
  * vdev_open_error.
  *
  * vd->vdev_nonrot is recomputed as the AND of vdev_nonrot over the
  * selected children.  That aggregation must not happen until every
  * dispatched open has finished: a child's open may recompute the child's
  * own vdev_nonrot (interior children do so through this very function),
  * so reading it right after taskq_dispatch() would race with the open.
  * open_func is therefore evaluated twice per child and is assumed to give
  * the same answer both times.
  */
 static void
 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	int children = vd->vdev_children;
 
 	taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 	vd->vdev_nonrot = B_TRUE;
 
 	for (int c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (open_func(cvd) == B_FALSE)
 			continue;
 
 		if (tq == NULL || vdev_uses_zvols(vd)) {
 			cvd->vdev_open_error = vdev_open(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_open_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
 	/* Wait for all asynchronous opens before looking at the children. */
 	if (tq != NULL)
 		taskq_wait(tq);
 
 	for (int c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (open_func(cvd) == B_FALSE)
 			continue;
 
 		vd->vdev_nonrot &= cvd->vdev_nonrot;
 	}
 
 	if (tq != NULL)
 		taskq_destroy(tq);
 }
 
 /*
  * Open every child of vd, i.e. vdev_open_children_impl() with the
  * default filter that selects all children.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	vdev_open_children_func_t *select_all =
 	    vdev_default_open_children_func;
 
 	vdev_open_children_impl(vd, select_all);
 }
 
 /*
  * Open only those children of vd for which open_func returns B_TRUE.
  * Public entry point for vdev_open_children_impl() with a caller-supplied
  * filter.
  */
 void
 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	vdev_open_children_impl(vd, open_func);
 }
 
 /*
  * Compute the raidz-deflation ratio of a top-level vdev.
  *
  * 128k (1 << 17) is hard-coded because it is the "typical" blocksize.
  * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
  * otherwise existing bp's would be accounted for inconsistently.  txg 0 is
  * hard-coded for the same reason, since expanded RAIDZ vdevs can use a
  * different asize for different birth txg's.
  *
  * Nothing is done for non-top-level vdevs, holes, or a zero ashift.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
 	if (vd != vd->vdev_top || vd->vdev_ishole || vd->vdev_ashift == 0)
 		return;
 
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT);
 }
 
 /*
  * Choose the best of two ashifts, preferring one between logical ashift
  * (absolute minimum) and administrator defined maximum, otherwise take
  * the biggest of the two.
  */
 uint64_t
 vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
 {
 	if (a > logical && a <= zfs_vdev_max_auto_ashift) {
 		if (b <= logical || b > zfs_vdev_max_auto_ashift)
 			return (a);
 		else
 			return (MAX(a, b));
 	} else if (b <= logical || b > zfs_vdev_max_auto_ashift)
 		return (MAX(a, b));
 	return (b);
 }
 
 /*
  * Maximize performance by raising the configured ashift of a top-level
  * vdev towards its physical ashift, while honoring the administrator
  * defined limits and never going below the logical ashift.
  */
 static void
 vdev_ashift_optimize(vdev_t *vd)
 {
 	ASSERT(vd == vd->vdev_top);
 
 	if (vd->vdev_ashift >= vd->vdev_physical_ashift ||
 	    vd->vdev_physical_ashift > zfs_vdev_max_auto_ashift) {
 		/*
 		 * Either the ashift already covers the physical ashift, or
 		 * the physical ashift is beyond the administrator's maximum.
 		 * When logical and physical ashift are equal we only make
 		 * sure the ashift is not below the minimum.  In the unusual
 		 * case of logical ashift > physical ashift the calculated
 		 * ashift can't be capped at the maximum, as that would cause
 		 * failures; raising it to the minimum is still checked.
 		 */
 		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
 		    vd->vdev_ashift);
 		return;
 	}
 
 	/* The physical ashift is larger and within the allowed maximum. */
 	vd->vdev_ashift = MIN(
 	    MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
 	    MAX(zfs_vdev_min_auto_ashift,
 	    vd->vdev_physical_ashift));
 }
 
 /*
  * Prepare a virtual device for access.
  *
  * The vdev must be CLOSED, CANT_OPEN or OFFLINE on entry.  The type
  * specific vdev_op_open() method is called, the resulting sizes and
  * ashifts are validated and recorded, the vdev state is updated, and for
  * leaf vdevs a probe is issued before the vdev is declared open.
  *
  * Returns 0 on success (holes and missing vdevs return 0 right after the
  * state is set).  On failure the vdev state is updated and one of the
  * following is returned:
  *   ENXIO     - vdev is faulted or offline, or reported osize > max_osize
  *   EOVERFLOW - device is smaller than the minimum size
  *   EINVAL    - allocatable size dropped below vdev_min_asize
  *   EDOM      - ashift is out of range or has increased
  *   other     - error from vdev_op_open(), device injection or the probe
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t logical_ashift = 0;
 	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	/* Start from a clean slate for this open attempt. */
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_fault_wanted = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/* The vdev type's open method reports the sizes and ashifts. */
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 	    &logical_ashift, &physical_ashift);
 
 	/* Keep the device in removed state if unplugged */
 	if (error == ENOENT && vd->vdev_removed) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
 		    VDEV_AUX_NONE);
 		return (error);
 	}
 
 	/*
 	 * Physical volume size should never be larger than its max size, unless
 	 * the disk has shrunk while we were reading it or the device is buggy
 	 * or damaged: either way it's not safe for use, bail out of the open.
 	 */
 	if (osize > max_osize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_OPEN_FAILED);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
 			    vd->vdev_stat.vs_aux);
 		} else {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    vd->vdev_stat.vs_aux);
 		}
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	/* One unhealthy child is enough to mark this vdev DEGRADED. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	/* Round both sizes down to a multiple of the label size. */
 	osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
 	max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
 
 	if (vd->vdev_children == 0) {
 		/* No children: the label areas are subtracted from the size. */
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		/* With children: the reported size is used as asize as-is. */
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	/*
 	 * If the vdev was expanded, record this so that we can re-create the
 	 * uberblock rings in labels {2,3}, during the next sync.
 	 */
 	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
 		vd->vdev_copy_uberblocks = B_TRUE;
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk too much.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * We can always set the logical/physical ashift members since
 	 * their values are only used to calculate the vdev_ashift when
 	 * the device is first added to the config. These values should
 	 * not be used for anything else since they may change whenever
 	 * the device is reopened and we don't store them in the label.
 	 */
 	vd->vdev_physical_ashift =
 	    MAX(physical_ashift, vd->vdev_physical_ashift);
 	vd->vdev_logical_ashift = MAX(logical_ashift,
 	    vd->vdev_logical_ashift);
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 * For compatibility, a different ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 
 		/*
 		 * If the vdev_ashift was not overridden at creation time
 		 * (0) or the override value is impossible for the device,
 		 * then set it the logical ashift and optimize the ashift.
 		 */
 		if (vd->vdev_ashift < vd->vdev_logical_ashift) {
 			vd->vdev_ashift = vd->vdev_logical_ashift;
 
 			if (vd->vdev_logical_ashift > ASHIFT_MAX) {
 				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 				    VDEV_AUX_ASHIFT_TOO_BIG);
 				return (SET_ERROR(EDOM));
 			}
 
 			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
 				vdev_ashift_optimize(vd);
 			vd->vdev_attaching = B_FALSE;
 		}
 		/* An ashift of 0 is exempt from the range check. */
 		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
 		    vd->vdev_ashift > ASHIFT_MAX)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_ASHIFT);
 			return (SET_ERROR(EDOM));
 		}
 	} else {
 		/*
 		 * Make sure the alignment required hasn't increased.
 		 */
 		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			(void) zfs_ereport_post(
 			    FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
 			    spa, vd, NULL, NULL, 0);
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (SET_ERROR(EDOM));
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy we update asize if either:
 	 * The asize has increased, due to a device expansion caused by dynamic
 	 * LUN growth or vdev replacement, and automatic expansion is enabled;
 	 * making the additional space available.
 	 *
 	 * The asize has decreased, due to a device shrink usually caused by a
 	 * vdev replace with a smaller device. This ensures that calculations
 	 * based of max_asize and asize e.g. esize are always valid. It's safe
 	 * to do this as we've already validated that asize is greater than
 	 * vdev_min_asize.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    ((asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
 	    (asize < vd->vdev_asize)))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * Track the minimum allocation size.
 	 */
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
 		uint64_t min_alloc = vdev_get_min_alloc(vd);
 		vdev_spa_set_alloc(spa, min_alloc);
 	}
 
 	/*
 	 * If this is a leaf vdev, assess whether a resilver is needed.
 	 * But don't do this if we are doing a reopen for a scrub, since
 	 * this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
 		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
 
 	return (0);
 }
 
 /*
  * Taskq callback used to validate one child vdev.  The validating thread
  * is published in vdev_validate_thread while vdev_validate() runs, and the
  * result is stored in vdev_validate_error.
  */
 static void
 vdev_validate_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_validate_thread = curthread;
 	child->vdev_validate_error = vdev_validate(child);
 	child->vdev_validate_thread = NULL;
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents. This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  *
  * Children are validated first (recursively); if any child reports an
  * error, EBADF is returned.  The label checks below are only performed on
  * readable leaf vdevs.  The whole routine is a no-op returning 0 when
  * vdev_validate_skip is set.
  */
 int
 vdev_validate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	taskq_t *tq = NULL;
 	nvlist_t *label;
 	uint64_t guid = 0, aux_guid = 0, top_guid;
 	uint64_t state;
 	nvlist_t *nvl;
 	uint64_t txg;
 	int children = vd->vdev_children;
 
 	if (vdev_validate_skip)
 		return (0);
 
 	if (children > 0) {
 		tq = taskq_create("vdev_validate", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	/*
 	 * Validate the children through the taskq; a child that uses zvols
 	 * (or any child when there is no taskq) is validated inline instead.
 	 */
 	for (uint64_t c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			vdev_validate_child(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 	/* Any child error is reported to the caller as EBADF. */
 	for (int c = 0; c < children; c++) {
 		int error = vd->vdev_child[c]->vdev_validate_error;
 
 		if (error != 0)
 			return (SET_ERROR(EBADF));
 	}
 
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
 		return (0);
 
 	/*
 	 * If we are performing an extreme rewind, we allow for a label that
 	 * was modified at a point after the current txg.
 	 * If config lock is not held do not check for the txg. spa_sync could
 	 * be updating the vdev's label before updating spa_last_synced_txg.
 	 */
 	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
 	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
 		txg = UINT64_MAX;
 	else
 		txg = spa_last_synced_txg(spa);
 
 	/* From here on every exit path must nvlist_free(label). */
 	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
 		    "txg %llu", (u_longlong_t)txg);
 		return (0);
 	}
 
 	/*
 	 * Determine if this vdev has been split off into another
 	 * pool.  If so, then refuse to open it.
 	 */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_SPLIT_POOL);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
 		return (0);
 	}
 
 	/* 'guid' temporarily holds the pool guid for the check below. */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (0);
 	}
 
 	/*
 	 * If config is not trusted then ignore the spa guid check. This is
 	 * necessary because if the machine crashed during a re-guid the new
 	 * guid might have been written to all of the vdev labels, but not the
 	 * cached config. The check will be performed again once we have the
 	 * trusted config from the MOS.
 	 */
 	if (spa->spa_trust_config && guid != spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
 		    "match config (%llu != %llu)", (u_longlong_t)guid,
 		    (u_longlong_t)spa_guid(spa));
 		return (0);
 	}
 
 	/* aux_guid becomes the label's original guid, or 0 if absent. */
 	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 	    &aux_guid) != 0)
 		aux_guid = 0;
 
 	/* From here on 'guid' is the vdev guid recorded in the label. */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_GUID);
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
 	    != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_TOP_GUID);
 		return (0);
 	}
 
 	/*
 	 * If this vdev just became a top-level vdev because its sibling was
 	 * detached, it will have adopted the parent's vdev guid -- but the
 	 * label may or may not be on disk yet. Fortunately, either version
 	 * of the label will have the same top guid, so if we're a top-level
 	 * vdev, we can safely compare to that instead.
 	 * However, if the config comes from a cachefile that failed to update
 	 * after the detach, a top-level vdev will appear as a non top-level
 	 * vdev in the config. Also relax the constraints if we perform an
 	 * extreme rewind.
 	 *
 	 * If we split this vdev off instead, then we also check the
 	 * original pool's guid. We don't want to consider the vdev
 	 * corrupt if it is partway through a split operation.
 	 */
 	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
 		boolean_t mismatch = B_FALSE;
 		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
 			/* Strict: only a top-level vdev may match top_guid. */
 			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
 				mismatch = B_TRUE;
 		} else {
 			/* Relaxed: untrusted config or extreme rewind. */
 			if (vd->vdev_guid != top_guid &&
 			    vd->vdev_top->vdev_guid != guid)
 				mismatch = B_TRUE;
 		}
 
 		if (mismatch) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			vdev_dbgmsg(vd, "vdev_validate: config guid "
 			    "doesn't match label guid");
 			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
 			    (u_longlong_t)vd->vdev_guid,
 			    (u_longlong_t)vd->vdev_top->vdev_guid);
 			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
 			    "aux_guid %llu", (u_longlong_t)guid,
 			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
 			return (0);
 		}
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_STATE);
 		return (0);
 	}
 
 	/* Everything needed has been copied out of the label. */
 	nvlist_free(label);
 
 	/*
 	 * If this is a verbatim import, no need to check the
 	 * state of the pool.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 	    spa_load_state(spa) == SPA_LOAD_OPEN &&
 	    state != POOL_STATE_ACTIVE) {
 		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
 		    "for spa %s", (u_longlong_t)state, spa->spa_name);
 		return (SET_ERROR(EBADF));
 	}
 
 	/*
 	 * If we were able to open and validate a vdev that was
 	 * previously marked permanently unavailable, clear that state
 	 * now.
 	 */
 	if (vd->vdev_not_present)
 		vd->vdev_not_present = 0;
 
 	return (0);
 }
 
 /*
  * Make the destination string *dvd match the source string svd.  A NULL
  * source leaves the destination alone.  'prefix' names the field being
  * updated and only appears in the "changed" debug message.
  */
 static void
 vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
 {
 	/* No source string: nothing to copy. */
 	if (svd == NULL)
 		return;
 
 	/* Destination not set yet: install a copy of the source. */
 	if (*dvd == NULL) {
 		*dvd = spa_strdup(svd);
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
 		    (u_longlong_t)guid, *dvd);
 		return;
 	}
 
 	/* Both set: replace the destination only when the strings differ. */
 	if (strcmp(svd, *dvd) == 0)
 		return;
 
 	zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
 	    "from '%s' to '%s'", (u_longlong_t)guid, prefix,
 	    *dvd, svd);
 	spa_strfree(*dvd);
 	*dvd = spa_strdup(svd);
 }
 
 /*
  * Copy the device identification strings (vdev_path, vdev_devid,
  * vdev_physpath and vdev_enc_sysfs_path) from svd into dvd.
  */
 static void
 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
 {
 	char *old, *new;
 
 	vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
 	    dvd->vdev_guid);
 
 	vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
 	    dvd->vdev_guid);
 
 	vdev_update_path("vdev_physpath", svd->vdev_physpath,
 	    &dvd->vdev_physpath, dvd->vdev_guid);
 
 	/*
 	 * Our enclosure sysfs path may have changed between imports.
 	 * Unlike the strings above, a NULL source clears the destination.
 	 */
 	old = dvd->vdev_enc_sysfs_path;
 	new = svd->vdev_enc_sysfs_path;
 	if ((old != NULL && new == NULL) ||
 	    (old == NULL && new != NULL) ||
 	    ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
 		/*
 		 * Exactly one of old/new may be NULL here; never hand a NULL
 		 * pointer to a "%s" conversion.
 		 */
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
 		    "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
 		    old != NULL ? old : "(null)",
 		    new != NULL ? new : "(null)");
 
 		if (old != NULL)
 			spa_strfree(old);
 
 		dvd->vdev_enc_sysfs_path =
 		    (new != NULL) ? spa_strdup(new) : NULL;
 	}
 }
 
 /*
  * Walk two vdev trees in lockstep and copy the paths of every leaf from
  * the source tree (svd) into the destination tree (dvd).  Both trees must
  * have the same geometry: a difference in ops, guid or child count at any
  * compared node stops the walk and returns EINVAL.  Intended to copy paths
  * from userland config into MOS config.
  */
 int
 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
 {
 	/* Subtrees that are accepted without being compared. */
 	if (svd->vdev_ops == &vdev_missing_ops)
 		return (0);
 	if (svd->vdev_ishole && dvd->vdev_ishole)
 		return (0);
 	if (dvd->vdev_ops == &vdev_indirect_ops)
 		return (0);
 
 	if (svd->vdev_ops != dvd->vdev_ops) {
 		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
 		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_guid != dvd->vdev_guid) {
 		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
 		    "%llu)", (u_longlong_t)svd->vdev_guid,
 		    (u_longlong_t)dvd->vdev_guid);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_children != dvd->vdev_children) {
 		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
 		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
 		    (u_longlong_t)dvd->vdev_children);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Descend first; the first failing child ends the walk. */
 	for (uint64_t c = 0; c < svd->vdev_children; c++) {
 		int err = vdev_copy_path_strict(svd->vdev_child[c],
 		    dvd->vdev_child[c]);
 		if (err != 0)
 			return (err);
 	}
 
 	/* Only leaves carry paths worth copying. */
 	if (svd->vdev_ops->vdev_op_leaf)
 		vdev_copy_path_impl(svd, dvd);
 
 	return (0);
 }
 
 /*
  * For every concrete leaf at or below dvd, look up a vdev with the same
  * guid under the source top-level vdev stvd and, when one with the same
  * ops is found, copy its paths into the destination leaf.
  */
 static void
 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
 {
 	ASSERT(stvd->vdev_top == stvd);
 	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
 
 	for (uint64_t c = 0; c < dvd->vdev_children; c++)
 		vdev_copy_path_search(stvd, dvd->vdev_child[c]);
 
 	if (dvd->vdev_ops->vdev_op_leaf && vdev_is_concrete(dvd)) {
 		/*
 		 * The idea here is that while a vdev can shift positions
 		 * within a top vdev (when replacing, attaching mirror, etc.)
 		 * it cannot step outside of it.
 		 */
 		vdev_t *match = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
 
 		if (match != NULL && match->vdev_ops == dvd->vdev_ops) {
 			ASSERT(match->vdev_ops->vdev_op_leaf);
 			vdev_copy_path_impl(match, dvd);
 		}
 	}
 }
 
 /*
  * Copy vdev paths from one root vdev (srvd) to another (drvd) when the two
  * trees may differ in geometry.  Top-level children are paired by index,
  * up to the smaller child count, and each destination leaf takes its paths
  * from the source vdev with the same guid inside the paired top-level
  * vdev.  Intended to copy paths from userland config into MOS config.
  */
 void
 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
 {
 	uint64_t npairs;
 
 	ASSERT(srvd->vdev_ops == &vdev_root_ops);
 	ASSERT(drvd->vdev_ops == &vdev_root_ops);
 
 	/* Only top-level slots present in both trees can be paired. */
 	npairs = MIN(srvd->vdev_children, drvd->vdev_children);
 	for (uint64_t c = 0; c < npairs; c++)
 		vdev_copy_path_search(srvd->vdev_child[c], drvd->vdev_child[c]);
 }
 
 /*
  * Close a virtual device.
  *
  * Invokes the vdev's close op, saves the state the vdev had in
  * vdev_prevstate, and leaves the vdev in VDEV_STATE_OFFLINE (when
  * vdev_offline is set) or VDEV_STATE_CLOSED with vs_aux cleared.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	vdev_t *pvd;
 	spa_t *spa __maybe_unused;
 
 	/* Check the pointer before anything below dereferences it. */
 	ASSERT(vd != NULL);
 	pvd = vd->vdev_parent;
 	spa = vd->vdev_spa;
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/*
 	 * If our parent is reopening, then we are as well, unless we are
 	 * going offline.
 	 */
 	if (pvd != NULL && pvd->vdev_reopening)
 		vd->vdev_reopening = !vd->vdev_offline;
 
 	vd->vdev_ops->vdev_op_close(vd);
 
 	/*
 	 * Remember the state the vdev had before it is overwritten below,
 	 * so that if we are doing a reopen(), we don't generate FMA ereports
 	 * if we notice that it's still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	if (vd->vdev_offline)
 		vd->vdev_state = VDEV_STATE_OFFLINE;
 	else
 		vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Recursively invoke the hold op on every leaf at or below vd that
  * provides one.  Does nothing while the pool state is
  * POOL_STATE_UNINITIALIZED.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	if (vd->vdev_spa->spa_state == POOL_STATE_UNINITIALIZED)
 		return;
 
 	/* Children first, then this vdev. */
 	for (int i = 0; i < vd->vdev_children; i++)
 		vdev_hold(vd->vdev_child[i]);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (vd->vdev_ops->vdev_op_hold != NULL)
 		vd->vdev_ops->vdev_op_hold(vd);
 }
 
 /*
  * Recursively invoke the rele op on every leaf at or below vd that
  * provides one.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	/* Children first, then this vdev. */
 	for (int i = 0; i < vd->vdev_children; i++)
 		vdev_rele(vd->vdev_child[i]);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (vd->vdev_ops->vdev_op_rele != NULL)
 		vd->vdev_ops->vdev_op_rele(vd);
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
  * If the leaf has never been opened then open it, as usual.
  *
  * After the close/open cycle the vdev is re-validated, a pending resilver
  * request is dropped when no resilver is needed any more, and the new state
  * is propagated to the parent.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* set the reopening flag unless we're taking the vdev offline */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Validate here to make sure we have the same device.  Otherwise, a
 	 * device with an invalid label could be successfully opened in
 	 * response to vdev_reopen().
 	 */
 	if (vd->vdev_aux == NULL) {
 		(void) vdev_validate(vd);
 	} else {
 		(void) vdev_validate_aux(vd);
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
 		    vd->vdev_aux == &spa->spa_l2cache) {
 			/*
 			 * In case the vdev is present we should evict all ARC
 			 * buffers and pointers to log blocks and reclaim their
 			 * space before restoring its contents to L2ARC.
 			 */
 			if (!l2arc_vdev_present(vd))
 				l2arc_add_vdev(spa, vd);
 			else
 				l2arc_rebuild_vdev(vd, B_TRUE);
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	}
 
 	/*
 	 * Recheck if resilver is still needed and cancel any
 	 * scheduled resilver if resilver is unneeded.
 	 */
 	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
 	    (spa->spa_async_tasks & SPA_ASYNC_RESILVER) != 0) {
 		mutex_enter(&spa->spa_async_lock);
 		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
 		mutex_exit(&spa->spa_async_lock);
 	}
 
 	/* Reassess parent vdev's health. */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open vd for creation, load its DTLs and initialize its labels.  On any
  * failure the vdev is closed again and a non-zero error is returned.
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	/*
 	 * Normally, partial opens (e.g. of a mirror) are allowed.
 	 * For a create, however, we want to fail the request if
 	 * there are any components we can't open.
 	 */
 	int error = vdev_open(vd);
 
 	if (error != 0 || vd->vdev_state != VDEV_STATE_HEALTHY) {
 		vdev_close(vd);
 		if (error != 0)
 			return (error);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/* Recursively load DTLs and initialize all labels. */
 	error = vdev_dtl_load(vd);
 	if (error == 0) {
 		error = vdev_label_init(vd, txg, isreplacing ?
 		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
 	}
 
 	if (error != 0)
 		vdev_close(vd);
 
 	return (error);
 }
 
 /*
  * Choose vd->vdev_ms_shift (log2 of the metaslab size) from vd->vdev_asize.
  *
  * There are two dimensions to the metaslab sizing calculation:
  * the size of the metaslab and the count of metaslabs per vdev.
  *
  * The default values used below are a good balance between memory
  * usage (larger metaslab size means more memory needed for loaded
  * metaslabs; more metaslabs means more memory needed for the
  * metaslab_t structs), metaslab load time (larger metaslabs take
  * longer to load), and metaslab sync time (more metaslabs means
  * more time spent syncing all of them).
  *
  * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
  * The ranges of the dimensions are as follows:
  *
  *	2^29 <= ms_size  <= 2^34
  *	  16 <= ms_count <= 131,072
  *
  * On the lower end of vdev sizes, we aim for metaslab sizes of
  * at least 512MB (2^29) to minimize fragmentation effects when
  * testing with smaller devices.  However, the count constraint
  * of at least 16 metaslabs will override this minimum size goal.
  *
  * On the upper end of vdev sizes, we aim for a maximum metaslab
  * size of 16GB.  However, we will cap the total count to 2^17
  * metaslabs to keep our memory footprint in check and let the
  * metaslab size grow from there if that limit is hit.
  *
  * The net effect of applying the above constraints is summarized below.
  *
  *   vdev size       metaslab count
  *  --------------|-----------------
  *      < 8GB        ~16
  *  8GB   - 100GB   one per 512MB
  *  100GB - 3TB     ~200
  *  3TB   - 2PB     one per 16GB
  *      > 2PB       ~131,072
  *  --------------------------------
  *
  * Finally, note that all of the above calculates the initial
  * number of metaslabs.  Expanding a top-level vdev will result
  * in additional metaslabs being allocated, making it possible
  * to exceed the zfs_vdev_ms_count_limit.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
 	uint64_t default_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t shift = zfs_vdev_default_ms_shift;
 
 	/* Resize when the default shift gives too few or too many. */
 	if (default_count < zfs_vdev_min_ms_count)
 		shift = highbit64(asize / zfs_vdev_min_ms_count);
 	else if (default_count > zfs_vdev_default_ms_count)
 		shift = highbit64(asize / zfs_vdev_default_ms_count);
 
 	/* Clamp the shift to [SPA_MAXBLOCKSHIFT, zfs_vdev_max_ms_shift]. */
 	if (shift < SPA_MAXBLOCKSHIFT) {
 		shift = SPA_MAXBLOCKSHIFT;
 	} else if (shift > zfs_vdev_max_ms_shift) {
 		shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
 		if ((asize >> shift) > zfs_vdev_ms_count_limit)
 			shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = shift;
 	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Add the top-level vdev vd to the pool's spa_vdev_txg_list for txg and,
  * depending on flags, also add arg to vd's metaslab list (VDD_METASLAB)
  * or DTL list (VDD_DTL) for that txg.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vd == vd->vdev_top);
 	/* indirect vdevs don't have metaslabs or dtls */
 	ASSERT(vdev_is_concrete(vd) || flags == 0);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(spa));
 
 	if ((flags & VDD_METASLAB) != 0)
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 
 	if ((flags & VDD_DTL) != 0)
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Call vdev_dirty() on the top-level vdev once for every leaf at or below
  * vd, passing the leaf itself as the argument.
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	for (int i = 0; i < vd->vdev_children; i++)
 		vdev_dirty_leaves(vd->vdev_child[i], flags, txg);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vdev_dirty(vd->vdev_top, flags, vd, txg);
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *rt;
 
 	/* Validate 't' before it is used to index vdev_dtl[]. */
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	rt = vd->vdev_dtl[t];
 
 	/*
 	 * Record (txg, size) in the DTL of type 't', unless the tree
 	 * already contains that range.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (!range_tree_contains(rt, txg, size))
 		range_tree_add(rt, txg, size);
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Return B_TRUE if the DTL of type 't' for vd contains the range
  * (txg, size), B_FALSE otherwise (including when that DTL is empty).
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *rt;
 	boolean_t dirty = B_FALSE;
 
 	/* Validate 't' before it is used to index vdev_dtl[]. */
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	rt = vd->vdev_dtl[t];
 
 	/*
 	 * While we are loading the pool, the DTLs have not been loaded yet.
 	 * This isn't a problem but it can result in devices being tried
 	 * which are known to not have the data.  In which case, the import
 	 * is relying on the checksum to ensure that we get the right data.
 	 * Note that while importing we are only reading the MOS, which is
 	 * always checksummed.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (!range_tree_is_empty(rt))
 		dirty = range_tree_contains(rt, txg, size);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (dirty);
 }
 
 /*
  * Return B_TRUE if the DTL of type 't' for vd holds no ranges.  The check
  * is made under vdev_dtl_lock.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	range_tree_t *dtl = vd->vdev_dtl[t];
 	boolean_t is_empty;
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	is_empty = range_tree_is_empty(dtl);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (is_empty);
 }
 
 /*
  * Check if the txg falls within the range which must be
  * resilvered.  DVAs outside this range can always be skipped.
  * The dva and psize arguments are not used by this default check.
  */
 boolean_t
 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	(void) dva;
 	(void) psize;
 
 	/* TXG_UNKNOWN is set by sequential resilver: always resilver. */
 	if (phys_birth != TXG_UNKNOWN)
 		return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
 
 	return (B_TRUE);
 }
 
 /*
  * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
  * Leaf vdevs, and vdevs whose ops provide no need_resilver callback,
  * always answer B_TRUE; everything else defers to that callback.
  */
 boolean_t
 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	if (vd->vdev_ops->vdev_op_need_resilver != NULL &&
 	    !vd->vdev_ops->vdev_op_leaf) {
 		return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
 		    phys_birth));
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Returns one less than the lowest txg in the leaf's DTL_MISSING map,
  * i.e. a lower bound that lies just outside the map.  The caller must hold
  * vdev_dtl_lock and the map must not be empty.
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	range_tree_t *missing = vd->vdev_dtl[DTL_MISSING];
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(missing), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	return (range_tree_min(missing) - 1);
 }
 
 /*
  * Returns the value range_tree_max() reports for the leaf's DTL_MISSING
  * map.  The caller must hold vdev_dtl_lock and the map must not be empty.
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	range_tree_t *missing = vd->vdev_dtl[DTL_MISSING];
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(missing), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	return (range_tree_max(missing));
 }
 
 /*
  * Determine if a resilvering vdev should remove any DTL entries from
  * its range. If the vdev was resilvering for the entire duration of the
  * scan then it should excise that range from its DTLs. Otherwise, this
  * vdev is considered partially resilvered and should leave its DTL
  * entries intact. The comment in vdev_dtl_reassess() describes how we
  * excise the DTLs.
  *
  * rebuild_done selects whether the rebuild state of the top-level vdev or
  * the pool's scan state is consulted.
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
 {
 	ASSERT0(vd->vdev_children);
 
 	if (vd->vdev_state < VDEV_STATE_DEGRADED ||
 	    vd->vdev_resilver_deferred)
 		return (B_FALSE);
 
 	if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
 		return (B_TRUE);
 
 	if (!rebuild_done) {
 		dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 		dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
 
 		/* Resilver not initiated by attach */
 		if (vd->vdev_resilver_txg == 0)
 			return (B_TRUE);
 
 		/*
 		 * When a resilver is initiated the scan will assign the
 		 * scn_max_txg value to the highest txg value that exists
 		 * in all DTLs. If this device's max DTL is not part of this
 		 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
 		 * then it is not eligible for excision.
 		 */
 		if (vdev_dtl_max(vd) > scn->scn_phys.scn_max_txg)
 			return (B_FALSE);
 
 		ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
 		ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
 		ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
 		return (B_TRUE);
 	}
 
 	vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	/* Rebuild not initiated by attach */
 	if (vd->vdev_rebuild_txg == 0)
 		return (B_TRUE);
 
 	/*
 	 * When a rebuild completes without error then all missing data
 	 * up to the rebuild max txg has been reconstructed and the DTL
 	 * is eligible for excision.
 	 */
 	if (vrp->vrp_rebuild_state != VDEV_REBUILD_COMPLETE ||
 	    vdev_dtl_max(vd) > vrp->vrp_max_txg)
 		return (B_FALSE);
 
 	ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
 	ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
 	ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
 	return (B_TRUE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion. If txg == 0 no
  * write operations will be issued to the pool.
  *
  * The tree below vd is processed bottom-up: children are reassessed first,
  * then vd itself.  The root vdev, non-concrete vdevs and aux vdevs are
  * skipped.
  *
  *	txg		when non-zero, the txg in which changes are dirtied
  *	scrub_txg	when non-zero, a leaf may excise [0, scrub_txg) from
  *			its DTL_MISSING map
  *	scrub_done	when set, each leaf's DTL_SCRUB map is emptied
  *	rebuild_done	selects the rebuild error count rather than the
  *			scan state when deciding whether to excise
  *	faulting	when set, leaves under a replacing vdev are treated
  *			as a complete outage
  */
 static void
 vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess_impl(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done, rebuild_done, faulting);
 
 	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 		boolean_t check_excise = B_FALSE;
 		boolean_t wasempty = B_TRUE;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If requested, pretend the scan or rebuild completed cleanly.
 		 */
 		if (zfs_scan_ignore_errors) {
 			if (scn != NULL)
 				scn->scn_phys.scn_errors = 0;
 			if (vr != NULL)
 				vr->vr_rebuild_phys.vrp_errors = 0;
 		}
 
 		/* Log the DTL state we start from (debug message only). */
 		if (scrub_txg != 0 &&
 		    !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
 			wasempty = B_FALSE;
 			zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
 			    "dtl:%llu/%llu errors:%llu",
 			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
 			    (u_longlong_t)scrub_txg, spa->spa_scrub_started,
 			    (u_longlong_t)vdev_dtl_min(vd),
 			    (u_longlong_t)vdev_dtl_max(vd),
 			    (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
 		}
 
 		/*
 		 * If we've completed a scrub/resilver or a rebuild cleanly
 		 * then determine if this vdev should remove any DTLs. We
 		 * only want to excise regions on vdevs that were available
 		 * during the entire duration of this scan.
 		 */
 		if (rebuild_done &&
 		    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
 			check_excise = B_TRUE;
 		} else {
 			if (spa->spa_scrub_started ||
 			    (scn != NULL && scn->scn_phys.scn_errors == 0)) {
 				check_excise = B_TRUE;
 			}
 		}
 
 		if (scrub_txg && check_excise &&
 		    vdev_dtl_should_excise(vd, rebuild_done)) {
 			/*
 			 * We completed a scrub, resilver or rebuild up to
 			 * scrub_txg.  If we did it without rebooting, then
 			 * the scrub dtl will be valid, so excise the old
 			 * region and fold in the scrub dtl.  Otherwise,
 			 * leave the dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the beginning
 			 * of the DTL_MISSING map, we put it into a reference
 			 * tree and then add a segment with refcnt -1 that
 			 * covers the range [0, scrub_txg).  This means
 			 * that each txg in that range has refcnt -1 or 0.
 			 * We then add DTL_SCRUB with a refcnt of 2, so that
 			 * entries in the range [0, scrub_txg) will have a
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 
 			if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
 				zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
 				    (u_longlong_t)vdev_dtl_min(vd),
 				    (u_longlong_t)vdev_dtl_max(vd));
 			} else if (!wasempty) {
 				zfs_dbgmsg("DTL_MISSING is now empty");
 			}
 		}
 		/* For a leaf, DTL_PARTIAL is rebuilt as a copy of DTL_MISSING. */
 		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
 		/* DTL_OUTAGE is recomputed from scratch below. */
 		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 
 		/*
 		 * For the faulting case, treat members of a replacing vdev
 		 * as if they are not available. It's more likely than not that
 		 * a vdev in a replacing vdev could encounter read errors so
 		 * treat it as not being able to contribute.
 		 */
 		if (!vdev_readable(vd) ||
 		    (faulting && vd->vdev_parent != NULL &&
 		    vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) {
 			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		} else {
 			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 		}
 
 		/*
 		 * If the vdev was resilvering or rebuilding and no longer
 		 * has any DTLs then reset the appropriate flag and dirty
 		 * the top level so that we persist the change.
 		 */
 		if (txg != 0 &&
 		    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
 			if (vd->vdev_rebuild_txg != 0) {
 				vd->vdev_rebuild_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			} else if (vd->vdev_resilver_txg != 0) {
 				vd->vdev_resilver_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			}
 		}
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/* Dirty the leaf's DTL for this txg (done after unlocking). */
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 	} else {
 		/*
 		 * Interior vdev: derive each DTL from the children's maps.
 		 * A range ends up in the parent's map when at least 'minref'
 		 * children contain it.
 		 */
 		mutex_enter(&vd->vdev_dtl_lock);
 		for (int t = 0; t < DTL_TYPES; t++) {
 			/* account for child's outage in parent's missing map */
 			int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 			if (t == DTL_SCRUB) {
 				/* leaf vdevs only */
 				continue;
 			}
 			if (t == DTL_PARTIAL) {
 				/* i.e. non-zero */
 				minref = 1;
 			} else if (vdev_get_nparity(vd) != 0) {
 				/* RAIDZ, DRAID */
 				minref = vdev_get_nparity(vd) + 1;
 			} else {
 				/* any kind of mirror */
 				minref = vd->vdev_children;
 			}
 			space_reftree_create(&reftree);
 			for (int c = 0; c < vd->vdev_children; c++) {
 				vdev_t *cvd = vd->vdev_child[c];
 				/* child lock is taken while holding ours */
 				mutex_enter(&cvd->vdev_dtl_lock);
 				space_reftree_add_map(&reftree,
 				    cvd->vdev_dtl[s], 1);
 				mutex_exit(&cvd->vdev_dtl_lock);
 			}
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[t], minref);
 			space_reftree_destroy(&reftree);
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	/* Notify the raidz code for any vdev under a raidz top-level vdev. */
 	if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
 		raidz_dtl_reassessed(vd);
 	}
 }
 
 /*
  * Public entry point for DTL reassessment: same as
  * vdev_dtl_reassess_impl() with 'faulting' set to B_FALSE.
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done)
 {
 	/* A void function must not return an expression; just call. */
 	vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done,
 	    rebuild_done, B_FALSE);
 }
 
 /*
  * Post a kobj event for vd and for every vdev below it.  A vdev is only
  * posted when its ops provide a kobj_evt_post callback and its
  * vdev_kobj_flag is not yet set; the flag is set before the callback runs.
  *
  * NOTE(review): earlier documentation says spares are excluded.  Nothing
  * in this function filters them, so that presumably follows from the tree
  * handed in by the caller -- confirm.
  */
 void
 vdev_post_kobj_evt(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_kobj_evt_post) {
 		if (vd->vdev_kobj_flag == B_FALSE) {
 			vd->vdev_kobj_flag = B_TRUE;
 			vd->vdev_ops->vdev_op_kobj_evt_post(vd);
 		}
 	}
 
 	for (int i = 0; i < vd->vdev_children; i++)
 		vdev_post_kobj_evt(vd->vdev_child[i]);
 }
 
 /*
  * Reset vdev_kobj_flag on vd and on every vdev below it.
  *
  * NOTE(review): earlier documentation says spares are excluded; this
  * function does not filter them itself -- confirm against the caller.
  */
 void
 vdev_clear_kobj_evt(vdev_t *vd)
 {
 	int i;
 
 	vd->vdev_kobj_flag = B_FALSE;
 
 	for (i = 0; i < vd->vdev_children; i++)
 		vdev_clear_kobj_evt(vd->vdev_child[i]);
 }
 
 /*
  * Load the on-disk DTL of every leaf at or below vd into the leaf's
  * DTL_MISSING map.  Returns 0 on success or the first error encountered.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	/* Not a leaf with a DTL object: recurse, stopping at first error. */
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (int c = 0; c < vd->vdev_children && error == 0; c++)
 			error = vdev_dtl_load(vd->vdev_child[c]);
 		return (error);
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * If the dtl cannot be sync'd there is no need to open it.
 	 */
 	if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
 		return (0);
 
 	error = space_map_open(&vd->vdev_dtl_sm, spa->spa_meta_objset,
 	    vd->vdev_dtl_object, 0, -1ULL, 0);
 	if (error != 0)
 		return (error);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	/* Load into a scratch tree, then merge it under the DTL lock. */
 	range_tree_t *loaded = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	error = space_map_load(vd->vdev_dtl_sm, loaded, SM_ALLOC);
 	if (error == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		range_tree_walk(loaded, range_tree_add,
 		    vd->vdev_dtl[DTL_MISSING]);
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	range_tree_vacate(loaded, NULL, NULL);
 	range_tree_destroy(loaded);
 
 	return (error);
 }
 
 /*
  * Record vd's allocation bias as a string under
  * VDEV_TOP_ZAP_ALLOCATION_BIAS in its top-level ZAP.  For the special and
  * dedup biases, spa_activate_allocation_classes() is called as well.
  */
 static void
 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_alloc_bias_t bias = vd->vdev_alloc_bias;
 	const char *name;
 
 	ASSERT(bias != VDEV_BIAS_NONE);
 
 	if (bias == VDEV_BIAS_LOG)
 		name = VDEV_ALLOC_BIAS_LOG;
 	else if (bias == VDEV_BIAS_SPECIAL)
 		name = VDEV_ALLOC_BIAS_SPECIAL;
 	else if (bias == VDEV_BIAS_DEDUP)
 		name = VDEV_ALLOC_BIAS_DEDUP;
 	else
 		name = NULL;
 
 	ASSERT(name != NULL);
 	VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(name) + 1, name, tx));
 
 	if (bias == VDEV_BIAS_SPECIAL || bias == VDEV_BIAS_DEDUP)
 		spa_activate_allocation_classes(spa, tx);
 }
 
 /*
  * Destroy the ZAP object zapobj and remove it from the pool's
  * spa_all_vdev_zaps object.  Either step failing trips VERIFY0.
  */
 void
 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 
 	VERIFY0(zap_destroy(mos, zapobj, tx));
 	VERIFY0(zap_remove_int(mos, spa->spa_all_vdev_zaps, zapobj, tx));
 }
 
 /*
  * Create a new ZAP object in the MOS, add it to the pool's
  * spa_all_vdev_zaps object, and return its object number.
  */
 uint64_t
 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zapobj;
 
 	zapobj = zap_create(mos, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 	ASSERT(zapobj != 0);
 
 	VERIFY0(zap_add_int(mos, spa->spa_all_vdev_zaps, zapobj, tx));
 
 	return (zapobj);
 }
 
 /*
  * Recursively create the ZAP objects that vd and its descendants are
  * still missing: a leaf ZAP for leaves, a top ZAP for top-level vdevs and,
  * when SPA_FEATURE_AVZ_V2 is enabled, a root ZAP for the root vdev.  Hole
  * and missing vdevs, and anything under a top-level vdev that is being
  * removed, get no leaf or top ZAP.
  */
 void
 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	boolean_t is_root = (vd->vdev_ops == &vdev_root_ops);
 
 	/* vdev_top is only looked at once root/hole/missing are ruled out. */
 	if (!is_root &&
 	    vd->vdev_ops != &vdev_hole_ops &&
 	    vd->vdev_ops != &vdev_missing_ops &&
 	    !vd->vdev_top->vdev_removing) {
 		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0)
 			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
 
 		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
 			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
 			if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
 				vdev_zap_allocation_data(vd, tx);
 		}
 	}
 
 	if (is_root && vd->vdev_root_zap == 0 &&
 	    spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
 		/* Activate the feature on first use. */
 		if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
 			spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
 		vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_construct_zaps(vd->vdev_child[c], tx);
 }
 
 /*
  * Write the leaf vdev's DTL_MISSING map to its on-disk space map in the
  * given txg.
  *
  * If the vdev has been detached, or its top-level vdev is being removed,
  * the space map is freed instead (and in some cases the leaf ZAP is
  * destroyed).  If no space map is open yet, a new one is allocated first;
  * when that changes the space map object number the top-level vdev's
  * config is dirtied.
  */
 static void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	range_tree_t *rtsync;
 	dmu_tx_t *tx;
 	/* Object number on entry; compared again after the write below. */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/*
 		 * We only destroy the leaf ZAP for detached leaves or for
 		 * removed log devices. Removed data devices handle leaf ZAP
 		 * cleanup later, once cancellation is no longer possible.
 		 */
 		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
 		    vd->vdev_top->vdev_islog)) {
 			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
 			vd->vdev_leaf_zap = 0;
 		}
 
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	/* No space map open yet: allocate one and open it. */
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 
 	/*
 	 * Copy DTL_MISSING into a private tree under the lock; the space
 	 * map is then written from that copy after the lock is dropped.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* Replace the on-disk contents with the copied ranges. */
 	space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(rtsync, NULL, NULL);
 
 	range_tree_destroy(rtsync);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
 		    (u_longlong_t)object,
 		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Report whether the specified vdev is required, i.e. whether it can NOT be
  * - offlined
  * - detached
  * - removed
  * - faulted
  * without losing data.  Returns B_TRUE when the vdev must be kept.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* The root vdev and top-level vdevs are always reported required. */
 	if (vd == spa->spa_root_vdev || vd == top)
 		return (B_TRUE);
 
 	boolean_t faulting = (vd->vdev_state == VDEV_STATE_FAULTED);
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 
 	/*
 	 * Pretend the device is unreadable and see whether the top-level
 	 * vdev ends up with any DTL outage.  If it does not, the device
 	 * can safely be offlined/detached/removed.
 	 */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess_impl(top, 0, 0, B_FALSE, B_FALSE, faulting);
 	boolean_t required = !vdev_dtl_empty(top, DTL_OUTAGE);
 
 	/* Restore the real readability flag and recompute the DTLs. */
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess_impl(top, 0, 0, B_FALSE, B_FALSE, faulting);
 
 	if (required || !zio_injection_enabled)
 		return (required);
 
 	/* Not required so far; let an injected device fault decide. */
 	return (!!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)));
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  *
  * A vdev without children needs a resilver when its DTL_MISSING tree is
  * non-empty and it is writeable.  A vdev with children needs one when any
  * child does; its range then spans from the smallest child minimum to the
  * largest child maximum.
  *
  * minp and maxp are optional.  Each may independently be NULL, and each is
  * written only when the function returns B_TRUE.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			/* cmin/cmax are only valid when the child says yes. */
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	/*
 	 * Check each output pointer on its own so that a caller passing
 	 * only one of them does not cause a NULL dereference.
 	 */
 	if (needed) {
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Look up the checkpoint space map object in the vdev's top-level ZAP.
  * Returns 0 with *sm_obj set to the object number, or to zero when the vdev
  * has no top-level ZAP or the ZAP has no such entry.  Any other lookup error
  * is returned to the caller.
  */
 int
 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 {
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		int err = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, sm_obj);
 
 		/* Found it (err == 0), or failed for a real reason. */
 		if (err != ENOENT)
 			return (err);
 	}
 
 	/* No ZAP or no entry: report "no checkpoint space map". */
 	*sm_obj = 0;
 	return (0);
 }
 
 /*
  * Load the state of a vdev and all of its descendants.
  *
  * Children are loaded first.  Then, depending on the kind of vdev, this
  * loads RAIDZ state, the allocation bias, the failfast property, rebuild
  * state, the checksum/io/slow-io threshold properties, the metaslabs and
  * checkpoint space map (concrete top-level vdevs), the DTL (leaves) and the
  * obsolete space map.
  *
  * Returns 0 on success or the first error encountered.
  */
 int
 vdev_load(vdev_t *vd)
 {
 	int children = vd->vdev_children;
 	int error = 0;
 	taskq_t *tq = NULL;
 
 	/*
 	 * It's only worthwhile to use the taskq for the root vdev, because the
 	 * slow part is metaslab_init, and that only happens for top-level
 	 * vdevs.
 	 */
 	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
 		tq = taskq_create("vdev_load", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	/*
 	 * Recursively load all children.  Without a taskq, or for children
 	 * that use zvols, the load happens inline; otherwise it is
 	 * dispatched to the taskq.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			cvd->vdev_load_error = vdev_load(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_load_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
 	/* Wait for every dispatched child load before reading results. */
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 
 	/*
 	 * Fail with the first child error, in child order.  A distinct
 	 * name is used so the function-scope 'error' is not shadowed.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		int child_error = vd->vdev_child[c]->vdev_load_error;
 
 		if (child_error != 0)
 			return (child_error);
 	}
 
 	vdev_set_deflate_ratio(vd);
 
 	if (vd->vdev_ops == &vdev_raidz_ops) {
 		error = vdev_raidz_load(vd);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * On spa_load path, grab the allocation bias from our zap
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		char bias_str[64];
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
 		    bias_str);
 		if (error == 0) {
 			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
 			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
 		} else if (error != ENOENT) {
 			/* A missing entry is fine; anything else is fatal. */
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
 			    "failed [error=%d]",
 			    (u_longlong_t)vd->vdev_top_zap, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Load the failfast property, falling back to the property default
 	 * when it is not stored.  Other lookup errors are logged only.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		uint64_t failfast;
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
 		    1, &failfast);
 		if (error == 0) {
 			vd->vdev_failfast = failfast & 1;
 		} else if (error == ENOENT) {
 			vd->vdev_failfast = vdev_prop_default_numeric(
 			    VDEV_PROP_FAILFAST);
 		} else {
 			vdev_dbgmsg(vd,
 			    "vdev_load: zap_lookup(top_zap=%llu) "
 			    "failed [error=%d]",
 			    (u_longlong_t)vd->vdev_top_zap, error);
 		}
 	}
 
 	/*
 	 * Load any rebuild state from the top-level vdev zap.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		error = vdev_rebuild_load(vd);
 		if (error && error != ENOTSUP) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
 			    "failed [error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Load the error-threshold properties.  Failures other than ENOENT
 	 * are logged but do not fail the load; zapobj is only used for
 	 * the log messages.
 	 */
 	if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
 		uint64_t zapobj;
 
 		if (vd->vdev_top_zap != 0)
 			zapobj = vd->vdev_top_zap;
 		else
 			zapobj = vd->vdev_leaf_zap;
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
 		    &vd->vdev_checksum_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
 		    &vd->vdev_checksum_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
 		    &vd->vdev_io_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
 		    &vd->vdev_io_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
 		    &vd->vdev_slow_io_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
 		    &vd->vdev_slow_io_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 	}
 
 	/*
 	 * If this is a top-level vdev, initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
 		vdev_metaslab_group_create(vd);
 
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
 			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
 			    (u_longlong_t)vd->vdev_asize);
 			return (SET_ERROR(ENXIO));
 		}
 
 		error = vdev_metaslab_init(vd, 0);
 		if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
 			    "[error=%d]", error);
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			return (error);
 		}
 
 		/* Open the checkpoint space map, if the vdev has one. */
 		uint64_t checkpoint_sm_obj;
 		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
 		if (error == 0 && checkpoint_sm_obj != 0) {
 			objset_t *mos = spa_meta_objset(vd->vdev_spa);
 			ASSERT(vd->vdev_asize != 0);
 			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
 
 			error = space_map_open(&vd->vdev_checkpoint_sm,
 			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
 			    vd->vdev_ashift);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "vdev_load: space_map_open "
 				    "failed for checkpoint spacemap (obj %llu) "
 				    "[error=%d]",
 				    (u_longlong_t)checkpoint_sm_obj, error);
 				return (error);
 			}
 			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 			/*
 			 * Since the checkpoint_sm contains free entries
 			 * exclusively we can use space_map_allocated() to
 			 * indicate the cumulative checkpointed space that
 			 * has been freed.
 			 */
 			vd->vdev_stat.vs_checkpoint_space =
 			    -space_map_allocated(vd->vdev_checkpoint_sm);
 			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
 			    vd->vdev_stat.vs_checkpoint_space;
 		} else if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
 			    "checkpoint space map object from vdev ZAP "
 			    "[error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this is a leaf vdev, load its DTL.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
 		    "[error=%d]", error);
 		return (error);
 	}
 
 	/* Open the obsolete space map, if the vdev has one. */
 	uint64_t obsolete_sm_object;
 	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
 	if (error == 0 && obsolete_sm_object != 0) {
 		objset_t *mos = vd->vdev_spa->spa_meta_objset;
 		ASSERT(vd->vdev_asize != 0);
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 
 		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
 		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
 			    "obsolete spacemap (obj %llu) [error=%d]",
 			    (u_longlong_t)obsolete_sm_object, error);
 			return (error);
 		}
 	} else if (error != 0) {
 		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
 		    "space map object from vdev ZAP [error=%d]", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * The special vdev case is used for hot spares and l2cache devices.  Its
  * sole purpose is to set the vdev state for the associated vdev.  To do this,
  * we make sure that we can open the underlying device, then try to read the
  * label, and make sure that the label is sane and that it hasn't been
  * repurposed to another pool.
  *
  * Returns 0 if the vdev is not readable or its label checks out, and -1
  * (after marking the vdev CANT_OPEN/CORRUPT_DATA) otherwise.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	uint64_t version, guid, state;
 	int rc = 0;
 
 	if (!vdev_readable(vd))
 		return (0);
 
 	nvlist_t *label = vdev_label_read_config(vd, -1ULL);
 	if (label == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	/*
 	 * The label is acceptable only if it carries a supported version,
 	 * our own guid, and a pool state.  The pool state value itself is
 	 * not checked here: if the device is in fact in use by another
 	 * pool, that is updated on the fly when requested.
 	 */
 	if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
 	    &version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(version) &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0 &&
 	    guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) == 0)) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		rc = -1;
 	}
 
 	nvlist_free(label);
 	return (rc);
 }
 
 /*
  * Free the object recorded under VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS in the
  * vdev's top-level ZAP and remove that ZAP entry.  Does nothing if the vdev
  * has no top-level ZAP or the entry is absent.
  */
 static void
 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	objset_t *mos = spa_meta_objset(vd->vdev_spa);
 	uint64_t txgs_obj = 0;
 	int err = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &txgs_obj);
 
 	/* ENOENT means there is nothing to destroy. */
 	if (err != ENOENT) {
 		VERIFY0(err);
 		VERIFY0(dmu_object_free(mos, txgs_obj, tx));
 		VERIFY0(zap_remove(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
 	}
 }
 
 /*
  * Free every space map object referenced by this vdev's metaslab array,
  * then the array object itself and the metaslab flush data.  Does nothing
  * when the vdev has no metaslab array.
  */
 void
 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	if (vd->vdev_ms_array == 0)
 		return;
 
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 	uint64_t nslots = vd->vdev_asize >> vd->vdev_ms_shift;
 	size_t nbytes = nslots * sizeof (uint64_t);
 
 	/* Read the whole array of space map object numbers in one go. */
 	uint64_t *slots = kmem_alloc(nbytes, KM_SLEEP);
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, nbytes, slots, 0));
 
 	/* A zero slot has no space map object to free. */
 	for (uint64_t idx = 0; idx < nslots; idx++) {
 		if (slots[idx] != 0)
 			space_map_free_obj(mos, slots[idx], tx);
 	}
 
 	kmem_free(slots, nbytes);
 	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
 	vdev_destroy_ms_flush_data(vd, tx);
 	vd->vdev_ms_array = 0;
 }
 
 /*
  * Destroy the space maps and the top-level ZAP of a top-level log vdev,
  * in the currently syncing txg.
  */
 static void
 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	dmu_tx_t *tx;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 	vdev_destroy_spacemaps(vd, tx);
 
 	/* Unlink and destroy the top-level ZAP, if there is one. */
 	if (vd->vdev_top_zap != 0) {
 		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
 		vd->vdev_top_zap = 0;
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Finish the sync of every metaslab queued on this vdev's clean list for
  * txg, and reassess the metaslab group(s) if any metaslab was queued.
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	/* Sample this before the list is drained below. */
 	boolean_t had_work =
 	    !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	ASSERT(vdev_is_concrete(vd));
 
 	for (;;) {
 		metaslab_t *msp =
 		    txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 		if (msp == NULL)
 			break;
 		metaslab_sync_done(msp, txg);
 	}
 
 	if (!had_work)
 		return;
 
 	metaslab_sync_reassess(vd->vdev_mg);
 	if (vd->vdev_log_mg != NULL)
 		metaslab_sync_reassess(vd->vdev_log_mg);
 }
 
 /*
  * Sync a vdev's dirty state for the syncing txg: obsolete segments, the
  * metaslab array object, dirty metaslabs and dirty leaf DTLs.  The vdev is
  * then queued on the spa's vdev list for TXG_CLEAN(txg).
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(txg, ==, spa->spa_syncing_txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 
 		vdev_indirect_sync_obsolete(vd, tx);
 
 		/*
 		 * An indirect vdev can't have dirty metaslabs or DTLs,
 		 * so there is nothing more to sync for it.
 		 */
 		if (vd->vdev_ops == &vdev_indirect_ops) {
 			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
 			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
 			dmu_tx_commit(tx);
 			return;
 		}
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/* Allocate the metaslab array object on first use. */
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
 	    !vd->vdev_removing) {
 		ASSERT(vd == vd->vdev_top);
 		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 	}
 
 	/* Sync each dirty metaslab and requeue it on the clean list. */
 	for (metaslab_t *msp = txg_list_remove(&vd->vdev_ms_list, txg);
 	    msp != NULL; msp = txg_list_remove(&vd->vdev_ms_list, txg)) {
 		metaslab_sync(msp, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
 	}
 
 	/* Sync the DTL of each dirty leaf. */
 	for (vdev_t *lvd = txg_list_remove(&vd->vdev_dtl_list, txg);
 	    lvd != NULL; lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) {
 		vdev_dtl_sync(lvd, txg);
 	}
 
 	/*
 	 * If this is an empty log device being removed, destroy the
 	 * metadata associated with it.
 	 */
 	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove_empty_log(vd, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 	dmu_tx_commit(tx);
 }
 
 /*
  * Return the amount of space that should be (or was) allocated for the given
  * psize (compressed block size) in the given TXG, as computed by the vdev's
  * vdev_op_asize operation.  Note that for expanded RAIDZ vdevs, the size
  * allocated for older BP's may be larger.  See vdev_raidz_asize().
  */
 uint64_t
 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	uint64_t asize = vd->vdev_ops->vdev_op_asize(vd, psize, txg);
 
 	return (asize);
 }
 
 /*
  * Same as vdev_psize_to_asize_txg(), with a txg of zero.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize = vdev_psize_to_asize_txg(vd, psize, 0);
 
 	return (asize);
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
  * not be opened, and no I/O is attempted.
  *
  * The vdev is looked up by guid and must be a leaf.  Returns the result of
  * spa_vdev_state_exit(): ENODEV is passed in when no vdev has that guid,
  * ENOTSUP when the vdev is not a leaf, and 0 otherwise.  If faulting the
  * device would make data unavailable, it is marked degraded instead.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd, *tvd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 
 	/*
 	 * If user did a 'zpool offline -f' then make the fault persist across
 	 * reboots.
 	 */
 	if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
 		/*
 		 * There are two kinds of forced faults: temporary and
 		 * persistent.  Temporary faults go away at pool import, while
 		 * persistent faults stay set.  Both types of faults can be
 		 * cleared with a zpool clear.
 		 *
 		 * We tell if a vdev is persistently faulted by looking at the
 		 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
 		 * import then it's a persistent fault.  Otherwise, it's
 		 * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
 		 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
 		 * tells vdev_config_generate() (which gets run later) to set
 		 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
 		 */
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_tmpoffline = B_FALSE;
 		/* From here on the fault is handled as a plain EXTERNAL one. */
 		aux = VDEV_AUX_EXTERNAL;
 	} else {
 		vd->vdev_tmpoffline = B_TRUE;
 	}
 
 	/*
 	 * We don't directly use the aux state here, but if we do a
 	 * vdev_reopen(), we need this value to be present to remember why we
 	 * were faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/*
 	 * Faulted state takes precedence over degraded.
 	 */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * If this device has the only valid copy of the data, then
 	 * back off and simply mark the vdev as degraded instead.
 	 * Log top-levels and aux devices are exempt from this check.
 	 */
 	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
 		/*
 		 * If we reopen the device and it's not dead, only then do we
 		 * mark it degraded.
 		 */
 		vdev_reopen(tvd);
 
 		if (vdev_readable(vd))
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Mark the given vdev degraded.  A degraded vdev is purely an indication to
  * the user that something is wrong.  The vdev continues to operate as normal
  * as far as I/O is concerned.
  *
  * The vdev is looked up by guid and must be a leaf.  Nothing is changed if
  * it is already faulted or degraded.
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Already faulted or degraded: nothing to do. */
 	if (vd->vdev_faulted || vd->vdev_degraded)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	vd->vdev_degraded = 1ULL;
 
 	/* Only a vdev that is not dead has its state changed to DEGRADED. */
 	if (!vdev_is_dead(vd)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Flag the vdev with the given guid as wanting removal and request the
  * asynchronous SPA_ASYNC_REMOVE task.  Nothing is done if the vdev is
  * already removed or is expanding, and EEXIST is returned if a probe of a
  * leaf vdev succeeds.
  */
 int
 vdev_remove_wanted(spa_t *spa, uint64_t guid)
 {
 	vdev_t *vd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/*
 	 * Ignore the request if the vdev is already removed, or if it is
 	 * expanding, which can trigger repartition add/remove events.
 	 */
 	if (vd->vdev_removed || vd->vdev_expanding)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	/*
 	 * Confirm the vdev has been removed: if a probe of the leaf
 	 * succeeds, the device is still there, so don't do anything.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (zio_wait(vdev_probe(vd, NULL)) == 0) {
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EEXIST)));
 		}
 	}
 
 	vd->vdev_remove_wanted = B_TRUE;
 	spa_async_request(spa, SPA_ASYNC_REMOVE);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  *
  * 'flags' is a mask of ZFS_ONLINE_* bits.  If 'newstate' is not NULL it
  * receives the vdev's state after its top-level vdev has been reopened.
  * Returns the result of spa_vdev_state_exit(): ENODEV is passed in when no
  * vdev has that guid, ENOTSUP when expansion applies to an aux device, and
  * 0 otherwise.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 	boolean_t wasoffline;
 	vdev_state_t oldstate;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/* Capture the pre-online state; it decides event posting below. */
 	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
 	oldstate = vd->vdev_state;
 
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		/* Flag vd and each ancestor below the root for the reopen. */
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
 			    spa->spa_autoexpand);
 		vd->vdev_expansion_time = gethrestime_sec();
 	}
 
 	vdev_reopen(tvd);
 	/* These two flags only apply to the reopen above. */
 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
 
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate)
 		*newstate = vd->vdev_state;
 	/* Only the first child of a spare vdev is marked for unsparing. */
 	if ((flags & ZFS_ONLINE_UNSPARE) &&
 	    !vdev_is_dead(vd) && vd->vdev_parent &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
 
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux) {
 			/*
 			 * Go through SET_ERROR() like every other error
 			 * generated in this function.
 			 */
 			return (spa_vdev_state_exit(spa, vd,
 			    SET_ERROR(ENOTSUP)));
 		}
 		spa->spa_ccw_fail_time = 0;
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	/* Restart initializing if necessary */
 	mutex_enter(&vd->vdev_initialize_lock);
 	if (vdev_writeable(vd) &&
 	    vd->vdev_initialize_thread == NULL &&
 	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
 		(void) vdev_initialize(vd);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	/*
 	 * Restart trimming if necessary. We do not restart trimming for cache
 	 * devices here. This is triggered by l2arc_rebuild_vdev()
 	 * asynchronously for the whole device or in l2arc_evict() as it evicts
 	 * space for upcoming writes.
 	 */
 	mutex_enter(&vd->vdev_trim_lock);
 	if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
 	    vd->vdev_trim_thread == NULL &&
 	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
 		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
 		    vd->vdev_trim_secure);
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	/*
 	 * Post the online event if the vdev had been offlined, or if its
 	 * state rose from below DEGRADED to DEGRADED or better.
 	 */
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED)) {
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
 		/*
 		 * Asynchronously detach spare vdev if resilver or
 		 * rebuild is not required
 		 */
 		if (vd->vdev_unspare &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
 		    !vdev_rebuild_active(tvd))
 			spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
 	}
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the leaf vdev with the given guid.  vdev_offline() calls this
  * with spa_vdev_top_lock held.
  *
  * 'flags' is a mask of ZFS_OFFLINE_* bits; ZFS_OFFLINE_TEMPORARY sets the
  * vdev's tmpoffline flag.  Returns the result of spa_vdev_state_exit():
  * ENODEV is passed in when no vdev has that guid, ENOTSUP for non-leaf and
  * dRAID spare vdevs, EBUSY when the vdev is required or offlining it would
  * make its top-level vdev unusable, ZFS_ERR_CHECKPOINT_EXISTS when a log
  * device has checkpointed data, or the error from spa_reset_logs().
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Routed through SET_ERROR() like the other errors generated here. */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/* Compared against spa_config_generation after the log reset. */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(mg);
 			/* The state lock is dropped around the log reset. */
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = spa_reset_logs(spa);
 
 			/*
 			 * If the log device was successfully reset but has
 			 * checkpointed data, do not offline it.
 			 */
 			if (error == 0 &&
 			    tvd->vdev_checkpoint_sm != NULL) {
 				ASSERT3U(space_map_allocated(
 				    tvd->vdev_checkpoint_sm), !=, 0);
 				error = SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS);
 			}
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.  If it
 			 * has, and there was no error, start over.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the vdev with the given guid.  Runs vdev_offline_locked() while
  * holding spa_vdev_top_lock and returns its result.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	int rc;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  *
  * Besides zeroing the counters, this clears any faulted/degraded state and
  * reopens the device, and marks a healthy first child of a spare vdev for
  * unsparing.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* NULL means "everything": start from the root vdev. */
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 	vd->vdev_stat.vs_dio_verify_errors = 0;
 	vd->vdev_stat.vs_slow_ios = 0;
 
 	/* Recurse before handling this vdev's own state. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * It makes no sense to "clear" an indirect or removed vdev.
 	 */
 	if (!vdev_is_concrete(vd) || vd->vdev_removed)
 		return;
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 		/*
 		 * When reopening in response to a clear event, it may be due to
 		 * a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 		vd->vdev_stat.vs_aux = 0;
 
 		/* The root is reopened as itself, others via their top-level. */
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* If a resilver isn't required, check if vdevs can be culled */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	/* Clear recent error events cache (i.e. duplicate events tracking) */
 	zfs_ereport_clear(spa, vd);
 }
 
 /*
  * Return B_TRUE if the vdev is "dead": its state is below DEGRADED, or it
  * is a hole or missing device.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	/*
 	 * Holes and missing devices are always treated as dead.  That keeps
 	 * callers simple: they need no special cases for these device
 	 * types, because dead devices are skipped before I/O is issued
 	 * to them.
 	 */
 	boolean_t below_degraded = (vd->vdev_state < VDEV_STATE_DEGRADED);
 	boolean_t placeholder = (vd->vdev_ops == &vdev_hole_ops ||
 	    vd->vdev_ops == &vdev_missing_ops);
 
 	return (below_degraded || placeholder);
 }
 
 /*
  * A vdev is readable when it is not dead and not flagged cant_read.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	boolean_t alive = !vdev_is_dead(vd);
 
 	return (alive && !vd->vdev_cant_read);
 }
 
 /*
  * A vdev is writeable when it is not dead, not flagged cant_write, and
  * concrete.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	boolean_t usable = !vdev_is_dead(vd) && !vd->vdev_cant_write;
 
 	return (usable && vdev_is_concrete(vd));
 }
 
 /*
  * Return whether allocations may be made from this vdev: its state must be
  * DEGRADED or better (or CLOSED), it must be writable and concrete, and its
  * metaslab group must be initialized.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	/*
 	 * The state is sampled once into a local: although it changes
 	 * atomically, two separate questions are asked about it below.
 	 */
 	uint64_t state = vd->vdev_state;
 
 	/*
 	 * Allocations are currently allowed from vdevs that may be in the
 	 * middle of reopening (i.e. VDEV_STATE_CLOSED).  If the reopen
 	 * fails, that is caught later while holding the proper locks.
 	 */
 	boolean_t state_ok = (state >= VDEV_STATE_DEGRADED ||
 	    state == VDEV_STATE_CLOSED);
 
 	return (state_ok && !vd->vdev_cant_write && vdev_is_concrete(vd) &&
 	    vd->vdev_mg->mg_initialized);
 }
 
 /*
  * Return whether the given zio may be issued to the vdev.  A dead vdev, or
  * one whose removal is wanted, is never accessible.  Otherwise reads and
  * writes are gated by cant_read and cant_write respectively, and any other
  * zio type is allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 	if (vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		return (!vd->vdev_cant_read);
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		return (!vd->vdev_cant_write);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Add a child's per-type op and byte counters (cvs) into the parent's
  * totals (vs), and record the child's removing flag in cvs.
  */
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
 	/*
 	 * dRAID spares are left out entirely: their I/Os are already
 	 * counted by the physical leaves, so adding them here would count
 	 * the ops and bytes twice.
 	 */
 	if (cvd->vdev_ops != &vdev_draid_spare_ops) {
 		for (int type = 0; type < VS_ZIO_TYPES; type++) {
 			vs->vs_ops[type] += cvs->vs_ops[type];
 			vs->vs_bytes[type] += cvs->vs_bytes[type];
 		}
 
 		cvs->vs_scan_removing = cvd->vdev_removing;
 	}
 }
 
 /*
  * Get extended stats: add every histogram bucket and queue counter of the
  * child's extended stats (cvsx) into the parent's (vsx).  'cvd' is unused.
  */
 static void
 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
 {
 	(void) cvd;
 
 	/* Histograms indexed by zio type. */
 	for (int type = 0; type < ZIO_TYPES; type++) {
 		for (int i = 0; i < ARRAY_SIZE(vsx->vsx_disk_histo[0]); i++) {
 			vsx->vsx_disk_histo[type][i] +=
 			    cvsx->vsx_disk_histo[type][i];
 		}
 
 		for (int i = 0; i < ARRAY_SIZE(vsx->vsx_total_histo[0]); i++) {
 			vsx->vsx_total_histo[type][i] +=
 			    cvsx->vsx_total_histo[type][i];
 		}
 	}
 
 	/* Histograms and counters indexed by queueable priority. */
 	for (int pri = 0; pri < ZIO_PRIORITY_NUM_QUEUEABLE; pri++) {
 		for (int i = 0; i < ARRAY_SIZE(vsx->vsx_queue_histo[0]); i++) {
 			vsx->vsx_queue_histo[pri][i] +=
 			    cvsx->vsx_queue_histo[pri][i];
 		}
 
 		vsx->vsx_active_queue[pri] += cvsx->vsx_active_queue[pri];
 		vsx->vsx_pend_queue[pri] += cvsx->vsx_pend_queue[pri];
 
 		for (int i = 0; i < ARRAY_SIZE(vsx->vsx_ind_histo[0]); i++) {
 			vsx->vsx_ind_histo[pri][i] +=
 			    cvsx->vsx_ind_histo[pri][i];
 		}
 
 		for (int i = 0; i < ARRAY_SIZE(vsx->vsx_agg_histo[0]); i++) {
 			vsx->vsx_agg_histo[pri][i] +=
 			    cvsx->vsx_agg_histo[pri][i];
 		}
 	}
 }
 
 /*
  * Return B_TRUE if every offset on vd can be described by a space map
  * entry.
  */
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
 	/*
 	 * Without double-word space map entries, 47 bits of each entry
 	 * hold the offset (see SM_OFFSET_BITS in space_map.h), counted in
 	 * units of 1 << ashift.  The sum below is therefore the number of
 	 * address bits a single-word entry can cover on this device.
 	 */
 	const uint64_t addr_bits = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	/*
 	 * SPACEMAP_V2 lifts the limit; a shift of 63 or more is treated as
 	 * addressable to avoid overflowing the shift below.
 	 */
 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2) ||
 	    addr_bits >= 63)
 		return (B_TRUE);
 
 	uint64_t max_addr = 1ULL << addr_bits;
 
 	return (vd->vdev_asize < max_addr);
 }
 
 /*
  * Fill in the I/O statistics for vd.
  *
  * vs  - basic stats to fill in; may be NULL.
  * vsx - extended stats to fill in; may be NULL.
  *
  * For a non-leaf vdev, the ops/bytes counters in vs and all of vsx are
  * zeroed and then rebuilt by recursing into every child (refreshing each
  * child's own vdev_stat / vdev_stat_ex in place) and summing the results.
  * For a leaf, vs is left untouched and vsx receives a copy of the leaf's
  * extended stats with the active/pending queue depths filled in.
  */
 static void
 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/*
 		 * We're a leaf.  Just copy our ZIO active queue stats in.  The
 		 * other leaf stats are updated in vdev_stat_update().
 		 */
 		if (vsx == NULL)
 			return;
 
 		/*
 		 * The recursive call below hands us the leaf's own
 		 * vdev_stat_ex as vsx.  memcpy() on overlapping objects is
 		 * undefined, so skip the copy when source and destination
 		 * are the same object.
 		 */
 		if (vsx != &vd->vdev_stat_ex) {
 			memcpy(vsx, &vd->vdev_stat_ex,
 			    sizeof (vd->vdev_stat_ex));
 		}
 
 		for (int t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
 			vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
 			vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 		}
 		return;
 	}
 
 	/*
 	 * Interior vdev: aggregate the I/O counts over all children.
 	 */
 	if (vs != NULL) {
 		memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
 		memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
 	}
 	if (vsx != NULL)
 		memset(vsx, 0, sizeof (*vsx));
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 		vdev_stat_t *cvs = &cvd->vdev_stat;
 		vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
 
 		vdev_get_stats_ex_impl(cvd, cvs, cvsx);
 		if (vs != NULL)
 			vdev_get_child_stat(cvd, vs, cvs);
 		if (vsx != NULL)
 			vdev_get_child_stat_ex(cvd, vsx, cvsx);
 	}
 }
 
 /*
  * Copy out the statistics for vd while holding vd->vdev_stat_lock.
  *
  * vs  - receives a copy of vd->vdev_stat with the query-time fields
  *       (timestamp delta, state, sizes, ashifts, progress counters)
  *       filled in; may be NULL.
  * vsx - receives the extended stats; may be NULL.
  */
 void
 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	vdev_t *tvd = vd->vdev_top;
 
 	mutex_enter(&vd->vdev_stat_lock);
 
 	if (vs != NULL) {
 		memcpy(vs, &vd->vdev_stat, sizeof (*vs));
 
 		/* Report the stored timestamp as elapsed time. */
 		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 		vs->vs_state = vd->vdev_state;
 		vs->vs_rsize = vdev_get_min_asize(vd);
 
 		if (vd->vdev_ops->vdev_op_leaf) {
 			vs->vs_pspace = vd->vdev_psize;
 			vs->vs_rsize += VDEV_LABEL_START_SIZE +
 			    VDEV_LABEL_END_SIZE;
 
 			/*
 			 * Initializing progress.  The initializing locks
 			 * are not held here, so these values are only an
 			 * estimate (although a fairly accurate one).
 			 */
 			vs->vs_initialize_state = vd->vdev_initialize_state;
 			vs->vs_initialize_bytes_done =
 			    vd->vdev_initialize_bytes_done;
 			vs->vs_initialize_bytes_est =
 			    vd->vdev_initialize_bytes_est;
 			vs->vs_initialize_action_time =
 			    vd->vdev_initialize_action_time;
 
 			/*
 			 * Manual TRIM progress.  The manual TRIM locks are
 			 * not held either, so this too is an estimate.
 			 */
 			vs->vs_trim_state = vd->vdev_trim_state;
 			vs->vs_trim_notsup = !vd->vdev_has_trim;
 			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
 			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
 			vs->vs_trim_action_time = vd->vdev_trim_action_time;
 
 			/* Set when there is a deferred resilver. */
 			vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
 		}
 
 		/*
 		 * Expandable space is only reported for non-auxiliary
 		 * devices that have a top-level vdev.  It is rounded down
 		 * to metaslab-sized units, since that is the granularity
 		 * at which the pool can actually grow.
 		 */
 		if (vd->vdev_aux == NULL && tvd != NULL) {
 			vs->vs_esize = P2ALIGN_TYPED(
 			    vd->vdev_max_asize - vd->vdev_asize,
 			    1ULL << tvd->vdev_ms_shift, uint64_t);
 		}
 
 		/* Prefer the top-level vdev's ashift when there is one. */
 		if (vd->vdev_top != NULL)
 			vs->vs_configured_ashift = vd->vdev_top->vdev_ashift;
 		else
 			vs->vs_configured_ashift = vd->vdev_ashift;
 
 		vs->vs_logical_ashift = vd->vdev_logical_ashift;
 
 		/* An out-of-range physical ashift is reported as 0. */
 		vs->vs_physical_ashift =
 		    (vd->vdev_physical_ashift <= ASHIFT_MAX) ?
 		    vd->vdev_physical_ashift : 0;
 
 		/*
 		 * Fragmentation is reported for top-level, non-auxiliary,
 		 * concrete devices only.  The rating ignores the embedded
 		 * slog metaslab (vdev_log_mg); being a single metaslab it
 		 * would barely move the overall figure.
 		 */
 		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
 		    vdev_is_concrete(vd)) {
 			if (vd->vdev_mg != NULL) {
 				vs->vs_fragmentation =
 				    vd->vdev_mg->mg_fragmentation;
 			} else {
 				vs->vs_fragmentation = 0;
 			}
 		}
 
 		/* noalloc is set if either this vdev or its top-level is. */
 		vs->vs_noalloc = MAX(vd->vdev_noalloc,
 		    tvd ? tvd->vdev_noalloc : 0);
 	}
 
 	vdev_get_stats_ex_impl(vd, vs, vsx);
 
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Convenience wrapper around vdev_get_stats_ex() for callers that only
  * want the basic stats in vs and no extended stats.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	/*
 	 * Both functions return void, so this is a plain call: ISO C does
 	 * not allow "return <expression>;" in a void function.
 	 */
 	vdev_get_stats_ex(vd, vs, NULL);
 }
 
 /*
  * Zero the space accounting (vs_alloc, vs_space, vs_dspace) of vd while
  * holding its stat lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	vdev_stat_t *vs = &vd->vdev_stat;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vs->vs_alloc = 0;
 	vs->vs_space = 0;
 	vs->vs_dspace = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Reset vs_scan_processed on vd and on every vdev below it.  Children are
  * handled first; each vdev is updated under its own stat lock.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		vdev_scan_stat_init(cvd);
 	}
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Account a completed zio against the statistics of its vdev.
  *
  * zio   - the completed I/O.  Its io_vd selects the vdev; when io_vd is
  *         NULL the pool's root vdev is used instead.
  * psize - byte count added to the bytes / scan / rebuild / self-heal
  *         counters.
  *
  * For successful I/O (io_error == 0) the counters and histograms are
  * updated under the vdev's stat lock.  For a failed write that carries a
  * txg, no counters are touched; the txg is recorded in the vdev's DTLs
  * instead.
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 /* Suppress ASAN false positive */
 #ifdef __SANITIZE_ADDRESS__
 	vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
 	vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
 #else
 	vdev_stat_t *vs = &vd->vdev_stat;
 	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
 #endif
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		/* Bypassed i/o is not counted. */
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			/*
 			 * Repair is the result of a resilver issued by the
 			 * scan thread (spa_sync).
 			 */
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 				dsl_scan_phys_t *scn_phys = &scn->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/*
 				 * The pool-wide total is only bumped for
 				 * leaves; the per-vdev counter is bumped for
 				 * every vdev this is called on.
 				 */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			/*
 			 * Repair is the result of a rebuild issued by the
 			 * rebuild thread (vdev_rebuild_thread).  To avoid
 			 * double counting repaired bytes the virtual dRAID
 			 * spare vdev is excluded from the processed bytes.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				vdev_t *tvd = vd->vdev_top;
 				vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
 				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
 
 				if (vd->vdev_ops->vdev_op_leaf &&
 				    vd->vdev_ops != &vdev_draid_spare_ops) {
 					atomic_add_64(rebuilt, psize);
 				}
 				vs->vs_rebuild_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		/*
 		 * The bytes/ops/histograms are recorded at the leaf level and
 		 * aggregated into the higher level vdevs in vdev_get_stats().
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
 			zio_type_t vs_type = type;
 			zio_priority_t priority = zio->io_priority;
 
 			/*
 			 * TRIM ops and bytes are reported to user space as
 			 * ZIO_TYPE_FLUSH.  This is done to preserve the
 			 * vdev_stat_t structure layout for user space.
 			 */
 			if (type == ZIO_TYPE_TRIM)
 				vs_type = ZIO_TYPE_FLUSH;
 
 			/*
 			 * Solely for the purposes of 'zpool iostat -lqrw'
 			 * reporting use the priority to categorize the IO.
 			 * Only the following are reported to user space:
 			 *
 			 *   ZIO_PRIORITY_SYNC_READ,
 			 *   ZIO_PRIORITY_SYNC_WRITE,
 			 *   ZIO_PRIORITY_ASYNC_READ,
 			 *   ZIO_PRIORITY_ASYNC_WRITE,
 			 *   ZIO_PRIORITY_SCRUB,
 			 *   ZIO_PRIORITY_TRIM,
 			 *   ZIO_PRIORITY_REBUILD.
 			 *
 			 * INITIALIZING and REMOVAL are folded into the async
 			 * read/write buckets below.
 			 */
 			if (priority == ZIO_PRIORITY_INITIALIZING) {
 				ASSERT3U(type, ==, ZIO_TYPE_WRITE);
 				priority = ZIO_PRIORITY_ASYNC_WRITE;
 			} else if (priority == ZIO_PRIORITY_REMOVAL) {
 				priority = ((type == ZIO_TYPE_WRITE) ?
 				    ZIO_PRIORITY_ASYNC_WRITE :
 				    ZIO_PRIORITY_ASYNC_READ);
 			}
 
 			vs->vs_ops[vs_type]++;
 			vs->vs_bytes[vs_type] += psize;
 
 			/*
 			 * Request-size histogram: delegated i/o goes to the
 			 * "agg" table, everything else to the "ind" table.
 			 */
 			if (flags & ZIO_FLAG_DELEGATED) {
 				vsx->vsx_agg_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			} else {
 				vsx->vsx_ind_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			}
 
 			/*
 			 * Latency histograms are only updated when both
 			 * io_delta and io_delay are non-zero.  Note that the
 			 * disk/total tables are indexed by the original zio
 			 * type, not by the remapped vs_type.
 			 */
 			if (zio->io_delta && zio->io_delay) {
 				vsx->vsx_queue_histo[priority]
 				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
 				vsx->vsx_disk_histo[type]
 				    [L_HISTO(zio->io_delay)]++;
 				vsx->vsx_total_histo[type]
 				    [L_HISTO(zio->io_delta)]++;
 			}
 		}
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* From here on the i/o failed.  Speculative i/o is ignored. */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent log writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Already recorded as missing: nothing more to do. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark the txg partial on the leaf and its ancestors. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Convert a space value (which must be a multiple of SPA_MINBLOCKSIZE)
  * into deflated space: the number of minimum-size blocks it contains
  * multiplied by the vdev's deflate ratio.
  */
 int64_t
 vdev_deflated_space(vdev_t *vd, int64_t space)
 {
 	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 
 	int64_t nblocks = space >> SPA_MINBLOCKSHIFT;
 
 	return (nblocks * vd->vdev_deflate_ratio);
 }
 
 /*
  * Apply space accounting deltas to a top-level vdev and, unless it is a
  * log device or has no metaslab group, to the root vdev as well.
  *
  * alloc_delta - change in allocated bytes.
  * defer_delta - unused.
  * space_delta - change in total space; also used to derive the deflated
  *               space delta.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	(void) defer_delta;
 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	int64_t dspace_delta;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Undo the psize-to-asize (i.e. RAID-Z) space-expansion factor.
 	 * This has to be computed per top-level vdev: the root's factor is
 	 * just the max over its children and is too coarse to use here.
 	 */
 	dspace_delta = vdev_deflated_space(vd, space_delta);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	/* a negative delta must not take vs_alloc below zero */
 	if (alloc_delta < 0) {
 		ASSERT3U(vs->vs_alloc, >=, -alloc_delta);
 	}
 
 	vs->vs_space += space_delta;
 	vs->vs_dspace += dspace_delta;
 	vs->vs_alloc += alloc_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	/* every class but log contributes to root space stats */
 	if (vd->vdev_mg == NULL || vd->vdev_islog)
 		return;
 
 	ASSERT(!vd->vdev_isl2cache);
 	mutex_enter(&rvd->vdev_stat_lock);
 	rvd->vdev_stat.vs_space += space_delta;
 	rvd->vdev_stat.vs_dspace += dspace_delta;
 	rvd->vdev_stat.vs_alloc += alloc_delta;
 	mutex_exit(&rvd->vdev_stat_lock);
 	/* Note: metaslab_class_space_update moved to metaslab_space_update */
 }
 
 /*
  * Mark a vdev's config as dirty.
  *
  * - An aux vdev (l2cache or spare) has its nvlist in the aux config
  *   regenerated in place and the aux sync flag set.
  * - The root vdev dirties each of its children.
  * - Any other vdev must be top-level; it is put on the pool's config
  *   dirty list (if concrete and not already there) so that it is written
  *   out the next time the vdev configuration is synced.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * Aux vdevs are not on the dirty list; their config is patched
 	 * directly and flagged for sync.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;
 		int idx = 0;
 
 		/* Locate vd's slot in the aux vdev array. */
 		while (idx < sav->sav_count && sav->sav_vdevs[idx] != vd)
 			idx++;
 
 		if (idx == sav->sav_count) {
 			/*
 			 * Not found: we're being removed.  There's nothing
 			 * more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}
 
 		sav->sav_sync = B_TRUE;
 
 		/* The array is stored under L2CACHE, or else under SPARES. */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}
 
 		ASSERT(idx < naux);
 
 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[idx]);
 		aux[idx] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
 
 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	if (vd != rvd) {
 		ASSERT(vd == vd->vdev_top);
 
 		if (!list_link_active(&vd->vdev_config_dirty_node) &&
 		    vdev_is_concrete(vd)) {
 			list_insert_head(&spa->spa_config_dirty_list, vd);
 		}
 		return;
 	}
 
 	/* Root vdev: dirty every child. */
 	for (int c = 0; c < rvd->vdev_children; c++)
 		vdev_config_dirty(rvd->vdev_child[c]);
 }
 
 /*
  * Take vd off the pool's config dirty list.  The vdev must currently be
  * on the list, and the caller must satisfy the same SCL_CONFIG locking
  * rule as when the list is added to: writer, or reader from sync context.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(spa_config_held(pool, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_CONFIG, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
 
 	list_remove(&pool->spa_config_dirty_list, vd);
 }
 
 /*
  * Put a top-level vdev on the pool's state dirty list, so that the next
  * pass of spa_sync() can turn it into a vdev_config_dirty().  State
  * changes are tracked separately from larger config changes because they
  * need much less locking and are common for administrative actions.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(spa_writeable(pool));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * SCL_STATE protects the state list: callers hold it as writer, or
 	 * are the (single) sync thread holding it as reader, which is
 	 * enough to guarantee mutual exclusion.
 	 */
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 
 	/* Already queued: nothing to do. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		return;
 
 	/* Only concrete vdevs are ever queued. */
 	if (vdev_is_concrete(vd))
 		list_insert_head(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Take vd off the pool's state dirty list.  The vdev must currently be on
  * the list, and the caller must hold SCL_STATE as writer, or as reader
  * from sync context.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
 
 	list_remove(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Recompute the state of vd from the state of its children, then do the
  * same for each ancestor in turn.  A vdev with no children is left alone
  * and only its ancestors are processed.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int n_faulted = 0;
 	int n_degraded = 0;
 	int n_corrupt = 0;
 
 	if (vd->vdev_children > 0) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *kid = vd->vdev_child[c];
 			boolean_t unusable;
 
 			/* Holes and indirect vdevs don't get a vote. */
 			if (!vdev_is_concrete(kid))
 				continue;
 
 			unusable = !vdev_readable(kid) ||
 			    (!vdev_writeable(kid) && spa_writeable(spa));
 
 			if (unusable) {
 				/*
 				 * Root special: an unusable top-level log
 				 * device only degrades the root vdev rather
 				 * than faulting it.
 				 */
 				if (kid->vdev_islog && vd == rvd)
 					n_degraded++;
 				else
 					n_faulted++;
 			} else if (kid->vdev_state <= VDEV_STATE_DEGRADED) {
 				n_degraded++;
 			}
 
 			if (kid->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
 				n_corrupt++;
 		}
 
 		vd->vdev_ops->vdev_op_state_change(vd, n_faulted, n_degraded);
 
 		/*
 		 * Root special: when a top-level vdev can't be opened
 		 * because of corrupted metadata, report the root's aux
 		 * state as 'corrupt' instead of 'insufficient replicas'.
 		 */
 		if (n_corrupt && vd == rvd &&
 		    rvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}
 
 	if (vd->vdev_parent != NULL)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  * vd     - the vdev whose state is being set.
  * isopen - B_TRUE when called as part of an open; suppresses the call to
  *          vdev_propagate_state() on the parent.
  * state  - the new state.
  * aux    - auxiliary state, stored in vd->vdev_stat.vs_aux.
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	if (state == vd->vdev_state) {
 		/*
 		 * Since vdev_offline() code path is already in an offline
 		 * state we can miss a statechange event to OFFLINE. Check
 		 * the previous state to catch this condition.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (state == VDEV_STATE_OFFLINE) &&
 		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
 			/* post an offline state change */
 			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
 		}
 		/* No state transition: only the aux value is refreshed. */
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/* Remember the state we are leaving; it is reported further down. */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Pick the ereport class matching the aux reason. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			case VDEV_AUX_BAD_ASHIFT:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
 				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
 			    save_state);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	/*
 	 * Notify ZED of any significant state-change on a leaf vdev.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/* preserve original state from a vdev_reopen() */
 		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
 		    (vd->vdev_prevstate != vd->vdev_state) &&
 		    (save_state <= VDEV_STATE_CLOSED))
 			save_state = vd->vdev_prevstate;
 
 		/* filter out state change due to initial vdev_open */
 		if (save_state > VDEV_STATE_CLOSED)
 			zfs_post_state_change(spa, vd, save_state);
 	}
 
 	/* Outside of an open, let the parent re-evaluate its own state. */
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Return B_TRUE if every child of the (non-leaf) vdev is in the OFFLINE
  * state.  A vdev with no children yields B_TRUE.
  */
 boolean_t
 vdev_children_are_offline(vdev_t *vd)
 {
 	uint64_t idx = 0;
 
 	ASSERT(!vd->vdev_ops->vdev_op_leaf);
 
 	/* Advance past offline children; stop at the first that isn't. */
 	while (idx < vd->vdev_children &&
 	    vd->vdev_child[idx]->vdev_state == VDEV_STATE_OFFLINE)
 		idx++;
 
 	return (idx < vd->vdev_children ? B_FALSE : B_TRUE);
 }
 
 /*
  * Check that the vdev configuration can support a root pool; partial
  * configurations are not supported.  Returns B_FALSE if vd or any vdev
  * below it is a non-leaf of type VDEV_TYPE_MISSING.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf &&
 	    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_MISSING) == 0)
 		return (B_FALSE);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (!vdev_is_bootable(cvd))
 			return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * A vdev is concrete unless its ops vector is one of the indirect, hole,
  * missing or root ops.
  */
 boolean_t
 vdev_is_concrete(vdev_t *vd)
 {
 	vdev_ops_t *ops = vd->vdev_ops;
 
 	if (ops == &vdev_indirect_ops)
 		return (B_FALSE);
 	if (ops == &vdev_hole_ops)
 		return (B_FALSE);
 	if (ops == &vdev_missing_ops)
 		return (B_FALSE);
 	if (ops == &vdev_root_ops)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Determine if a log device has valid content.  A leaf that was removed or
  * faulted in the MOS config is known to have had its content written to
  * the pool already, so it does not count.  Returns B_TRUE as soon as any
  * leaf under vd is neither faulted nor removed.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	boolean_t is_leaf = vd->vdev_ops->vdev_op_leaf;
 
 	if (is_leaf && !vd->vdev_faulted && !vd->vdev_removed)
 		return (B_TRUE);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (vdev_log_state_valid(cvd))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Expand a top-level vdev if possible: refresh its deflate ratio and, when
  * its asize now covers more metaslabs than it currently has, create the
  * additional metaslabs in txg and dirty the config.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vdev_is_concrete(vd));
 
 	vdev_set_deflate_ratio(vd);
 
 	/* Leave alone the vdev referenced by the pool's spa_raidz_expand. */
 	if (spa->spa_raidz_expand != NULL &&
 	    spa->spa_raidz_expand->vre_vdev_id == vd->vdev_id)
 		return;
 
 	/* No room for even one more metaslab. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
 		return;
 
 	if (!vdev_is_concrete(vd))
 		return;
 
 	vdev_metaslab_group_create(vd);
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev off from its parent.  The parent must have more than one
  * child.  If exactly one child remains afterwards, the parent is removed
  * from above that child and the child is flagged as splitting.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *pvd = vd->vdev_parent;
 	vdev_t *first;
 
 	VERIFY3U(pvd->vdev_children, >, 1);
 
 	/* Detach vd, then close the gap it left in the child array. */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	ASSERT3P(pvd->vdev_child, !=, NULL);
 
 	first = pvd->vdev_child[0];
 	if (pvd->vdev_children == 1) {
 		vdev_remove_parent(first);
 		first->vdev_splitting = B_TRUE;
 	}
 
 	vdev_propagate_state(first);
 }
 
 /*
  * Walk the vdev tree below vd and, for every leaf with active I/O, invoke
  * the deadman logic if the I/O at the head of the active list has been
  * outstanding for longer than spa_deadman_synctime().
  */
 void
 vdev_deadman(vdev_t *vd, const char *tag)
 {
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_deadman(vd->vdev_child[c], tag);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_enter(&vq->vq_lock);
 	if (vq->vq_active > 0) {
 		zfs_dbgmsg("slow vdev: %s has %u active IOs",
 		    vd->vdev_path, vq->vq_active);
 
 		/*
 		 * Only the head of the active list is examined; if it has
 		 * been outstanding too long, hand it to zio_deadman().
 		 */
 		zio_t *head_zio = list_head(&vq->vq_active_list);
 		uint64_t elapsed = gethrtime() - head_zio->io_timestamp;
 
 		if (elapsed > spa_deadman_synctime(vd->vdev_spa))
 			zio_deadman(head_zio, tag);
 	}
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * Flag a leaf vdev, and the pool it belongs to, as having a deferred
  * resilver.
  */
 void
 vdev_defer_resilver(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	vd->vdev_resilver_deferred = B_TRUE;
 	pool->spa_resilver_deferred = B_TRUE;
 }
 
 /*
  * Clears the resilver deferred flag on all leaf devs under vd. Returns
  * B_TRUE if we have devices that need to be resilvered and are available to
  * accept resilver I/Os.
  */
 boolean_t
 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t any_needed = B_FALSE;
 
 	/* Children first, accumulating whether any of them needs it. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		any_needed |= vdev_clear_resilver_deferred(vd->vdev_child[c],
 		    tx);
 
 	/*
 	 * At the root, with the feature active: drop the feature
 	 * refcount, dirty the config and clear the pool-wide flag.
 	 */
 	if (vd == spa->spa_root_vdev &&
 	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
 		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 		vdev_config_dirty(vd);
 		spa->spa_resilver_deferred = B_FALSE;
 		return (any_needed);
 	}
 
 	/* Only concrete, non-aux leaves carry the per-vdev flag. */
 	if (vdev_is_concrete(vd) && !vd->vdev_aux &&
 	    vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_resilver_deferred = B_FALSE;
 
 		if (vdev_is_dead(vd) || vd->vdev_offline)
 			return (B_FALSE);
 
 		return (vdev_resilver_needed(vd, NULL, NULL) ?
 		    B_TRUE : B_FALSE);
 	}
 
 	return (any_needed);
 }
 
 /*
  * A range segment is empty when its start and end coincide.
  */
 boolean_t
 vdev_xlate_is_empty(range_seg64_t *rs)
 {
 	return (rs->rs_end == rs->rs_start);
 }
 
 /*
  * Translate a logical range to the first contiguous physical range for the
  * specified vdev_t.
  *
  * The function recurses from vd up to its top-level vdev.  At the top the
  * physical range is seeded with the logical range and the remaining range
  * is set empty; then, as the recursion unwinds, each level asks its
  * parent's vdev-specific translate function to do the real conversion.
  */
 void
 vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	if (vd == vd->vdev_top) {
 		/*
 		 * Top-level vdev: physical starts out equal to logical and
 		 * nothing remains.  Start unwinding.
 		 */
 		remain_rs->rs_start = logical_rs->rs_start;
 		remain_rs->rs_end = logical_rs->rs_start;
 
 		physical_rs->rs_start = logical_rs->rs_start;
 		physical_rs->rs_end = logical_rs->rs_end;
 
 		return;
 	}
 
 	/*
 	 * Not at the top yet: let the ancestors translate first.
 	 */
 	vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs);
 
 	vdev_t *pvd = vd->vdev_parent;
 	ASSERT3P(pvd, !=, NULL);
 	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
 
 	/*
 	 * On the way back down, feed the range produced so far through the
 	 * parent's translate function.  The result lands in a scratch
 	 * segment so that input and output do not alias.
 	 */
 	range_seg64_t xlated = { 0 };
 	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &xlated, remain_rs);
 
 	physical_rs->rs_start = xlated.rs_start;
 	physical_rs->rs_end = xlated.rs_end;
 }
 
 /*
  * Translate logical_rs for vd piece by piece, calling func(arg, range) on
  * every non-empty physical range produced.  Iteration continues with the
  * remainder reported by vdev_xlate() until nothing is left.
  */
 void
 vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg)
 {
 	range_seg64_t todo;
 	range_seg64_t phys;
 	range_seg64_t rest;
 
 	for (todo = *logical_rs; !vdev_xlate_is_empty(&todo); todo = rest) {
 		vdev_xlate(vd, &todo, &phys, &rest);
 
 		/*
 		 * With raidz and dRAID the logical range may not live on
 		 * this leaf at all, in which case the physical range is
 		 * empty and the callback is skipped.
 		 */
 		if (vdev_xlate_is_empty(&phys))
 			continue;
 
 		func(arg, &phys);
 	}
 }
 
 /*
  * Format a display name for vd into buf (buflen bytes) and return buf.
  *
  *  - a vdev with a path uses that path;
  *  - otherwise a vdev whose op type is "root" uses the pool name;
  *  - otherwise an interior (non-leaf) vdev uses "<type>-<id>";
  *  - otherwise (a leaf without a path) the result is the empty string.
  *
  * The last case used to leave buf untouched, so a caller reusing one buffer
  * for several vdevs would see the previous vdev's name.  buf is now always
  * written whenever buflen is positive.
  */
 static char *
 vdev_name(vdev_t *vd, char *buf, int buflen)
 {
 	if (vd->vdev_path != NULL) {
 		strlcpy(buf, vd->vdev_path, buflen);
 	} else if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
 		strlcpy(buf, vd->vdev_spa->spa_name, buflen);
 	} else if (!vd->vdev_ops->vdev_op_leaf) {
 		snprintf(buf, buflen, "%s-%llu",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id);
 	} else if (buflen > 0) {
 		/* Leaf with no path: never hand back stale contents. */
 		buf[0] = '\0';
 	}
 	return (buf);
 }
 
 /*
  * Recursively scan the tree rooted at vdev and report whether any device in
  * it is currently being replaced.  The assertion below requires that the
  * caller holds the spa config lock (some SCL_ALL bit) as reader.
  */
 boolean_t
 vdev_replace_in_progress(vdev_t *vdev)
 {
 	ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
 
 	/* A 'replacing' vdev is a replace in progress by definition. */
 	if (vdev->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	/*
 	 * A 'spare' vdev also counts, except when it has exactly two
 	 * children and the second one (the hot spare) has already finished
 	 * resilvering, i.e. its DTL_MISSING is empty.
 	 */
 	if (vdev->vdev_ops == &vdev_spare_ops) {
 		if (vdev->vdev_children > 2)
 			return (B_TRUE);
 		if (!vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))
 			return (B_TRUE);
 	}
 
 	/* Otherwise the answer is whatever any subtree says. */
 	for (int c = 0; c < vdev->vdev_children; c++) {
 		if (vdev_replace_in_progress(vdev->vdev_child[c]))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Add one property entry to nvl under the key propname.  The entry is a
  * nested nvlist holding ZPROP_SOURCE (src) and ZPROP_VALUE, where the value
  * is strval when that is non-NULL and intval otherwise.
  */
 static void
 vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	nvlist_t *entry = fnvlist_alloc();
 
 	fnvlist_add_uint64(entry, ZPROP_SOURCE, src);
 
 	if (strval == NULL)
 		fnvlist_add_uint64(entry, ZPROP_VALUE, intval);
 	else
 		fnvlist_add_string(entry, ZPROP_VALUE, strval);
 
 	/* The temporary entry is released once it has been added to nvl. */
 	fnvlist_add_nvlist(nvl, propname, entry);
 	nvlist_free(entry);
 }
 
 /*
  * Sync-task half of vdev_prop_set(): write the requested properties into
  * the vdev's property ZAP in the MOS and record each change in the pool
  * history.  arg is the request nvlist carrying ZPOOL_VDEV_PROPS_SET_VDEV
  * (guid) and ZPOOL_VDEV_PROPS_SET_PROPS (name/value pairs).
  */
 static void
 vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *req = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t vdev_guid, objid;
 	nvlist_t *nvprops;
 	nvpair_t *elem;
 	vdev_t *vd;
 
 	vdev_guid = fnvlist_lookup_uint64(req, ZPOOL_VDEV_PROPS_SET_VDEV);
 	nvprops = fnvlist_lookup_nvlist(req, ZPOOL_VDEV_PROPS_SET_PROPS);
 
 	/* The vdev may have been removed while this sync task was queued. */
 	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
 	if (vd == NULL)
 		return;
 
 	/*
 	 * Choose the MOS object that stores this vdev's properties: the
 	 * first non-zero of the root, top-level and leaf ZAPs.
 	 */
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		panic("unexpected vdev type");
 	}
 
 	mutex_enter(&spa->spa_props_lock);
 
 	for (elem = nvlist_next_nvpair(nvprops, NULL); elem != NULL;
 	    elem = nvlist_next_nvpair(nvprops, elem)) {
 		const char *propname = nvpair_name(elem);
 		vdev_prop_t prop = vdev_name_to_prop(propname);
 		const char *strval;
 		uint64_t intval;
 
 		if (prop == VDEV_PROP_USERPROP) {
 			/* Names that are not valid user properties are skipped. */
 			if (!vdev_prop_user(propname))
 				continue;
 
 			strval = fnvpair_value_string(elem);
 			if (strlen(strval) == 0) {
 				/* remove the property if value == "" */
 				(void) zap_remove(mos, objid, propname, tx);
 			} else {
 				VERIFY0(zap_update(mos, objid, propname,
 				    1, strlen(strval) + 1, strval, tx));
 			}
 			spa_history_log_internal(spa, "vdev set", tx,
 			    "vdev_guid=%llu: %s=%s",
 			    (u_longlong_t)vdev_guid, nvpair_name(elem),
 			    strval);
 			continue;
 		}
 
 		/*
 		 * Native property: store it under its normalized name.  The
 		 * history entry still uses the name supplied in the request.
 		 */
 		propname = vdev_prop_to_name(prop);
 		zprop_type_t proptype = vdev_prop_get_type(prop);
 
 		if (nvpair_type(elem) == DATA_TYPE_STRING) {
 			ASSERT(proptype == PROP_TYPE_STRING);
 			strval = fnvpair_value_string(elem);
 			VERIFY0(zap_update(mos, objid, propname,
 			    1, strlen(strval) + 1, strval, tx));
 			spa_history_log_internal(spa, "vdev set", tx,
 			    "vdev_guid=%llu: %s=%s",
 			    (u_longlong_t)vdev_guid, nvpair_name(elem),
 			    strval);
 		} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 			intval = fnvpair_value_uint64(elem);
 
 			/* An index value must map to a known string. */
 			if (proptype == PROP_TYPE_INDEX) {
 				const char *unused;
 				VERIFY0(vdev_prop_index_to_string(
 				    prop, intval, &unused));
 			}
 			VERIFY0(zap_update(mos, objid, propname,
 			    sizeof (uint64_t), 1, &intval, tx));
 			spa_history_log_internal(spa, "vdev set", tx,
 			    "vdev_guid=%llu: %s=%lld",
 			    (u_longlong_t)vdev_guid,
 			    nvpair_name(elem), (longlong_t)intval);
 		} else {
 			panic("invalid vdev property type %u",
 			    nvpair_type(elem));
 		}
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Validate and apply a request to set properties on a vdev.
  *
  * innvl must carry ZPOOL_VDEV_PROPS_SET_VDEV (the vdev guid) and
  * ZPOOL_VDEV_PROPS_SET_PROPS (an nvlist of name/value pairs); EINVAL is
  * returned if either is missing, if the guid cannot be found, or if the
  * vdev passed in has no ZAP to store properties in.
  *
  * Each pair is then checked in turn.  The properties handled in the switch
  * below are applied right away, either to the in-core vdev or through the
  * matching spa_vdev_*() routine.  On the first failure an entry keyed by the
  * offending property name is added to outnvl (its value is the string that
  * was parsed, if any, otherwise the errno) and the errno is returned.
  *
  * When every pair passes, vdev_props_set_sync() is run as a sync task to
  * persist the values and the result of dsl_sync_task() is returned.
  */
 int
 vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	spa_t *spa;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
 	nvlist_t *nvprops;
 	int error = 0;
 
 	/* Check the pointer before it is dereferenced, not after. */
 	ASSERT(vd != NULL);
 	spa = vd->vdev_spa;
 
 	/* Check that vdev has a zap we can use */
 	if (vd->vdev_root_zap == 0 &&
 	    vd->vdev_top_zap == 0 &&
 	    vd->vdev_leaf_zap == 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
 	    &nvprops) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* From here on vd is the vdev named by the guid in the request. */
 	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
 		vdev_prop_t prop = vdev_name_to_prop(propname);
 		uint64_t intval = 0;
 		const char *strval = NULL;
 
 		/* Unknown names are only accepted as user properties. */
 		if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
 			error = EINVAL;
 			goto end;
 		}
 
 		if (vdev_prop_readonly(prop)) {
 			error = EROFS;
 			goto end;
 		}
 
 		/* Special Processing */
 		switch (prop) {
 		case VDEV_PROP_PATH:
 			/* A vdev without a path cannot be given one here. */
 			if (vd->vdev_path == NULL) {
 				error = EROFS;
 				break;
 			}
 			if (nvpair_value_string(elem, &strval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/* New path must start with /dev/ */
 			if (strncmp(strval, "/dev/", 5)) {
 				error = EINVAL;
 				break;
 			}
 			error = spa_vdev_setpath(spa, vdev_guid, strval);
 			break;
 		case VDEV_PROP_ALLOCATING:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * "allocating" is the inverse of vdev_noalloc, so a
 			 * value that differs from vdev_noalloc means the
 			 * vdev is already in the requested state.
 			 */
 			if (intval != vd->vdev_noalloc)
 				break;
 			if (intval == 0)
 				error = spa_vdev_noalloc(spa, vdev_guid);
 			else
 				error = spa_vdev_alloc(spa, vdev_guid);
 			break;
 		case VDEV_PROP_FAILFAST:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/* Only the low bit is kept. */
 			vd->vdev_failfast = intval & 1;
 			break;
 		case VDEV_PROP_CHECKSUM_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_checksum_n = intval;
 			break;
 		case VDEV_PROP_CHECKSUM_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_checksum_t = intval;
 			break;
 		case VDEV_PROP_IO_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_io_n = intval;
 			break;
 		case VDEV_PROP_IO_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_io_t = intval;
 			break;
 		case VDEV_PROP_SLOW_IO_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_slow_io_n = intval;
 			break;
 		case VDEV_PROP_SLOW_IO_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_slow_io_t = intval;
 			break;
 		default:
 			/* Most processing is done in vdev_props_set_sync */
 			break;
 		}
 end:
 		/* Report the failing property to the caller and stop. */
 		if (error != 0) {
 			intval = error;
 			vdev_prop_add_list(outnvl, propname, strval, intval, 0);
 			return (error);
 		}
 	}
 
 	/* The literal 6 is passed as dsl_sync_task()'s fifth argument. */
 	return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
 	    innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Report vdev properties into outnvl.
  *
  * innvl must carry ZPOOL_VDEV_PROPS_GET_VDEV, otherwise EINVAL is returned
  * (the guid itself is not used any further here; vd is the vdev that is
  * queried).  EINVAL is also returned when vd has no property ZAP.
  *
  * If innvl carries ZPOOL_VDEV_PROPS_GET_PROPS, only the properties named in
  * that list are reported: read-only ones are computed from the in-core vdev,
  * the rest are read from the ZAP (or fall back to their default).  The walk
  * over the list stops at the first property that leaves a non-zero error.
  *
  * Without a property list, every string-valued entry of the vdev's property
  * ZAP is reported.
  *
  * Returns 0 on success.  ENOENT is never returned to the caller; any other
  * error left in err when the walk ends is returned as-is.
  */
 int
 vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	spa_t *spa;
 	objset_t *mos;
 	int err = 0;
 	uint64_t objid;
 	uint64_t vdev_guid;
 	nvpair_t *elem = NULL;
 	nvlist_t *nvprops = NULL;
 	uint64_t intval = 0;
 	char *strval = NULL;
 	const char *propname = NULL;
 	vdev_prop_t prop;
 
 	/* Check the pointer before it is dereferenced, not after. */
 	ASSERT(vd != NULL);
 	spa = vd->vdev_spa;
 	mos = spa->spa_meta_objset;
 	ASSERT(mos != NULL);
 
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* The property list is optional; nvprops stays NULL without it. */
 	nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
 
 	/* Use the first non-zero of the root, top-level and leaf ZAPs. */
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 	ASSERT(objid != 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	if (nvprops != NULL) {
 		char namebuf[64] = { 0 };
 
 		while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 			intval = 0;
 			strval = NULL;
 			propname = nvpair_name(elem);
 			prop = vdev_name_to_prop(propname);
 			zprop_source_t src = ZPROP_SRC_DEFAULT;
 			uint64_t integer_size, num_integers;
 
 			/*
 			 * Cases that end in "continue" go straight to the
 			 * next property; cases that end in "break" fall to
 			 * the error check after the switch.
 			 */
 			switch (prop) {
 			/* Special Read-only Properties */
 			case VDEV_PROP_NAME:
 				strval = vdev_name(vd, namebuf,
 				    sizeof (namebuf));
 				if (strval == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname, strval, 0,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_CAPACITY:
 				/* percent used */
 				intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
 				    (vd->vdev_stat.vs_alloc * 100 /
 				    vd->vdev_stat.vs_dspace);
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_STATE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_state, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_GUID:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_guid, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ASIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_asize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PSIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_psize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ASHIFT:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_ashift, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_SIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_dspace -
 				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ALLOCATED:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_EXPANDSZ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FRAGMENTATION:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_fragmentation,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PARITY:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vdev_get_nparity(vd), ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PATH:
 				if (vd->vdev_path == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_path, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_DEVID:
 				if (vd->vdev_devid == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_devid, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PHYS_PATH:
 				if (vd->vdev_physpath == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_physpath, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ENC_PATH:
 				if (vd->vdev_enc_sysfs_path == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FRU:
 				if (vd->vdev_fru == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_fru, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PARENT:
 				if (vd->vdev_parent != NULL) {
 					strval = vdev_name(vd->vdev_parent,
 					    namebuf, sizeof (namebuf));
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, ZPROP_SRC_NONE);
 				}
 				continue;
 			case VDEV_PROP_CHILDREN:
 				/*
 				 * Build a comma-separated list of child
 				 * names in a zeroed scratch buffer; nothing
 				 * is reported when there are no children.
 				 */
 				if (vd->vdev_children > 0)
 					strval = kmem_zalloc(ZAP_MAXVALUELEN,
 					    KM_SLEEP);
 				for (uint64_t i = 0; i < vd->vdev_children;
 				    i++) {
 					const char *vname;
 
 					vname = vdev_name(vd->vdev_child[i],
 					    namebuf, sizeof (namebuf));
 					if (vname == NULL)
 						vname = "(unknown)";
 					if (strlen(strval) > 0)
 						strlcat(strval, ",",
 						    ZAP_MAXVALUELEN);
 					strlcat(strval, vname, ZAP_MAXVALUELEN);
 				}
 				if (strval != NULL) {
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, ZPROP_SRC_NONE);
 					kmem_free(strval, ZAP_MAXVALUELEN);
 				}
 				continue;
 			case VDEV_PROP_NUMCHILDREN:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_children, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_READ_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_read_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_WRITE_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_write_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_CHECKSUM_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_checksum_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_INITIALIZE_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_initialize_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_TRIM_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_trim_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_SLOW_IOS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_slow_ios,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_NULL:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_READ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_WRITE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_CLAIM:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
 				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_NULL:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_READ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_WRITE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_CLAIM:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
 				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_REMOVING:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_removing, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_RAIDZ_EXPANDING:
 				/* Only expose this for raidz */
 				if (vd->vdev_ops == &vdev_raidz_ops) {
 					vdev_prop_add_list(outnvl, propname,
 					    NULL, vd->vdev_rz_expanding,
 					    ZPROP_SRC_NONE);
 				}
 				continue;
 			case VDEV_PROP_TRIM_SUPPORT:
 				/* only valid for leaf vdevs */
 				if (vd->vdev_ops->vdev_op_leaf) {
 					vdev_prop_add_list(outnvl, propname,
 					    NULL, vd->vdev_has_trim,
 					    ZPROP_SRC_NONE);
 				}
 				continue;
 			/* Numeric Properties */
 			case VDEV_PROP_ALLOCATING:
 				/* Leaf vdevs cannot have this property */
 				if (vd->vdev_mg == NULL &&
 				    vd->vdev_top != NULL) {
 					src = ZPROP_SRC_NONE;
 					intval = ZPROP_BOOLEAN_NA;
 				} else {
 					err = vdev_prop_get_int(vd, prop,
 					    &intval);
 					if (err && err != ENOENT)
 						break;
 
 					if (intval ==
 					    vdev_prop_default_numeric(prop))
 						src = ZPROP_SRC_DEFAULT;
 					else
 						src = ZPROP_SRC_LOCAL;
 				}
 
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, src);
 				break;
 			case VDEV_PROP_FAILFAST:
 				src = ZPROP_SRC_LOCAL;
 				strval = NULL;
 
 				/* A missing ZAP entry means "use the default". */
 				err = zap_lookup(mos, objid, nvpair_name(elem),
 				    sizeof (uint64_t), 1, &intval);
 				if (err == ENOENT) {
 					intval = vdev_prop_default_numeric(
 					    prop);
 					err = 0;
 				} else if (err) {
 					break;
 				}
 				if (intval == vdev_prop_default_numeric(prop))
 					src = ZPROP_SRC_DEFAULT;
 
 				vdev_prop_add_list(outnvl, propname, strval,
 				    intval, src);
 				break;
 			case VDEV_PROP_CHECKSUM_N:
 			case VDEV_PROP_CHECKSUM_T:
 			case VDEV_PROP_IO_N:
 			case VDEV_PROP_IO_T:
 			case VDEV_PROP_SLOW_IO_N:
 			case VDEV_PROP_SLOW_IO_T:
 				err = vdev_prop_get_int(vd, prop, &intval);
 				if (err && err != ENOENT)
 					break;
 
 				if (intval == vdev_prop_default_numeric(prop))
 					src = ZPROP_SRC_DEFAULT;
 				else
 					src = ZPROP_SRC_LOCAL;
 
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, src);
 				break;
 			/* Text Properties */
 			case VDEV_PROP_COMMENT:
 				/* Exists in the ZAP below */
 				/* FALLTHRU */
 			case VDEV_PROP_USERPROP:
 				/* User Properties */
 				src = ZPROP_SRC_LOCAL;
 
 				/* Size the value first, then read it. */
 				err = zap_length(mos, objid, nvpair_name(elem),
 				    &integer_size, &num_integers);
 				if (err)
 					break;
 
 				switch (integer_size) {
 				case 8:
 					/* User properties cannot be integers */
 					err = EINVAL;
 					break;
 				case 1:
 					/* string property */
 					strval = kmem_alloc(num_integers,
 					    KM_SLEEP);
 					err = zap_lookup(mos, objid,
 					    nvpair_name(elem), 1,
 					    num_integers, strval);
 					if (err) {
 						kmem_free(strval,
 						    num_integers);
 						break;
 					}
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, src);
 					kmem_free(strval, num_integers);
 					break;
 				}
 				break;
 			default:
 				err = ENOENT;
 				break;
 			}
 			/*
 			 * Any error left over from the switch, ENOENT
 			 * included, ends the walk over the requested list.
 			 */
 			if (err)
 				break;
 		}
 	} else {
 		/*
 		 * Get all properties from the MOS vdev property object.
 		 */
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 		for (zap_cursor_init(&zc, mos, objid);
 		    (err = zap_cursor_retrieve(&zc, za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			intval = 0;
 			strval = NULL;
 			zprop_source_t src = ZPROP_SRC_DEFAULT;
 			propname = za->za_name;
 
 			switch (za->za_integer_length) {
 			case 8:
 				/* We do not allow integer user properties */
 				/* This is likely an internal value */
 				break;
 			case 1:
 				/* string property */
 				strval = kmem_alloc(za->za_num_integers,
 				    KM_SLEEP);
 				err = zap_lookup(mos, objid, za->za_name, 1,
 				    za->za_num_integers, strval);
 				/*
 				 * NOTE(review): this break only leaves the
 				 * switch; the cursor loop carries on and err
 				 * is overwritten by the next retrieve.
 				 */
 				if (err) {
 					kmem_free(strval, za->za_num_integers);
 					break;
 				}
 				vdev_prop_add_list(outnvl, propname, strval, 0,
 				    src);
 				kmem_free(strval, za->za_num_integers);
 				break;
 
 			default:
 				break;
 			}
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 	/* ENOENT is not reported as a failure of the whole call. */
 	if (err && err != ENOENT) {
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * vdev state-change entry points published with EXPORT_SYMBOL()
  * (presumably for use by other kernel modules -- confirm against the
  * platform's definition of the macro).
  */
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
 
 /*
  * Metaslab sizing tunables for top-level vdevs.  The trailing string of
  * each ZFS_MODULE_PARAM() is the parameter's description.
  */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
 	"Target number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
 	"Default lower limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
 	"Default upper limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
 	"Minimum number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
 	"Practical upper limit of total metaslabs per top-level vdev");
 
 /*
  * Per-second rate limits for the event classes named in each description.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
 	"Rate limit slow IO (delay) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
 	"Rate limit hung IO (deadman) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
 	"Rate limit Direct I/O write verify events to this many per second");
 
-/* BEGIN CSTYLED */
 /* Direct I/O write verification switch; see the description string. */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
 	"Direct I/O writes will perform checksum verification before "
 	"committing write");
 
 /* Rate limit for checksum events, in the same style as its siblings. */
 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
 	"Rate limit checksum events to this many checksum errors per second "
 	"(do not set below ZED threshold).");
-/* END CSTYLED */
 
 /*
  * Assorted pool and vdev tunables; the trailing string of each entry is
  * the parameter's description.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
 	"Ignore errors during resilver/scrub");
 
 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 	"Bypass vdev_validate()");
 
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
 	"Disable cache flushes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
 	"Minimum number of metaslabs required to dedicate one for log blocks");
 
-/* BEGIN CSTYLED */
 /*
  * The auto-ashift limits are registered with ZFS_MODULE_PARAM_CALL(),
  * naming param_set_min_auto_ashift / param_set_max_auto_ashift and
  * param_get_uint as their handlers instead of a plain typed parameter.
  */
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
 	param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
 	"Minimum ashift used when creating new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
 	param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
 	"Maximum ashift used when optimizing for logical -> physical sector "
 	"size on new top-level vdevs");
-/* END CSTYLED */
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index e3dba0257b21..cd24f97ae7cd 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1,1925 +1,1923 @@
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/metaslab.h>
 #include <sys/dmu.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zap.h>
 #include <sys/abd.h>
 #include <sys/zthr.h>
 #include <sys/fm/fs/zfs.h>
 
 /*
  * An indirect vdev corresponds to a vdev that has been removed.  Since
  * we cannot rewrite block pointers of snapshots, etc., we keep a
  * mapping from old location on the removed device to the new location
  * on another device in the pool and use this mapping whenever we need
  * to access the DVA.  Unfortunately, this mapping did not respect
  * logical block boundaries when it was first created, and so a DVA on
  * this indirect vdev may be "split" into multiple sections that each
  * map to a different location.  As a consequence, not all DVAs can be
  * translated to an equivalent new DVA.  Instead we must provide a
  * "vdev_remap" operation that executes a callback on each contiguous
  * segment of the new location.  This function is used in multiple ways:
  *
  *  - I/Os to this vdev use the callback to determine where the
  *    data is now located, and issue child I/Os for each segment's new
  *    location.
  *
  *  - frees and claims to this vdev use the callback to free or claim
  *    each mapped segment.  (Note that we don't actually need to claim
  *    log blocks on indirect vdevs, because we don't allocate to
  *    removing vdevs.  However, zdb uses zio_claim() for its leak
  *    detection.)
  */
 
 /*
  * "Big theory statement" for how we mark blocks obsolete.
  *
  * When a block on an indirect vdev is freed or remapped, a section of
  * that vdev's mapping may no longer be referenced (aka "obsolete").  We
  * keep track of how much of each mapping entry is obsolete.  When
  * an entry becomes completely obsolete, we can remove it, thus reducing
  * the memory used by the mapping.  The complete picture of obsolescence
  * is given by the following data structures, described below:
  *  - the entry-specific obsolete count
  *  - the vdev-specific obsolete spacemap
  *  - the pool-specific obsolete bpobj
  *
  * == On disk data structures used ==
  *
  * We track the obsolete space for the pool using several objects.  Each
  * of these objects is created on demand and freed when no longer
  * needed, and is assumed to be empty if it does not exist.
  * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
  *
  *  - Each vic_mapping_object (associated with an indirect vdev) can
  *    have a vimp_counts_object.  This is an array of uint32_t's
  *    with the same number of entries as the vic_mapping_object.  When
  *    the mapping is condensed, entries from the vic_obsolete_sm_object
  *    (see below) are folded into the counts.  Therefore, each
  *    obsolete_counts entry tells us the number of bytes in the
  *    corresponding mapping entry that were not referenced when the
  *    mapping was last condensed.
  *
  *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
  *    This is a space map containing an alloc entry for every DVA that
  *    has been obsoleted since the last time this indirect vdev was
  *    condensed.  We use this object in order to improve performance
  *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
  *    offset of the vimp_counts_object, we only need to append an entry
  *    to the end of this object.  When a DVA becomes obsolete, it is
  *    added to the obsolete space map.  This happens when the DVA is
  *    freed, remapped and not referenced by a snapshot, or the last
  *    snapshot referencing it is destroyed.
  *
  *  - Each dataset can have a ds_remap_deadlist object.  This is a
  *    deadlist object containing all blocks that were remapped in this
  *    dataset but referenced in a previous snapshot.  Blocks can *only*
  *    appear on this list if they were remapped (dsl_dataset_block_remapped);
  *    blocks that were killed in a head dataset are put on the normal
  *    ds_deadlist and marked obsolete when they are freed.
  *
  *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
  *    in the pool that need to be marked obsolete.  When a snapshot is
  *    destroyed, we move some of the ds_remap_deadlist to the obsolete
  *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
  *    asynchronously process the obsolete bpobj, moving its entries to
  *    the specific vdevs' obsolete space maps.
  *
  * == Summary of how we mark blocks as obsolete ==
  *
  * - When freeing a block: if any DVA is on an indirect vdev, append to
  *   vic_obsolete_sm_object.
  * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
  *   references; otherwise append to vic_obsolete_sm_object).
  * - When freeing a snapshot: move parts of ds_remap_deadlist to
  *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
  * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
  *   individual vdev's vic_obsolete_sm_object.
  */
 
 /*
  * "Big theory statement" for how we condense indirect vdevs.
  *
  * Condensing an indirect vdev's mapping is the process of determining
  * the precise counts of obsolete space for each mapping entry (by
  * integrating the obsolete spacemap into the obsolete counts) and
  * writing out a new mapping that contains only referenced entries.
  *
  * We condense a vdev when we expect the mapping to shrink (see
  * vdev_indirect_should_condense()), but only perform one condense at a
  * time to limit the memory usage.  In addition, we use a separate
  * open-context thread (spa_condense_indirect_thread) to incrementally
  * create the new mapping object in a way that minimizes the impact on
  * the rest of the system.
  *
  * == Generating a new mapping ==
  *
  * To generate a new mapping, we follow these steps:
  *
  * 1. Save the old obsolete space map and create a new mapping object
  *    (see spa_condense_indirect_start_sync()).  This initializes the
  *    spa_condensing_indirect_phys with the "previous obsolete space map",
  *    which is now read only.  Newly obsolete DVAs will be added to a
  *    new (initially empty) obsolete space map, and will not be
  *    considered as part of this condense operation.
  *
  * 2. Construct in memory the precise counts of obsolete space for each
  *    mapping entry, by incorporating the obsolete space map into the
  *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
  *
  * 3. Iterate through each mapping entry, writing to the new mapping any
  *    entries that are not completely obsolete (i.e. which don't have
  *    obsolete count == mapping length).  (See
  *    spa_condense_indirect_generate_new_mapping().)
  *
  * 4. Destroy the old mapping object and switch over to the new one
  *    (spa_condense_indirect_complete_sync).
  *
  * == Restarting from failure ==
  *
  * To restart the condense when we import/open the pool, we must start
  * at the 2nd step above: reconstruct the precise counts in memory,
  * based on the space map + counts.  Then in the 3rd step, we start
  * iterating where we left off: at vimp_max_offset of the new mapping
  * object.
  */
 
 /*
  * Master switch for condensing.  When this is zero,
  * vdev_indirect_should_condense() returns B_FALSE right away, without
  * examining the vdev.
  */
 static int zfs_condense_indirect_vdevs_enable = B_TRUE;
 
 /*
  * Condense if at least this percent of the bytes in the mapping is
  * obsolete.  With the default of 25%, the amount of space mapped
  * will be reduced to 1% of its original size after at most 16
  * condenses.  Higher values will condense less often (causing less
  * i/o); lower values will reduce the mapping size more quickly.
  */
 static uint_t zfs_condense_indirect_obsolete_pct = 25;
 
 /*
  * Condense if the obsolete space map takes up more than this amount of
  * space on disk (logically).  This limits the amount of disk space
  * consumed by the obsolete space map; the default of 1GB is small enough
  * that we typically don't mind "wasting" it.
  */
 static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
 
 /*
  * Don't bother condensing if the mapping uses less than this amount of
  * memory.  The default of 128KB is considered a "trivial" amount of
  * memory and not worth reducing.
  */
 static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
 
 /*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a condense (which might otherwise
  * complete too quickly).  If used to reduce the performance impact of
  * condensing in production, a maximum value of 1 should be sufficient.
  */
 static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0;
 
 /*
  * If an indirect split block contains more than this many possible unique
  * combinations when being reconstructed, consider it too computationally
  * expensive to check them all. Instead, try at most 100 randomly-selected
  * combinations each time the block is accessed.  This allows all segment
  * copies to participate fairly in the reconstruction when all combinations
  * cannot be checked and prevents repeated use of one bad copy.
  */
 uint_t zfs_reconstruct_indirect_combinations_max = 4096;
 
 /*
  * Enable to simulate damaged segments and validate reconstruction.  This
  * is intentionally not exposed as a module parameter.
  */
 unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
 
 /*
  * The indirect_child_t represents the vdev that we will read from, when we
  * need to read all copies of the data (e.g. for scrub or reconstruction).
  * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
  * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
  * ic_vdev is a child of the mirror.
  */
 typedef struct indirect_child {
 	/*
 	 * Data buffer for this child.  May be NULL; when it is not,
 	 * vdev_indirect_map_free() releases it with abd_free().
 	 */
 	abd_t *ic_data;
 	vdev_t *ic_vdev; /* vdev this copy is read from */
 
 	/*
 	 * ic_duplicate is NULL when the ic_data contents are unique.  When
 	 * the child is determined to be a duplicate, it references the
 	 * primary child instead.
 	 */
 	struct indirect_child *ic_duplicate;
 	list_node_t ic_node; /* node on is_unique_child */
 	int ic_error; /* set when a child does not contain the data */
 } indirect_child_t;
 
 /*
  * The indirect_split_t represents one mapped segment of an i/o to the
  * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
  * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
  * For split blocks, there will be several of these.
  */
 typedef struct indirect_split {
 	list_node_t is_node; /* link on iv_splits */
 
 	/*
 	 * is_split_offset is the offset into the i/o.
 	 * This is the sum of the previous splits' is_size's.
 	 */
 	uint64_t is_split_offset;
 
 	vdev_t *is_vdev; /* top-level vdev */
 	uint64_t is_target_offset; /* offset on is_vdev */
 	uint64_t is_size; /* size of this segment of the i/o */
 	int is_children; /* number of entries in is_child[] */
 	int is_unique_children; /* number of entries in is_unique_child */
 	list_t is_unique_child; /* indirect_child_t's, linked by ic_node */
 
 	/*
 	 * is_good_child is the child that we are currently using to
 	 * attempt reconstruction.
 	 */
 	indirect_child_t *is_good_child;
 
 	/*
 	 * Flexible array with is_children entries.  The structure is
 	 * freed with a size of
 	 * offsetof(indirect_split_t, is_child[is_children]).
 	 */
 	indirect_child_t is_child[];
 } indirect_split_t;
 
 /*
  * The indirect_vsd_t is associated with each i/o to the indirect vdev.
  * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
  */
 typedef struct indirect_vsd {
 	boolean_t iv_split_block;
 	boolean_t iv_reconstruct;
 	uint64_t iv_unique_combinations;
 	uint64_t iv_attempts;
 	uint64_t iv_attempts_max;
 
 	/*
 	 * List of indirect_split_t's (linked through is_node).  Every
 	 * entry is removed and freed by vdev_indirect_map_free().
 	 */
 	list_t iv_splits;
 } indirect_vsd_t;
 
 /*
  * vsd_free callback for indirect vdev i/o: tears down the indirect_vsd_t
  * stored in zio->io_vsd, including every split on its iv_splits list and
  * any data buffers still attached to the splits' children.
  */
 static void
 vdev_indirect_map_free(zio_t *zio)
 {
 	indirect_vsd_t *vsd = zio->io_vsd;
 	indirect_split_t *split;
 
 	for (split = list_remove_head(&vsd->iv_splits); split != NULL;
 	    split = list_remove_head(&vsd->iv_splits)) {
 		/* Release the buffers the children still hold. */
 		for (int i = 0; i < split->is_children; i++) {
 			abd_t *data = split->is_child[i].ic_data;
 
 			if (data != NULL)
 				abd_free(data);
 		}
 
 		/*
 		 * The unique-child list only links entries of is_child[],
 		 * so it is emptied without freeing anything.
 		 */
 		while (list_remove_head(&split->is_unique_child) != NULL)
 			continue;
 		list_destroy(&split->is_unique_child);
 
 		/* The size must account for the trailing is_child[] array. */
 		kmem_free(split,
 		    offsetof(indirect_split_t, is_child[split->is_children]));
 	}
 
 	kmem_free(vsd, sizeof (*vsd));
 }
 
 /*
  * Vdev-specific-data callbacks for i/o to an indirect vdev; the free
  * callback releases the indirect_vsd_t and all of its splits.
  */
 static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
 	.vsd_free = vdev_indirect_map_free,
 };
 
 /*
  * Mark the given offset and size as being obsolete.
  */
 void
 vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/* The vdev must have a mapping, and offset must fall inside it. */
 	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
 	ASSERT(size > 0);
 	VERIFY(vdev_indirect_mapping_entry_for_offset(
 	    vd->vdev_indirect_mapping, offset) != NULL);
 
 	/* Nothing is recorded unless the feature is enabled. */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return;
 
 	/* Remember the range in core and dirty the vdev for this txg. */
 	mutex_enter(&vd->vdev_obsolete_lock);
 	range_tree_add(vd->vdev_obsolete_segments, offset, size);
 	mutex_exit(&vd->vdev_obsolete_lock);
 
 	vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
 }
 
 /*
  * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
  * wrapper is provided because the DMU does not know about vdev_t's and
  * cannot directly call vdev_indirect_mark_obsolete.
  */
 void
 spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	/* Translate the DVA's vdev id into the top-level vdev_t. */
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	/* tx is only used to check that we are in syncing context. */
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	/* The DMU can only remap indirect vdevs. */
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	vdev_indirect_mark_obsolete(vd, offset, size);
 }
 
 /*
  * Allocate the in-core condensing state for the pool: one empty list of
  * pending mapping entries per txg slot, plus an open handle on the new
  * mapping object named by spa_condensing_indirect_phys.
  */
 static spa_condensing_indirect_t *
 spa_condensing_indirect_create(spa_t *spa)
 {
 	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
 
 	for (int slot = 0; slot < TXG_SIZE; slot++) {
 		list_t *pending = &sci->sci_new_mapping_entries[slot];
 
 		list_create(pending, sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
 	}
 
 	sci->sci_new_mapping = vdev_indirect_mapping_open(
 	    spa->spa_meta_objset,
 	    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
 
 	return (sci);
 }
 
 /*
  * Release the in-core condensing state: destroy the per-txg entry lists,
  * close the new mapping if this structure still owns it, and free the
  * structure itself.
  */
 static void
 spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
 {
 	for (int slot = 0; slot < TXG_SIZE; slot++) {
 		list_destroy(&sci->sci_new_mapping_entries[slot]);
 	}
 
 	/* sci_new_mapping is NULL once it has been handed to the vdev. */
 	if (sci->sci_new_mapping != NULL) {
 		vdev_indirect_mapping_close(sci->sci_new_mapping);
 	}
 
 	kmem_free(sci, sizeof (*sci));
 }
 
 /*
  * Decide, from syncing context, whether a condense of vd's indirect
  * mapping should be started now.  Returns B_TRUE when either the obsolete
  * percentage or the on-disk size of the obsolete space map crosses its
  * tunable threshold.
  */
 boolean_t
 vdev_indirect_should_condense(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_mapping_t *mapping = vd->vdev_indirect_mapping;
 
 	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
 
 	/*
 	 * Cheap disqualifiers, in order:
 	 *  - condensing is disabled by tunable;
 	 *  - another condense is in flight (only one at a time);
 	 *  - the pool is shutting down;
 	 *  - the vdev is not (yet) an indirect vdev.  The mapping object
 	 *    size must not change while we condense, so vdevs that are
 	 *    still in the middle of being removed do not qualify.
 	 */
 	if (!zfs_condense_indirect_vdevs_enable ||
 	    spa->spa_condensing_indirect != NULL ||
 	    spa_shutting_down(spa) ||
 	    vd->vdev_ops != &vdev_indirect_ops)
 		return (B_FALSE);
 
 	/*
 	 * With no obsolete space map, nothing new has been marked
 	 * obsolete and condensing would be pointless.
 	 */
 	uint64_t obsolete_sm_obj __maybe_unused;
 	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
 	if (vd->vdev_obsolete_sm == NULL) {
 		ASSERT0(obsolete_sm_obj);
 		return (B_FALSE);
 	}
 
 	ASSERT(vd->vdev_obsolete_sm != NULL);
 
 	ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm));
 
 	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(mapping);
 	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
 	uint64_t mapping_size = vdev_indirect_mapping_size(mapping);
 	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
 
 	ASSERT3U(bytes_obsolete, <=, bytes_mapped);
 
 	/*
 	 * Trigger 1: a large share of the mapped bytes is obsolete and the
 	 * mapping is big enough that shrinking it is worth the effort.
 	 */
 	uint64_t obsolete_pct = bytes_obsolete * 100 / bytes_mapped;
 
 	if (obsolete_pct >= zfs_condense_indirect_obsolete_pct &&
 	    mapping_size > zfs_condense_min_mapping_bytes) {
 		zfs_dbgmsg("should condense vdev %llu because obsolete "
 		    "spacemap covers %d%% of %lluMB mapping",
 		    (u_longlong_t)vd->vdev_id,
 		    (int)obsolete_pct,
 		    (u_longlong_t)bytes_mapped / 1024 / 1024);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Trigger 2: the obsolete space map itself has grown too large
 	 * on disk, so condense to reclaim that space.
 	 */
 	if (obsolete_sm_size < zfs_condense_max_obsolete_bytes)
 		return (B_FALSE);
 
 	zfs_dbgmsg("should condense vdev %llu because obsolete sm "
 	    "length %lluMB >= max size %lluMB",
 	    (u_longlong_t)vd->vdev_id,
 	    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
 	    (u_longlong_t)zfs_condense_max_obsolete_bytes /
 	    1024 / 1024);
 	return (B_TRUE);
 }
 
 /*
  * This sync task completes (finishes) a condense, deleting the old
  * mapping and replacing it with the new one.
  */
 static void
 spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_condensing_indirect_t *sci = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	objset_t *mos = spa->spa_meta_objset;
 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
 	/*
 	 * Both entry counts are sampled here, before the old mapping is
 	 * closed and the new one changes hands; they are only used by the
 	 * debug message at the end.
 	 */
 	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
 	uint64_t new_count =
 	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
 	/* Every pending entry must already have been written out. */
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
 	}
 	ASSERT(vic->vic_mapping_object != 0);
 	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
 	ASSERT(scip->scip_next_mapping_object != 0);
 	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 
 	/*
 	 * Reset vdev_indirect_mapping to refer to the new object.
 	 */
 	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
 	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 	vd->vdev_indirect_mapping = sci->sci_new_mapping;
 	rw_exit(&vd->vdev_indirect_rwlock);
 
 	/*
 	 * The vdev now owns the new mapping; clear our reference so that
 	 * spa_condensing_indirect_destroy() below does not close it.
 	 */
 	sci->sci_new_mapping = NULL;
 	/* Free the old on-disk mapping and switch to the new object. */
 	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
 	vic->vic_mapping_object = scip->scip_next_mapping_object;
 	scip->scip_next_mapping_object = 0;
 
 	/*
 	 * The previous obsolete space map has been folded into the new
 	 * mapping's counts, so free it and drop the feature refcount that
 	 * was taken for it.
 	 */
 	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
 	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 	scip->scip_prev_obsolete_sm_object = 0;
 
 	scip->scip_vdev = 0;
 
 	/*
 	 * Remove the on-disk record of the in-progress condense and tear
 	 * down the in-core state.
 	 */
 	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CONDENSING_INDIRECT, tx));
 	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
 	spa->spa_condensing_indirect = NULL;
 
 	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
 	    "new mapping object %llu has %llu entries "
 	    "(was %llu entries)",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
 	    (u_longlong_t)vic->vic_mapping_object,
 	    (u_longlong_t)new_count, (u_longlong_t)old_count);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * This sync task appends entries to the new mapping object.
  */
 static void
 spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_condensing_indirect_t *sci = arg;
 	spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa;
 	/* Entries queued in open context for the txg being synced. */
 	list_t *pending =
 	    &sci->sci_new_mapping_entries[dmu_tx_get_txg(tx) & TXG_MASK];
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
 
 	/* Appending the entries is expected to drain the list. */
 	vdev_indirect_mapping_add_entries(sci->sci_new_mapping, pending, tx);
 	ASSERT(list_is_empty(pending));
 }
 
 /*
  * Open-context function to add one entry to the new mapping.  The new
  * entry will be remembered and written from syncing context.
  */
 static void
 spa_condense_indirect_commit_entry(spa_t *spa,
     vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
 {
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 	vdev_indirect_mapping_entry_t *queued;
 	list_t *pending;
 	dmu_tx_t *tx;
 
 	/* A completely obsolete entry must never be carried over. */
 	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
 
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 	pending = &sci->sci_new_mapping_entries[slot];
 
 	/*
 	 * An empty list means no entry has been queued for this txg yet,
 	 * so the sync task that writes the list to the MOS still has to
 	 * be scheduled.
 	 */
 	if (list_is_empty(pending)) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx),
 		    spa_condense_indirect_commit_sync, sci, tx);
 	}
 
 	/* Queue a copy of the entry together with its obsolete count. */
 	queued = kmem_alloc(sizeof (*queued), KM_SLEEP);
 	queued->vime_mapping = *vimep;
 	queued->vime_obsolete_count = count;
 	list_insert_tail(pending, queued);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Walk the old mapping from start_index and commit every entry that is
  * not completely obsolete to the new mapping.  Stops early, leaving the
  * remaining entries for a later run, if the zthr is cancelled.
  */
 static void
 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
     uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
 	uint64_t num_entries = vdev_indirect_mapping_num_entries(old_mapping);
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
 
 	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
 	    (u_longlong_t)vd->vdev_id,
 	    (u_longlong_t)start_index);
 
 	for (uint64_t idx = start_index; idx < num_entries; idx++) {
 		if (zthr_iscancelled(zthr)) {
 			zfs_dbgmsg("pausing condense of vdev %llu "
 			    "at index %llu", (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)idx);
 			break;
 		}
 
 		vdev_indirect_mapping_entry_phys_t *entry =
 		    &old_mapping->vim_entries[idx];
 		uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
 		uint32_t obsolete = obsolete_counts[idx];
 
 		ASSERT3U(obsolete, <=, entry_size);
 
 		/* Completely obsolete entries are dropped. */
 		if (obsolete >= entry_size)
 			continue;
 
 		spa_condense_indirect_commit_entry(spa, entry, obsolete);
 
 		/*
 		 * Optional per-entry delay, for testing, debugging, or
 		 * throttling (zero by default).
 		 */
 		hrtime_t wake = gethrtime() + MSEC2NSEC(
 		    zfs_condense_indirect_commit_entry_delay_ms);
 		zfs_sleep_until(wake);
 	}
 }
 
 /*
  * zthr check callback: reports whether the pool currently has in-core
  * condensing state, i.e. whether there is a condense to work on.
  */
 static boolean_t
 spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
 {
 	(void) zthr;
 	const spa_t *spa = arg;
 	boolean_t condensing = (spa->spa_condensing_indirect != NULL);
 
 	return (condensing);
 }
 
 /*
  * zthr work callback for condensing.  Rebuilds the in-core obsolete
  * counts for the vdev being condensed, writes the surviving entries to
  * the new mapping (resuming where a previous run stopped), and then runs
  * the sync task that switches the vdev over to the new mapping.  Returns
  * without completing the condense if the zthr has been cancelled.
  */
 static void
 spa_condense_indirect_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_t *vd;
 
 	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
 	/* SCL_VDEV is held only for the duration of the lookup. */
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
 	ASSERT3P(vd, !=, NULL);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	uint32_t *counts;
 	uint64_t start_index;
 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
 	space_map_t *prev_obsolete_sm = NULL;
 
 	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
 	ASSERT(scip->scip_next_mapping_object != 0);
 	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		/*
 		 * The list must start out empty in order for the
 		 * _commit_sync() sync task to be properly registered
 		 * on the first call to _commit_entry(); so it's wise
 		 * to double check and ensure we actually are starting
 		 * with empty lists.
 		 */
 		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
 	}
 
 	/*
 	 * Build the precise per-entry obsolete counts: start from the
 	 * counts loaded for the old mapping and fold in the previous
 	 * obsolete space map, which is only needed for this step.
 	 */
 	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
 	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
 	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
 	if (prev_obsolete_sm != NULL) {
 		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
 		    counts, prev_obsolete_sm);
 	}
 	space_map_close(prev_obsolete_sm);
 
 	/*
 	 * Generate new mapping.  Determine what index to continue from
 	 * based on the max offset that we've already written in the
 	 * new mapping.
 	 */
 	uint64_t max_offset =
 	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
 	if (max_offset == 0) {
 		/* We haven't written anything to the new mapping yet. */
 		start_index = 0;
 	} else {
 		/*
 		 * Pick up from where we left off.
 		 * _entry_for_offset_or_next() returns a pointer into the
 		 * vim_entries array.  If max_offset is greater than any
 		 * of the mappings contained in the table, NULL will be
 		 * returned and that indicates we've exhausted our
 		 * iteration of the old_mapping.
 		 */
 
 		vdev_indirect_mapping_entry_phys_t *entry =
 		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
 		    max_offset);
 
 		if (entry == NULL) {
 			/*
 			 * We've already written the whole new mapping.
 			 * This special value will cause us to skip the
 			 * generate_new_mapping step and just do the sync
 			 * task to complete the condense.
 			 */
 			start_index = UINT64_MAX;
 		} else {
 			/* Convert the entry pointer to an array index. */
 			start_index = entry - old_mapping->vim_entries;
 			ASSERT3U(start_index, <,
 			    vdev_indirect_mapping_num_entries(old_mapping));
 		}
 	}
 
 	spa_condense_indirect_generate_new_mapping(vd, counts,
 	    start_index, zthr);
 
 	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
 
 	/*
 	 * If the zthr has received a cancellation signal while running
 	 * in generate_new_mapping() or at any point after that, then bail
 	 * early. We don't want to complete the condense if the spa is
 	 * shutting down.
 	 */
 	if (zthr_iscancelled(zthr))
 		return;
 
 	/* Switch the vdev over to the new mapping in syncing context. */
 	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 	    spa_condense_indirect_complete_sync, sci, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Sync task to begin the condensing process.
  */
 void
 spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 
 	/* No condense may already be recorded for this pool. */
 	ASSERT0(scip->scip_next_mapping_object);
 	ASSERT0(scip->scip_prev_obsolete_sm_object);
 	ASSERT0(scip->scip_vdev);
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
 	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
 
 	uint64_t obsolete_sm_obj;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
 	ASSERT3U(obsolete_sm_obj, !=, 0);
 
 	/*
 	 * Record which vdev is being condensed and allocate the object
 	 * that will hold its new mapping.
 	 */
 	scip->scip_vdev = vd->vdev_id;
 	scip->scip_next_mapping_object =
 	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
 
 	/*
 	 * The vdev's current obsolete space map becomes the "previous"
 	 * one that this condense will consume.
 	 */
 	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
 
 	/*
 	 * We don't need to allocate a new space map object, since
 	 * vdev_indirect_sync_obsolete will allocate one when needed.
 	 */
 	space_map_close(vd->vdev_obsolete_sm);
 	vd->vdev_obsolete_sm = NULL;
 	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
 
 	/*
 	 * Store the condense state in the pool directory as an array of
 	 * uint64_t's; spa_condense_init() looks it up under the same key.
 	 */
 	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
 	    sizeof (*scip) / sizeof (uint64_t), scip, tx));
 
 	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
 	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
 
 	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
 	    "posm=%llu nm=%llu",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
 	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
 	    (u_longlong_t)scip->scip_next_mapping_object);
 
 	/* Wake the condense thread now that there is state to work on. */
 	zthr_wakeup(spa->spa_condense_zthr);
 }
 
 /*
  * Sync to the given vdev's obsolete space map any segments that are no longer
  * referenced as of the given txg.
  *
  * If the obsolete space map doesn't exist yet, create and open it.
  */
 void
 vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
 
 	ASSERT3U(vic->vic_mapping_object, !=, 0);
 	ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
 
 	uint64_t obsolete_sm_object;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object == 0) {
 		/*
 		 * No obsolete space map exists yet: allocate one, record
 		 * it in the vdev's top-level ZAP, and open it.
 		 */
 		obsolete_sm_object = space_map_alloc(spa->spa_meta_objset,
 		    zfs_vdev_standard_sm_blksz, tx);
 
 		ASSERT(vd->vdev_top_zap != 0);
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
 		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
 		/* Reading it back must now yield the new object. */
 		ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 		ASSERT3U(obsolete_sm_object, !=, 0);
 
 		/* The feature refcount is bumped for each such object. */
 		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
 		    spa->spa_meta_objset, obsolete_sm_object,
 		    0, vd->vdev_asize, 0));
 	}
 
 	ASSERT(vd->vdev_obsolete_sm != NULL);
 	ASSERT3U(obsolete_sm_object, ==,
 	    space_map_object(vd->vdev_obsolete_sm));
 
 	/*
 	 * Write the in-core obsolete segments to the space map as alloc
 	 * entries, then empty the in-core tree.
 	 */
 	space_map_write(vd->vdev_obsolete_sm,
 	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
 }
 
 /*
  * Load the persisted condense state, if any, from the pool directory.
  * A missing entry simply means no condense was in progress.  The in-core
  * state is only created when the pool is writeable.  Returns 0 on
  * success or the zap_lookup() error.
  */
 int
 spa_condense_init(spa_t *spa)
 {
 	spa_condensing_indirect_phys_t *phys =
 	    &spa->spa_condensing_indirect_phys;
 	int err;
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
 	    sizeof (*phys) / sizeof (uint64_t), phys);
 
 	/* No record on disk: nothing to resume. */
 	if (err == ENOENT)
 		return (0);
 
 	if (err != 0)
 		return (err);
 
 	if (spa_writeable(spa)) {
 		spa->spa_condensing_indirect =
 		    spa_condensing_indirect_create(spa);
 	}
 
 	return (0);
 }
 
 /*
  * Discard the pool's in-core condensing state, if there is any.
  */
 void
 spa_condense_fini(spa_t *spa)
 {
 	if (spa->spa_condensing_indirect == NULL)
 		return;
 
 	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
 	spa->spa_condensing_indirect = NULL;
 }
 
 /*
  * Create the pool's condense zthr ("z_indirect_condense"), registering
  * spa_condense_indirect_thread_check() and spa_condense_indirect_thread()
  * as its callbacks with the spa as their argument.  The zthr must not
  * already exist.
  */
 void
 spa_start_indirect_condensing_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
 	spa->spa_condense_zthr = zthr_create("z_indirect_condense",
 	    spa_condense_indirect_thread_check,
 	    spa_condense_indirect_thread, spa, minclsyspri);
 }
 
 /*
  * Gets the obsolete spacemap object from the vdev's ZAP.  On success sm_obj
  * will contain either the obsolete spacemap object or zero if none exists.
  * All other errors are returned to the caller.
  */
 int
 vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj)
 {
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
 		    sizeof (uint64_t), 1, sm_obj);
 
 		/* Found (0), or a real failure: report it as-is. */
 		if (err != ENOENT)
 			return (err);
 	}
 
 	/* No top-level ZAP or no entry in it: there is no object. */
 	*sm_obj = 0;
 	return (0);
 }
 
 /*
  * Report whether the vdev's top-level ZAP marks its obsolete counts as
  * precise.  *are_precise becomes B_TRUE only when the entry exists with a
  * nonzero value; it is B_FALSE when the vdev has no top-level ZAP or the
  * entry is absent.  Any other lookup error is returned to the caller and
  * *are_precise is left untouched.
  */
 int
 vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise)
 {
 	uint64_t precise = 0;
 	int err;
 
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap == 0) {
 		*are_precise = B_FALSE;
 		return (0);
 	}
 
 	err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (precise), 1,
 	    &precise);
 
 	switch (err) {
 	case 0:
 		*are_precise = (precise != 0);
 		return (0);
 	case ENOENT:
 		*are_precise = B_FALSE;
 		return (0);
 	default:
 		return (err);
 	}
 }
 
 /*
  * Close callback installed in vdev_indirect_ops.  It does nothing.
  */
 static void
 vdev_indirect_close(vdev_t *vd)
 {
 	/* The argument is deliberately unused. */
 	(void) vd;
 }
 
 /*
  * Open callback installed in vdev_indirect_ops.  Reports the vdev's size
  * as its allocatable size plus the front and back label areas, and passes
  * through the ashift values already stored on the vdev.  Always succeeds.
  */
 static int
 vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	const uint64_t total = vd->vdev_asize +
 	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 
 	*max_psize = total;
 	*psize = total;
 	*logical_ashift = vd->vdev_ashift;
 	*physical_ashift = vd->vdev_physical_ashift;
 
 	return (0);
 }
 
 /*
  * One pending unit of work for vdev_indirect_remap(): an extent on an
  * indirect vdev that still has to be translated through that vdev's
  * mapping.  Segments are kept on a list used as a stack.
  */
 typedef struct remap_segment {
 	vdev_t *rs_vd;			/* vdev the extent lives on */
 	uint64_t rs_offset;		/* start of the remaining extent */
 	uint64_t rs_asize;		/* size of the remaining extent */
 	uint64_t rs_split_offset;	/* offset passed to the callback */
 	list_node_t rs_node;		/* linkage on the remap stack */
 } remap_segment_t;
 
 /*
  * Allocate (with KM_SLEEP) a remap_segment_t describing the extent of
  * asize bytes at offset on vd.  rs_node is left uninitialized.  The caller
  * owns the result and releases it with kmem_free().
  */
 static remap_segment_t *
 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
 {
 	remap_segment_t *seg;
 
 	seg = kmem_alloc(sizeof (*seg), KM_SLEEP);
 	seg->rs_split_offset = split_offset;
 	seg->rs_asize = asize;
 	seg->rs_offset = offset;
 	seg->rs_vd = vd;
 
 	return (seg);
 }
 
 /*
  * Copy the run of physical mapping entries of indirect vdev vd that covers
  * the extent of asize bytes starting at offset into a freshly allocated
  * array.  The array is returned and the number of entries it holds is
  * stored in *copied_entries.
  *
  * The caller must hold vdev_indirect_rwlock as reader so that condensing
  * cannot change the mapping while it is being copied.  The caller is also
  * responsible for freeing the returned array.
  */
 static vdev_indirect_mapping_entry_phys_t *
 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
     uint64_t asize, uint64_t *copied_entries)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	vdev_indirect_mapping_entry_phys_t *first, *cur, *copy;
 	uint64_t count;
 	size_t nbytes;
 
 	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
 
 	first = vdev_indirect_mapping_entry_for_offset(vim, offset);
 	ASSERT3P(first, !=, NULL);
 
 	/*
 	 * Walk forward one entry at a time, consuming from the extent the
 	 * part each entry covers, until the whole extent is accounted for.
 	 */
 	for (cur = first, count = 0; asize > 0; cur++, count++) {
 		uint64_t src = DVA_MAPPING_GET_SRC_OFFSET(cur);
 		uint64_t len = DVA_GET_ASIZE(&cur->vimep_dst);
 
 		ASSERT3U(offset, >=, src);
 		ASSERT3U(offset, <, src + len);
 
 		uint64_t consumed = MIN(asize, len - (offset - src));
 
 		offset += consumed;
 		asize -= consumed;
 	}
 
 	nbytes = count * sizeof (*first);
 	copy = kmem_alloc(nbytes, KM_SLEEP);
 	memcpy(copy, first, nbytes);
 	*copied_entries = count;
 
 	return (copy);
 }
 
 /*
  * Goes through the relevant indirect mappings until it hits a concrete vdev
  * and issues the callback. On the way to the concrete vdev, if any other
  * indirect vdevs are encountered, then the callback will also be called on
  * each of those indirect vdevs. For example, if the segment is mapped to
  * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
  * mapped to segment B on concrete vdev 2, then the callback will be called on
  * both vdev 1 and vdev 2.
  *
  * While the callback passed to vdev_indirect_remap() is called on every vdev
  * the function encounters, certain callbacks only care about concrete vdevs.
  * These types of callbacks should return immediately and explicitly when they
  * are called on an indirect vdev.
  *
  * Because there is a possibility that a DVA section in the indirect device
  * has been split into multiple sections in our mapping, we keep track
  * of the relevant contiguous segments of the new location (remap_segment_t)
  * in a stack. This way we can call the callback for each of the new sections
  * created by a single section of the indirect device. Note though, that in
  * this scenario the callbacks in each split block won't occur in-order in
  * terms of offset, so callers should not make any assumptions about that.
  *
  * For callbacks that don't handle split blocks and immediately return when
  * they encounter them (as is the case for remap_blkptr_cb), the caller can
  * assume that its callback will be applied from the first indirect vdev
  * encountered to the last one and then the concrete vdev, in that order.
  *
  * The callback is invoked as func(split_offset, dst_vdev, dst_offset, size,
  * arg), where split_offset is the running offset of this piece within the
  * extent originally passed in, and arg is handed through untouched.
  */
 static void
 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
     void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
 {
 	list_t stack;
 	spa_t *spa = vd->vdev_spa;
 
 	list_create(&stack, sizeof (remap_segment_t),
 	    offsetof(remap_segment_t, rs_node));
 
 	/*
 	 * The first segment is processed directly; further segments are
 	 * popped off the stack until it is empty.
 	 */
 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
 	    rs != NULL; rs = list_remove_head(&stack)) {
 		vdev_t *v = rs->rs_vd;
 		uint64_t num_entries = 0;
 
 		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 		ASSERT(rs->rs_asize > 0);
 
 		/*
 		 * Note: As this function can be called from open context
 		 * (e.g. zio_read()), we need the following rwlock to
 		 * prevent the mapping from being changed by condensing.
 		 *
 		 * So we grab the lock and we make a copy of the entries
 		 * that are relevant to the extent that we are working on.
 		 * Once that is done, we drop the lock and iterate over
 		 * our copy of the mapping. Once we are done with the
 		 * remap segment and we free it, we also free our copy
 		 * of the indirect mapping entries that are relevant to it.
 		 *
 		 * This way we don't need to wait until the function is
 		 * finished with a segment, to condense it. In addition, we
 		 * don't need a recursive rwlock for the case that a call to
 		 * vdev_indirect_remap() needs to call itself (through the
 		 * codepath of its callback) for the same vdev in the middle
 		 * of its execution.
 		 */
 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
 		ASSERT3P(v->vdev_indirect_mapping, !=, NULL);
 
 		vdev_indirect_mapping_entry_phys_t *mapping =
 		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
 		    rs->rs_offset, rs->rs_asize, &num_entries);
 		ASSERT3P(mapping, !=, NULL);
 		ASSERT3U(num_entries, >, 0);
 		rw_exit(&v->vdev_indirect_rwlock);
 
 		for (uint64_t i = 0; i < num_entries; i++) {
 			/*
 			 * Note: the vdev_indirect_mapping can not change
 			 * while we are running.  It only changes while the
 			 * removal is in progress, and then only from syncing
 			 * context. While a removal is in progress, this
 			 * function is only called for frees, which also only
 			 * happen from syncing context.
 			 */
 			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
 
 			ASSERT3P(m, !=, NULL);
 			ASSERT3U(rs->rs_asize, >, 0);
 
 			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
 			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
 			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
 
 			ASSERT3U(rs->rs_offset, >=,
 			    DVA_MAPPING_GET_SRC_OFFSET(m));
 			ASSERT3U(rs->rs_offset, <,
 			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
 			ASSERT3U(dst_vdev, !=, v->vdev_id);
 
 			/* Portion of this entry that the segment overlaps. */
 			uint64_t inner_offset = rs->rs_offset -
 			    DVA_MAPPING_GET_SRC_OFFSET(m);
 			uint64_t inner_size =
 			    MIN(rs->rs_asize, size - inner_offset);
 
 			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
 			ASSERT3P(dst_v, !=, NULL);
 
 			/*
 			 * The destination is itself indirect: queue it so it
 			 * gets remapped in turn.
 			 */
 			if (dst_v->vdev_ops == &vdev_indirect_ops) {
 				list_insert_head(&stack,
 				    rs_alloc(dst_v, dst_offset + inner_offset,
 				    inner_size, rs->rs_split_offset));
 
 			}
 
 			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
 			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
 				/*
 				 * Note: This clause exists solely for
 				 * testing purposes. We use it to ensure that
 				 * split blocks work and that the callbacks
 				 * using them yield the same result if issued
 				 * in reverse order.
 				 */
 				uint64_t inner_half = inner_size / 2;
 
 				func(rs->rs_split_offset + inner_half, dst_v,
 				    dst_offset + inner_offset + inner_half,
 				    inner_half, arg);
 
 				func(rs->rs_split_offset, dst_v,
 				    dst_offset + inner_offset,
 				    inner_half, arg);
 			} else {
 				func(rs->rs_split_offset, dst_v,
 				    dst_offset + inner_offset,
 				    inner_size, arg);
 			}
 
 			/* Advance past the part that was just handled. */
 			rs->rs_offset += inner_size;
 			rs->rs_asize -= inner_size;
 			rs->rs_split_offset += inner_size;
 		}
 		VERIFY0(rs->rs_asize);
 
 		kmem_free(mapping, num_entries * sizeof (*mapping));
 		kmem_free(rs, sizeof (remap_segment_t));
 	}
 	list_destroy(&stack);
 }
 
 /*
  * Completion callback for the child i/os issued by vdev_indirect_io_start().
  * The child's error is folded into the parent zio (found in io_private)
  * while holding the parent's io_lock, and then the child's abd is freed.
  */
 static void
 vdev_indirect_child_io_done(zio_t *zio)
 {
 	zio_t *parent = zio->io_private;
 
 	mutex_enter(&parent->io_lock);
 	parent->io_error = zio_worst_error(parent->io_error, zio->io_error);
 	mutex_exit(&parent->io_lock);
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Callback for vdev_indirect_remap() that records one split segment.
  * Calls made for indirect vdevs are ignored.  For any other vdev an
  * indirect_split_t is allocated, filled in, and appended to the iv_splits
  * list of the zio passed in arg.
  */
 static void
 vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	zio_t *zio = arg;
 	indirect_vsd_t *iv = zio->io_vsd;
 	indirect_split_t *split;
 	boolean_t is_mirror;
 	int nchildren;
 
 	ASSERT3P(vd, !=, NULL);
 
 	if (vd->vdev_ops == &vdev_indirect_ops)
 		return;
 
 	/*
 	 * Multiple copies of the data are only considered for *mirror*
 	 * vdevs.  "replacing" and "spare" vdevs are not treated that way,
 	 * even though they use the same ops as mirror, because there is
 	 * only one "good" copy under a replacing/spare.
 	 */
 	is_mirror = (vd->vdev_ops == &vdev_mirror_ops);
 	nchildren = is_mirror ? (int)vd->vdev_children : 1;
 
 	split = kmem_zalloc(offsetof(indirect_split_t, is_child[nchildren]),
 	    KM_SLEEP);
 	split->is_vdev = vd;
 	split->is_children = nchildren;
 	split->is_split_offset = split_offset;
 	split->is_target_offset = offset;
 	split->is_size = size;
 	list_create(&split->is_unique_child, sizeof (indirect_child_t),
 	    offsetof(indirect_child_t, ic_node));
 
 	if (!is_mirror) {
 		split->is_child[0].ic_vdev = vd;
 	} else {
 		for (int c = 0; c < nchildren; c++) {
 			indirect_child_t *child = &split->is_child[c];
 
 			child->ic_vdev = vd->vdev_child[c];
 			list_link_init(&child->ic_node);
 		}
 	}
 
 	list_insert_tail(&iv->iv_splits, split);
 }
 
 /*
  * Completion callback for the per-copy reads issued by
  * vdev_indirect_read_all().  io_private is the indirect_child_t that the
  * read was filling in.
  */
 static void
 vdev_indirect_read_split_done(zio_t *zio)
 {
 	indirect_child_t *child = zio->io_private;
 
 	if (zio->io_error == 0)
 		return;
 
 	/*
 	 * The read failed.  Release the buffer and leave ic_data NULL,
 	 * which marks this child as having no data.
 	 */
 	abd_free(child->ic_data);
 	child->ic_data = NULL;
 }
 
 /*
  * Issue a read for every readable copy (mirror child) of every split of
  * this zio, and flag the zio for reconstruction.
  */
 static void
 vdev_indirect_read_all(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 	indirect_split_t *split;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	split = list_head(&iv->iv_splits);
 	while (split != NULL) {
 		for (int c = 0; c < split->is_children; c++) {
 			indirect_child_t *child = &split->is_child[c];
 			vdev_t *cvd = child->ic_vdev;
 
 			if (!vdev_readable(cvd))
 				continue;
 
 			/*
 			 * A child whose DTL says the data is missing gets
 			 * ic_error set, which vdev_indirect_repair()
 			 * consults.  The read is still issued, since it may
 			 * allow the split block to be reconstructed.
 			 */
 			if (vdev_dtl_contains(cvd, DTL_MISSING,
 			    zio->io_txg, 1))
 				child->ic_error = SET_ERROR(ESTALE);
 
 			child->ic_data = abd_alloc_sametype(zio->io_abd,
 			    split->is_size);
 			child->ic_duplicate = NULL;
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    split->is_target_offset, child->ic_data,
 			    split->is_size, zio->io_type, zio->io_priority,
 			    0, vdev_indirect_read_split_done, child));
 		}
 		split = list_next(&iv->iv_splits, split);
 	}
 	iv->iv_reconstruct = B_TRUE;
 }
 
 /*
  * Start an i/o on an indirect vdev.  A new indirect_vsd_t is attached to
  * the zio and the zio's extent is run through vdev_indirect_remap() to
  * collect its split segments.  If the first split covers the whole i/o, a
  * single child i/o carrying the original BP is issued.  Otherwise the block
  * is marked as split and either every copy of every split is read (scrub
  * and resilver reads) or one child i/o per split is issued.
  */
 static void
 vdev_indirect_io_start(zio_t *zio)
 {
 	spa_t *spa __maybe_unused = zio->io_spa;
 	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
 	list_create(&iv->iv_splits,
 	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
 
 	zio->io_vsd = iv;
 	zio->io_vsd_ops = &vdev_indirect_vsd_ops;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 	if (zio->io_type != ZIO_TYPE_READ) {
 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 		/*
 		 * Note: this code can handle other kinds of writes,
 		 * but we don't expect them.
 		 */
 		ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
 		    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
 	}
 
 	/* Populate iv_splits with the segments this i/o maps to. */
 	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
 	    vdev_indirect_gather_splits, zio);
 
 	indirect_split_t *first = list_head(&iv->iv_splits);
 	ASSERT3P(first, !=, NULL);
 	if (first->is_size == zio->io_size) {
 		/*
 		 * This is not a split block; we are pointing to the entire
 		 * data, which will checksum the same as the original data.
 		 * Pass the BP down so that the child i/o can verify the
 		 * checksum, and try a different location if available
 		 * (e.g. on a mirror).
 		 *
 		 * While this special case could be handled the same as the
 		 * general (split block) case, doing it this way ensures
 		 * that the vast majority of blocks on indirect vdevs
 		 * (which are not split) are handled identically to blocks
 		 * on non-indirect vdevs.  This allows us to be less strict
 		 * about performance in the general (but rare) case.
 		 */
 		ASSERT0(first->is_split_offset);
 		ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    first->is_vdev, first->is_target_offset,
 		    abd_get_offset(zio->io_abd, 0),
 		    zio->io_size, zio->io_type, zio->io_priority, 0,
 		    vdev_indirect_child_io_done, zio));
 	} else {
 		iv->iv_split_block = B_TRUE;
 		if (zio->io_type == ZIO_TYPE_READ &&
 		    zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
 			/*
 			 * Read all copies.  Note that for simplicity,
 			 * we don't bother consulting the DTL in the
 			 * resilver case.
 			 */
 			vdev_indirect_read_all(zio);
 		} else {
 			/*
 			 * If this is a read zio, we read one copy of each
 			 * split segment, from the top-level vdev.  Since
 			 * we don't know the checksum of each split
 			 * individually, the child zio can't ensure that
 			 * we get the right data. E.g. if it's a mirror,
 			 * it will just read from a random (healthy) leaf
 			 * vdev. We have to verify the checksum in
 			 * vdev_indirect_io_done().
 			 *
 			 * For write zios, the vdev code will ensure we write
 			 * to all children.
 			 */
 			for (indirect_split_t *is = list_head(&iv->iv_splits);
 			    is != NULL; is = list_next(&iv->iv_splits, is)) {
 				zio_nowait(zio_vdev_child_io(zio, NULL,
 				    is->is_vdev, is->is_target_offset,
 				    abd_get_offset_size(zio->io_abd,
 				    is->is_split_offset, is->is_size),
 				    is->is_size, zio->io_type,
 				    zio->io_priority, 0,
 				    vdev_indirect_child_io_done, zio));
 			}
 
 		}
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Account for and report a checksum error on one child of a split: bump
  * the child vdev's checksum error counter and post a checksum ereport
  * that carries both the bad copy and the split's good copy.  Nothing is
  * done for speculative i/os.
  */
 static void
 vdev_indirect_checksum_error(zio_t *zio,
     indirect_split_t *is, indirect_child_t *ic)
 {
 	zio_bad_cksum_t cksum_info = { 0 };
 	vdev_t *child_vd = ic->ic_vdev;
 
 	if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) != 0)
 		return;
 
 	mutex_enter(&child_vd->vdev_stat_lock);
 	child_vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&child_vd->vdev_stat_lock);
 
 	(void) zfs_ereport_post_checksum(zio->io_spa, child_vd, NULL, zio,
 	    is->is_target_offset, is->is_size, is->is_good_child->ic_data,
 	    ic->ic_data, &cksum_info);
 }
 
 /*
  * Issue repair writes for copies that do not match the validated data.
  * For each split, every child that returned data and is neither the good
  * child nor a recorded duplicate of it is overwritten with the good
  * child's data.
  *
  * vdev_indirect_read_all() consults the DTL and sets ic_error on children
  * known to be missing the data, while still reading them so the block has
  * a chance to be reconstructed.  Here ic_error only suppresses the
  * checksum error report for such a child; the repair write is issued
  * regardless.  DTLs are not otherwise checked, so writes go exactly to the
  * copies that actually read back bad data.  For the same reason
  * ZIO_FLAG_SELF_HEAL is always used, which bypasses the DTL check in
  * zio_vdev_io_start().
  */
 static void
 vdev_indirect_repair(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 
 	if (!spa_writeable(zio->io_spa))
 		return;
 
 	for (indirect_split_t *split = list_head(&iv->iv_splits);
 	    split != NULL; split = list_next(&iv->iv_splits, split)) {
 		indirect_child_t *good = split->is_good_child;
 
 		for (int c = 0; c < split->is_children; c++) {
 			indirect_child_t *cand = &split->is_child[c];
 
 			/* Skip the good copy, failed reads and duplicates. */
 			if (cand == good || cand->ic_data == NULL ||
 			    cand->ic_duplicate == good)
 				continue;
 
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    cand->ic_vdev, split->is_target_offset,
 			    good->ic_data, split->is_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL));
 
 			/*
 			 * A child flagged ESTALE never had the data, so it
 			 * is not charged with a checksum error.
 			 */
 			if (cand->ic_error != ESTALE)
 				vdev_indirect_checksum_error(zio, split, cand);
 		}
 	}
 }
 
 /*
  * Charge a checksum error to every child that returned data for this zio
  * and post an ereport (without good/bad buffers) for each of them.
  * Nothing is done for speculative i/os.
  */
 static void
 vdev_indirect_all_checksum_errors(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 	indirect_split_t *split;
 
 	if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) != 0)
 		return;
 
 	for (split = list_head(&iv->iv_splits); split != NULL;
 	    split = list_next(&iv->iv_splits, split)) {
 		for (int c = 0; c < split->is_children; c++) {
 			indirect_child_t *child = &split->is_child[c];
 
 			/* Only children we actually read from count. */
 			if (child->ic_data != NULL) {
 				vdev_t *cvd = child->ic_vdev;
 
 				mutex_enter(&cvd->vdev_stat_lock);
 				cvd->vdev_stat.vs_checksum_errors++;
 				mutex_exit(&cvd->vdev_stat_lock);
 				(void) zfs_ereport_post_checksum(zio->io_spa,
 				    cvd, NULL, zio, split->is_target_offset,
 				    split->is_size, NULL, NULL, NULL);
 			}
 		}
 	}
 }
 
 /*
  * Assemble the zio's buffer from the currently selected good child of each
  * split and verify the checksum of the result.  Returns whatever
  * zio_checksum_error() returns, i.e. zero when the checksum validates.
  */
 static int
 vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
 {
 	zio_bad_cksum_t zbc;
 	indirect_split_t *split = list_head(&iv->iv_splits);
 
 	while (split != NULL) {
 		indirect_child_t *chosen = split->is_good_child;
 
 		ASSERT3P(chosen->ic_data, !=, NULL);
 		ASSERT3P(chosen->ic_duplicate, ==, NULL);
 
 		/* Place this split's data at its offset in the block. */
 		abd_copy_off(zio->io_abd, chosen->ic_data,
 		    split->is_split_offset, 0, split->is_size);
 
 		split = list_next(&iv->iv_splits, split);
 	}
 
 	return (zio_checksum_error(zio, &zbc));
 }
 
 /*
  * There are relatively few possible combinations making it feasible to
  * deterministically check them all.  We do this by setting the good_child
  * to the next unique split version.  If we reach the end of the list then
  * "carry over" to the next unique split version (like counting in base
  * is_unique_children, but each digit can have a different base).
  *
  * Returns 0 as soon as a combination validates (leaving is_good_child of
  * every split set to that combination), or ECKSUM once every combination
  * has been tried without success.
  */
 static int
 vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
 {
 	boolean_t more = B_TRUE;
 
 	iv->iv_attempts = 0;
 
 	/* Start with the first unique child of every split. */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is))
 		is->is_good_child = list_head(&is->is_unique_child);
 
 	while (more == B_TRUE) {
 		iv->iv_attempts++;
 		more = B_FALSE;
 
 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
 			return (0);
 
 		/*
 		 * Advance to the next combination: step the first split that
 		 * still has a successor.  Splits that ran off the end of
 		 * their list wrap back to the head and the carry moves on to
 		 * the next split.  If every split wrapped, "more" stays
 		 * B_FALSE and the loop ends.
 		 */
 		for (indirect_split_t *is = list_head(&iv->iv_splits);
 		    is != NULL; is = list_next(&iv->iv_splits, is)) {
 			is->is_good_child = list_next(&is->is_unique_child,
 			    is->is_good_child);
 			if (is->is_good_child != NULL) {
 				more = B_TRUE;
 				break;
 			}
 
 			is->is_good_child = list_head(&is->is_unique_child);
 		}
 	}
 
 	ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
 
 	return (SET_ERROR(ECKSUM));
 }
 
 /*
  * Used when there are too many combinations to try them all.  Up to
  * iv_attempts_max times, pick a random unique child for every split and
  * check whether the assembled block validates.  Returns 0 on the first
  * combination that does, or ECKSUM when the attempts are used up, in
  * which case the block is considered unrecoverable.
  */
 static int
 vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
 {
 	iv->iv_attempts = 0;
 
 	while (iv->iv_attempts < iv->iv_attempts_max) {
 		indirect_split_t *split;
 
 		iv->iv_attempts++;
 
 		for (split = list_head(&iv->iv_splits); split != NULL;
 		    split = list_next(&iv->iv_splits, split)) {
 			indirect_child_t *pick =
 			    list_head(&split->is_unique_child);
 			int skip = random_in_range(split->is_unique_children);
 
 			/* Step "skip" entries into the unique child list. */
 			while (skip-- > 0)
 				pick = list_next(&split->is_unique_child, pick);
 
 			ASSERT3P(pick, !=, NULL);
 			split->is_good_child = pick;
 		}
 
 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
 			return (0);
 	}
 
 	return (SET_ERROR(ECKSUM));
 }
 
 /*
  * This is a validation function for reconstruction.  It randomly selects
  * a good combination, if one can be found, and then it intentionally
  * damages all other segment copies by zeroing them.  This forces the
  * reconstruction algorithm to locate the one remaining known good copy.
  *
  * Returns 0 when a good combination was found and the other copies were
  * zeroed, EIO when some split has no copy with data, or the error from
  * the random enumeration.  In all cases the unique child lists are left
  * empty on return.
  */
 static int
 vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
 {
 	int error;
 
 	/* Presume all the copies are unique for initial selection. */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		is->is_unique_children = 0;
 
 		for (int i = 0; i < is->is_children; i++) {
 			indirect_child_t *ic = &is->is_child[i];
 			if (ic->ic_data != NULL) {
 				is->is_unique_children++;
 				list_insert_tail(&is->is_unique_child, ic);
 			}
 		}
 
 		if (list_is_empty(&is->is_unique_child)) {
 			error = SET_ERROR(EIO);
 			goto out;
 		}
 	}
 
 	/*
 	 * Set each is_good_child to a randomly-selected child which
 	 * is known to contain validated data.
 	 */
 	error = vdev_indirect_splits_enumerate_randomly(iv, zio);
 	if (error)
 		goto out;
 
 	/*
 	 * Damage all but the known good copy by zeroing it.  This will
 	 * result in two or less unique copies per indirect_child_t.
 	 * Both may need to be checked in order to reconstruct the block.
 	 * Set iv->iv_attempts_max such that all unique combinations will
 	 * be enumerated, but limit the damage to at most 12 indirect splits.
 	 */
 	iv->iv_attempts_max = 1;
 
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		for (int c = 0; c < is->is_children; c++) {
 			indirect_child_t *ic = &is->is_child[c];
 
 			if (ic == is->is_good_child)
 				continue;
 			if (ic->ic_data == NULL)
 				continue;
 
 			abd_zero(ic->ic_data, abd_get_size(ic->ic_data));
 		}
 
 		iv->iv_attempts_max *= 2;
 		if (iv->iv_attempts_max >= (1ULL << 12)) {
 			iv->iv_attempts_max = UINT64_MAX;
 			break;
 		}
 	}
 
 out:
 	/* Empty the unique children lists so they can be reconstructed. */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		indirect_child_t *ic;
 		while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
 			;
 
 		is->is_unique_children = 0;
 	}
 
 	return (error);
 }
 
 /*
  * This function is called when we have read all copies of the data and need
  * to try to find a combination of copies that gives us the right checksum.
  *
  * If we pointed to any mirror vdevs, this effectively does the job of the
  * mirror.  The mirror vdev code can't do its own job because we don't know
  * the checksum of each split segment individually.
  *
  * We have to try every unique combination of copies of split segments, until
  * we find one that checksums correctly.  Duplicate segment copies are first
  * identified and later skipped during reconstruction.  This optimization
  * reduces the search space and ensures that of the remaining combinations
  * at most one is correct.
  *
  * When the total number of combinations is small they can all be checked.
  * For example, if we have 3 segments in the split, and each points to a
  * 2-way mirror with unique copies, we will have the following pieces of data:
  *
  *       |     mirror child
  * split |     [0]        [1]
  * ======|=====================
  *   A   |  data_A_0   data_A_1
  *   B   |  data_B_0   data_B_1
  *   C   |  data_C_0   data_C_1
  *
  * We will try the following (mirror children)^(number of splits) (2^3=8)
  * combinations, which is similar to bitwise-little-endian counting in
  * binary.  In general each "digit" corresponds to a split segment, and the
  * base of each digit is is_children, which can be different for each
  * digit.
  *
  * "low bit"        "high bit"
  *        v                 v
  * data_A_0 data_B_0 data_C_0
  * data_A_1 data_B_0 data_C_0
  * data_A_0 data_B_1 data_C_0
  * data_A_1 data_B_1 data_C_0
  * data_A_0 data_B_0 data_C_1
  * data_A_1 data_B_0 data_C_1
  * data_A_0 data_B_1 data_C_1
  * data_A_1 data_B_1 data_C_1
  *
  * Note that the split segments may be on the same or different top-level
  * vdevs. In either case, we may need to try lots of combinations (see
  * zfs_reconstruct_indirect_combinations_max).  This ensures that if a mirror
  * has small silent errors on all of its children, we can still reconstruct
  * the correct data, as long as those errors are at sufficiently-separated
  * offsets (specifically, separated by the largest block size - default of
  * 128KB, but up to 16MB).
  */
 static void
 vdev_indirect_reconstruct_io_done(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 	boolean_t known_good = B_FALSE;
 	int error;
 
 	iv->iv_unique_combinations = 1;
 	iv->iv_attempts_max = UINT64_MAX;
 
 	/* A nonzero tunable caps the number of combinations attempted. */
 	if (zfs_reconstruct_indirect_combinations_max > 0)
 		iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
 
 	/*
 	 * If nonzero, every 1/x blocks will be damaged, in order to validate
 	 * reconstruction when there are split segments with damaged copies.
 	 * Known_good will be TRUE when reconstruction is known to be possible.
 	 */
 	if (zfs_reconstruct_indirect_damage_fraction != 0 &&
 	    random_in_range(zfs_reconstruct_indirect_damage_fraction) == 0)
 		known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
 
 	/*
 	 * Determine the unique children for a split segment and add them
 	 * to the is_unique_child list.  By restricting reconstruction
 	 * to these children, only unique combinations will be considered.
 	 * This can vastly reduce the search space when there are a large
 	 * number of indirect splits.
 	 */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		is->is_unique_children = 0;
 
 		for (int i = 0; i < is->is_children; i++) {
 			indirect_child_t *ic_i = &is->is_child[i];
 
 			if (ic_i->ic_data == NULL ||
 			    ic_i->ic_duplicate != NULL)
 				continue;
 
 			/* Mark later children with identical data. */
 			for (int j = i + 1; j < is->is_children; j++) {
 				indirect_child_t *ic_j = &is->is_child[j];
 
 				if (ic_j->ic_data == NULL ||
 				    ic_j->ic_duplicate != NULL)
 					continue;
 
 				if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0)
 					ic_j->ic_duplicate = ic_i;
 			}
 
 			is->is_unique_children++;
 			list_insert_tail(&is->is_unique_child, ic_i);
 		}
 
 		/* Reconstruction is impossible, no valid children */
 		EQUIV(list_is_empty(&is->is_unique_child),
 		    is->is_unique_children == 0);
 		if (list_is_empty(&is->is_unique_child)) {
 			zio->io_error = EIO;
 			vdev_indirect_all_checksum_errors(zio);
 			zio_checksum_verified(zio);
 			return;
 		}
 
 		iv->iv_unique_combinations *= is->is_unique_children;
 	}
 
 	/* Exhaustive search when it fits in the budget, random otherwise. */
 	if (iv->iv_unique_combinations <= iv->iv_attempts_max)
 		error = vdev_indirect_splits_enumerate_all(iv, zio);
 	else
 		error = vdev_indirect_splits_enumerate_randomly(iv, zio);
 
 	if (error != 0) {
 		/* All attempted combinations failed. */
 		ASSERT3B(known_good, ==, B_FALSE);
 		zio->io_error = error;
 		vdev_indirect_all_checksum_errors(zio);
 	} else {
 		/*
 		 * The checksum has been successfully validated.  Issue
 		 * repair I/Os to any copies of splits which don't match
 		 * the validated version.
 		 */
 		ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
 		vdev_indirect_repair(zio);
 		zio_checksum_verified(zio);
 	}
 }
 
 /*
  * io_done callback installed in vdev_indirect_ops.  Non-split blocks need
  * no work here.  For split blocks the assembled data is checksummed; on a
  * mismatch all copies of all splits are read and the zio is redone so that
  * the next pass can attempt reconstruction.
  */
 static void
 vdev_indirect_io_done(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 
 	if (iv->iv_reconstruct) {
 		/*
 		 * We have read all copies of the data (e.g. from mirrors),
 		 * either because this was a scrub/resilver, or because the
 		 * one-copy read didn't checksum correctly.
 		 */
 		vdev_indirect_reconstruct_io_done(zio);
 		return;
 	}
 
 	if (!iv->iv_split_block) {
 		/*
 		 * This was not a split block, so we passed the BP down,
 		 * and the checksum was handled by the (one) child zio.
 		 */
 		return;
 	}
 
 	zio_bad_cksum_t zbc;
 	int ret = zio_checksum_error(zio, &zbc);
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place. The checksum verify error
 	 * will be reported to the top-level VDEV.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
 		zio->io_error = ret;
 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 		zio_dio_chksum_verify_error_report(zio);
 		/* The error is recorded; do not attempt reconstruction. */
 		ret = 0;
 	}
 
 	if (ret == 0) {
 		zio_checksum_verified(zio);
 		return;
 	}
 
 	/*
 	 * The checksum didn't match.  Read all copies of all splits, and
 	 * then we will try to reconstruct.  The next time
 	 * vdev_indirect_io_done() is called, iv_reconstruct will be set.
 	 */
 	vdev_indirect_read_all(zio);
 
 	zio_vdev_io_redone(zio);
 }
 
 /*
  * Operations vector for indirect vdevs.  Only open, close, asize,
  * min_asize, io_start, io_done and remap are provided; every other
  * callback is left NULL.
  */
 vdev_ops_t vdev_indirect_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_indirect_open,
 	.vdev_op_close = vdev_indirect_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_indirect_io_start,
 	.vdev_op_io_done = vdev_indirect_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = vdev_indirect_remap,
 	.vdev_op_xlate = NULL,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* leaf vdev */
 };
 
 /* Symbols exported from this file. */
 EXPORT_SYMBOL(spa_condense_fini);
 EXPORT_SYMBOL(spa_start_indirect_condensing_thread);
 EXPORT_SYMBOL(spa_condense_indirect_start_sync);
 EXPORT_SYMBOL(spa_condense_init);
 EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete);
 EXPORT_SYMBOL(vdev_indirect_mark_obsolete);
 EXPORT_SYMBOL(vdev_indirect_should_condense);
 EXPORT_SYMBOL(vdev_indirect_sync_obsolete);
 EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
 EXPORT_SYMBOL(vdev_obsolete_sm_object);
 
-/* BEGIN CSTYLED */
 /*
  * Module parameters for condensing and split-block reconstruction.  Each
  * entry carries its own description string.
  */
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT,
 	ZMOD_RW, "Whether to attempt condensing indirect vdev mappings");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT,
 	ZMOD_RW,
 	"Minimum obsolete percent of bytes in the mapping "
 	"to attempt condensing");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW,
 	"Don't bother condensing if the mapping uses less than this amount of "
 	"memory");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64,
 	ZMOD_RW,
 	"Minimum size obsolete spacemap to attempt condensing");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms,
 	UINT, ZMOD_RW,
 	"Used by tests to ensure certain actions happen in the middle of a "
 	"condense. A maximum value of 1 should be sufficient.");
 
 ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max,
 	UINT, ZMOD_RW,
 	"Maximum number of combinations when reconstructing split segments");
-/* END CSTYLED */
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 65a840bf9728..850569d1a35e 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -1,1061 +1,1059 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
 /*
  * Handle of the "vdev_mirror_stats" kstat.  It is NULL until
  * vdev_mirror_stat_init() creates the kstat and again after
  * vdev_mirror_stat_fini() deletes it.
  */
 static kstat_t *mirror_ksp;
 
 /*
  * Counters published through the "vdev_mirror_stats" kstat.  The struct must
  * contain only kstat_named_t members: vdev_mirror_stat_init() derives the
  * number of counters from sizeof (mirror_stats) / sizeof (kstat_named_t).
  */
 typedef struct mirror_stats {
 	/* Load-calculation outcomes for rotating media. */
 	kstat_named_t vdev_mirror_stat_rotating_linear;
 	kstat_named_t vdev_mirror_stat_rotating_offset;
 	kstat_named_t vdev_mirror_stat_rotating_seek;
 	/* Load-calculation outcomes for non-rotating media. */
 	kstat_named_t vdev_mirror_stat_non_rotating_linear;
 	kstat_named_t vdev_mirror_stat_non_rotating_seek;
 
 	/* Outcome of the lowest-load child selection. */
 	kstat_named_t vdev_mirror_stat_preferred_found;
 	kstat_named_t vdev_mirror_stat_preferred_not_found;
 } mirror_stats_t;
 
 static mirror_stats_t mirror_stats = {
 	/* New I/O follows directly the last I/O */
 	{ "rotating_linear",			KSTAT_DATA_UINT64 },
 	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
 	{ "rotating_offset",			KSTAT_DATA_UINT64 },
 	/* New I/O requires random seek */
 	{ "rotating_seek",			KSTAT_DATA_UINT64 },
 	/* New I/O follows directly the last I/O  (nonrot) */
 	{ "non_rotating_linear",		KSTAT_DATA_UINT64 },
 	/* New I/O requires random seek (nonrot) */
 	{ "non_rotating_seek",			KSTAT_DATA_UINT64 },
 	/* Preferred child vdev found */
 	{ "preferred_found",			KSTAT_DATA_UINT64 },
 	/* Preferred child vdev not found or equal load  */
 	{ "preferred_not_found",		KSTAT_DATA_UINT64 },
 
 };
 
 /*
  * Helpers for the counters in mirror_stats:
  *   MIRROR_STAT(stat)       the counter's 64-bit value field
  *   MIRROR_INCR(stat, val)  add val to the counter with atomic_add_64()
  *   MIRROR_BUMP(stat)       add 1 to the counter
  */
 #define	MIRROR_STAT(stat)		(mirror_stats.stat.value.ui64)
 #define	MIRROR_INCR(stat, val) 		atomic_add_64(&MIRROR_STAT(stat), val)
 #define	MIRROR_BUMP(stat)		MIRROR_INCR(stat, 1)
 
 void
 vdev_mirror_stat_init(void)
 {
 	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
 	    "misc", KSTAT_TYPE_NAMED,
 	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (mirror_ksp != NULL) {
 		mirror_ksp->ks_data = &mirror_stats;
 		kstat_install(mirror_ksp);
 	}
 }
 
 /*
  * Delete the mirror kstat if it was created and forget the handle, so a
  * repeated call is a no-op.
  */
 void
 vdev_mirror_stat_fini(void)
 {
 	if (mirror_ksp == NULL)
 		return;
 
 	kstat_delete(mirror_ksp);
 	mirror_ksp = NULL;
 }
 
 /*
  * Virtual device vector for mirroring.
  */
 /* Per-child state kept in a mirror map for the lifetime of one zio. */
 typedef struct mirror_child {
 	vdev_t		*mc_vd;		/* child vdev the I/O is issued to */
 	abd_t		*mc_abd;	/* buffer of this child's scrub read */
 	uint64_t	mc_offset;	/* I/O offset on mc_vd */
 	int		mc_error;	/* child I/O error, or ENXIO/ESTALE */
 	int		mc_load;	/* result of vdev_mirror_load() */
 	uint8_t		mc_tried;	/* I/O completed, or child ruled out */
 	uint8_t		mc_skipped;	/* no I/O was issued to this child */
 	uint8_t		mc_speculative;	/* mc_error comes from the DTL check */
 	uint8_t		mc_rebuilding;	/* a leaf below has vdev_rebuild_txg */
 } mirror_child_t;
 
 /*
  * Per-zio mirror state, stored in zio->io_vsd.  It is allocated as a single
  * chunk: this header, mm_children child slots, then mm_children ints that
  * mm_preferred points at (see vdev_mirror_map_size()/_alloc()).
  */
 typedef struct mirror_map {
 	int		*mm_preferred;	/* indices of lowest-load children */
 	int		mm_preferred_cnt; /* valid entries in mm_preferred */
 	int		mm_children;	/* number of mm_child[] slots */
 	boolean_t	mm_resilvering;	/* replacing/spare during resilver */
 	boolean_t	mm_rebuilding;	/* some child has mc_rebuilding set */
 	boolean_t	mm_root;	/* children come from the bp's DVAs */
 	mirror_child_t	mm_child[];
 } mirror_map_t;
 
 /*
  * Right shift applied to the zio offset by
  * vdev_mirror_preferred_child_randomize() when choosing among equally
  * loaded children, so the choice changes once per 2^21 bytes of offset.
  */
 static const int vdev_mirror_shift = 21;
 
 /*
  * The load configuration settings below are tuned by default for
  * the case where all devices are of the same rotational type.
  *
  * If there is a mixture of rotating and non-rotating media, setting
  * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
  * as it will direct more reads to the non-rotating vdevs which are more likely
  * to have a higher performance.
  */
 
 /* Rotating media load calculation configuration. */
 /* Load added when a new I/O directly follows the last I/O on the vdev. */
 static int zfs_vdev_mirror_rotating_inc = 0;
 /* Load added for a seeking I/O; halved when within the offset below. */
 static int zfs_vdev_mirror_rotating_seek_inc = 5;
 /* Distance in bytes from the last I/O that still gets the halved value. */
 static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
 
 /* Non-rotating media load calculation configuration. */
 /* Load added when a new I/O directly follows the last I/O on the vdev. */
 static int zfs_vdev_mirror_non_rotating_inc = 0;
 /* Load added for any I/O that does not directly follow the last one. */
 static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
 
 /*
  * Size in bytes of a mirror map for the given number of children: the
  * header plus the child slots, followed by one int per child for the
  * mm_preferred array.
  */
 static inline size_t
 vdev_mirror_map_size(int children)
 {
 	size_t map_bytes = offsetof(mirror_map_t, mm_child[children]);
 	size_t preferred_bytes = sizeof (int) * children;
 
 	return (map_bytes + preferred_bytes);
 }
 
 /*
  * Allocate a zero-filled mirror map with room for "children" child slots
  * and record the resilvering and root flags.  mm_preferred is pointed at
  * the int array that shares the allocation, right after the last child
  * slot.  The allocation uses KM_SLEEP.
  */
 static inline mirror_map_t *
 vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
 {
 	const size_t preferred_off = offsetof(mirror_map_t, mm_child[children]);
 	mirror_map_t *map;
 
 	map = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
 
 	map->mm_preferred = (int *)((uintptr_t)map + preferred_off);
 	map->mm_children = children;
 	map->mm_root = root;
 	map->mm_resilvering = resilvering;
 
 	return (map);
 }
 
 /*
  * vsd_free callback: release the mirror map stored in zio->io_vsd.  The
  * size is recomputed from the child count saved in the map.
  */
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *map = zio->io_vsd;
 	const size_t map_bytes = vdev_mirror_map_size(map->mm_children);
 
 	kmem_free(map, map_bytes);
 }
 
 /*
  * vsd ops installed on every mirror zio by vdev_mirror_io_start(); they
  * release the mirror map kept in zio->io_vsd.
  */
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
 	.vsd_free = vdev_mirror_map_free,
 };
 
 /*
  * Compute the load of child vd for an I/O at zio_offset; the caller prefers
  * the children with the lowest value.
  *
  * The base load is the child's pending queue length.  On top of that an
  * increment is added depending on the media type and on how far the new
  * I/O is from the last offset queued on the child.  Each outcome also bumps
  * the matching mirror kstat.  Root maps always get INT_MAX.
  */
 static int
 vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
 {
 	/* At the root every DVA carries the same weight. */
 	if (mm->mm_root)
 		return (INT_MAX);
 
 	/*
 	 * A resilvering device (vdev_resilver_txg != 0) deliberately does
 	 * not get INT_MAX: in testing, overall performance was slightly
 	 * worse with that special case than without it.
 	 */
 
 	/* Leaf vdevs: shift the offset by VDEV_LABEL_START_SIZE first. */
 	if (vd->vdev_ops->vdev_op_leaf)
 		zio_offset += VDEV_LABEL_START_SIZE;
 
 	/* The pending queue length is the starting point. */
 	const int queued = vdev_queue_length(vd);
 	const uint64_t prev_offset = vdev_queue_last_offset(vd);
 	const boolean_t sequential = (prev_offset == zio_offset);
 
 	if (vd->vdev_nonrot) {
 		if (sequential) {
 			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
 			return (queued + zfs_vdev_mirror_non_rotating_inc);
 		}
 
 		/*
 		 * Non-rotating devices still pay a seek penalty: sequential
 		 * I/Os can be aggregated into fewer device operations, which
 		 * saves per-command overhead and improves performance.
 		 */
 		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
 		return (queued + zfs_vdev_mirror_non_rotating_seek_inc);
 	}
 
 	/* Rotating media, I/O directly following the last one. */
 	if (sequential) {
 		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
 		return (queued + zfs_vdev_mirror_rotating_inc);
 	}
 
 	/*
 	 * I/Os that land within the seek offset of the last I/O issued to
 	 * this vdev should seek less, so they get half the increment.
 	 */
 	const int64_t distance = (int64_t)(prev_offset - zio_offset);
 	if (ABS(distance) < zfs_vdev_mirror_rotating_seek_offset) {
 		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
 		return (queued + (zfs_vdev_mirror_rotating_seek_inc / 2));
 	}
 
 	/* Everything else pays the full seek increment. */
 	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
 	return (queued + zfs_vdev_mirror_rotating_seek_inc);
 }
 
 /*
  * Return B_TRUE when vd is a leaf with vdev_rebuild_txg set, or when any
  * vdev in the tree below vd is such a leaf.
  */
 static boolean_t
 vdev_mirror_rebuilding(vdev_t *vd)
 {
 	boolean_t rebuilding =
 	    (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg);
 
 	/* Stop descending as soon as one rebuilding leaf has been found. */
 	for (int i = 0; !rebuilding && i < vd->vdev_children; i++)
 		rebuilding = vdev_mirror_rebuilding(vd->vdev_child[i]);
 
 	return (rebuilding);
 }
 
 /*
  * Build the mirror map for a zio.
  *
  * When the zio has no vdev (io_vd == NULL) the children are the top-level
  * vdevs named by the block pointer's DVAs and the map is marked as root.
  * Otherwise there is one child per child of io_vd, all using the zio's
  * own offset.
  *
  * Returns NULL, with io_error set to ENXIO and io_vsd cleared, when the
  * pool is not writeable and no DVA is valid, or when a DVA's top-level
  * vdev cannot be found.
  *
  * Avoid inlining the function to keep vdev_mirror_io_start(), which
  * is this function's only caller, as small as possible on the stack.
  */
 noinline static mirror_map_t *
 vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
 	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		dva_t dva_copy[SPA_DVAS_PER_BP];
 
 		/*
 		 * The sequential scrub code sorts and issues all DVAs
 		 * of a bp separately. Each of these IOs includes all
 		 * original DVA copies so that repairs can be performed
 		 * in the event of an error, but we only actually want
 		 * to check the first DVA since the others will be
 		 * checked by their respective sorted IOs. Only if we
 		 * hit an error will we try all DVAs upon retrying.
 		 *
 		 * Note: This check is safe even if the user switches
 		 * from a legacy scrub to a sequential one in the middle
 		 * of processing, since scn_is_sorted isn't updated until
 		 * all outstanding IOs from the previous scrub pass
 		 * complete.
 		 */
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
 		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
 		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
 		    scn->scn_is_sorted) {
 			c = 1;
 		} else {
 			c = BP_GET_NDVAS(zio->io_bp);
 		}
 
 		/*
 		 * If the pool cannot be written to, then infer that some
 		 * DVAs might be invalid or point to vdevs that do not exist.
 		 * We skip them.
 		 */
 		if (!spa_writeable(spa)) {
 			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 			int j = 0;
 			/* Compact the valid DVAs into dva_copy[0..j). */
 			for (int i = 0; i < c; i++) {
 				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
 					dva_copy[j++] = dva[i];
 			}
 			if (j == 0) {
 				zio->io_vsd = NULL;
 				zio->io_error = ENXIO;
 				return (NULL);
 			}
 			/* Some DVAs were dropped: use the compacted copy. */
 			if (j < c) {
 				dva = dva_copy;
 				c = j;
 			}
 		}
 
 		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
 		/* Resolve each DVA to its top-level vdev and offset. */
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 			/* No such top-level vdev: drop the map and fail. */
 			if (mc->mc_vd == NULL) {
 				kmem_free(mm, vdev_mirror_map_size(
 				    mm->mm_children));
 				zio->io_vsd = NULL;
 				zio->io_error = ENXIO;
 				return (NULL);
 			}
 		}
 	} else {
 		/*
 		 * If we are resilvering, then we should handle scrub reads
 		 * differently; we shouldn't issue them to the resilvering
 		 * device because it might not have those blocks.
 		 *
 		 * We are resilvering iff:
 		 * 1) We are a replacing vdev (ie our name is "replacing-1" or
 		 *    "spare-1" or something like that), and
 		 * 2) The pool is currently being resilvered.
 		 *
 		 * We cannot simply check vd->vdev_resilver_txg, because it's
 		 * not set in this path.
 		 *
 		 * Nor can we just check our vdev_ops; there are cases (such as
 		 * when a user types "zpool replace pool odev spare_dev" and
 		 * spare_dev is in the spare list, or when a spare device is
 		 * automatically used to replace a DEGRADED device) when
 		 * resilvering is complete but both the original vdev and the
 		 * spare vdev remain in the pool.  That behavior is intentional.
 		 * It helps implement the policy that a spare should be
 		 * automatically removed from the pool after the user replaces
 		 * the device that originally failed.
 		 *
 		 * If a spa load is in progress, then spa_dsl_pool may be
 		 * uninitialized.  But we shouldn't be resilvering during a spa
 		 * load anyway.
 		 */
 		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops) &&
 		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
 		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
 		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
 		    B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
 
 			/* Flag both the child and the map as rebuilding. */
 			if (vdev_mirror_rebuilding(mc->mc_vd))
 				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
 		}
 	}
 
 	return (mm);
 }
 
 /*
  * Open every child and derive the mirror's geometry from the children that
  * opened: the smallest asize and max_asize, the largest logical ashift, and
  * the physical ashift chosen by vdev_best_ashift().
  *
  * Returns EINVAL when there are no children, the last child error when
  * every child failed to open, and 0 otherwise.
  */
 static int
 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int failed = 0;
 	int last_error = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (child->vdev_open_error) {
 			last_error = child->vdev_open_error;
 			failed++;
 		} else {
 			/*
 			 * Subtracting 1 before MIN makes a value of 0 wrap
 			 * to the largest uint64_t, so a zero never wins the
 			 * comparison.
 			 */
 			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
 			*max_asize = MIN(*max_asize - 1,
 			    child->vdev_max_asize - 1) + 1;
 			*logical_ashift = MAX(*logical_ashift,
 			    child->vdev_ashift);
 		}
 	}
 
 	/*
 	 * The physical ashift needs the final logical ashift, hence the
 	 * second pass over the children that opened.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		if (!child->vdev_open_error) {
 			*physical_ashift = vdev_best_ashift(*logical_ashift,
 			    *physical_ashift, child->vdev_physical_ashift);
 		}
 	}
 
 	if (failed != vd->vdev_children)
 		return (0);
 
 	/* No child opened: report why and hand back the last error. */
 	if (vdev_children_are_offline(vd))
 		vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
 	else
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 	return (last_error);
 }
 
 /*
  * Close every child of the mirror.
  */
 static void
 vdev_mirror_close(vdev_t *vd)
 {
 	for (int child = 0; child < vd->vdev_children; child++) {
 		vdev_t *cvd = vd->vdev_child[child];
 
 		vdev_close(cvd);
 	}
 }
 
 /*
  * Done callback of a child I/O: io_private is the mirror_child_t the I/O
  * was issued for.  Record the result and mark the child as tried and no
  * longer skipped.
  */
 static void
 vdev_mirror_child_done(zio_t *zio)
 {
 	mirror_child_t *child = zio->io_private;
 
 	child->mc_skipped = 0;
 	child->mc_tried = 1;
 	child->mc_error = zio->io_error;
 }
 
 /*
  * Start from entry p of mm_preferred and walk the entries before it.
  * Whenever one of them refers to a DVA on the same vdev as the current
  * choice, switch to it.  Lower-index DVAs are likely to have been
  * allocated from the primary metaslab in use at the time, so they are
  * more likely to have locality with single-copy data.
  *
  * Returns the index of the chosen child.
  */
 static int
 vdev_mirror_dva_select(zio_t *zio, int p)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	dva_t *dva = zio->io_bp->blk_dva;
 	int chosen = mm->mm_preferred[p];
 
 	for (int i = p - 1; i >= 0; i--) {
 		int candidate = mm->mm_preferred[i];
 
 		if (DVA_GET_VDEV(&dva[candidate]) ==
 		    DVA_GET_VDEV(&dva[chosen]))
 			chosen = candidate;
 	}
 
 	return (chosen);
 }
 
 /*
  * Pick one child out of several equally preferred ones.  Root maps draw a
  * random entry and refine it with vdev_mirror_dva_select(); all other maps
  * derive the entry from the I/O offset.
  */
 static int
 vdev_mirror_preferred_child_randomize(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
 	if (!mm->mm_root) {
 		/*
 		 * Always favouring the first matching vdev could lead to
 		 * wear leveling issues on SSDs, so the I/O offset serves
 		 * as a pseudo random seed into the lowest-load vdevs.
 		 */
 		int slot = (zio->io_offset >> vdev_mirror_shift) %
 		    mm->mm_preferred_cnt;
 
 		return (mm->mm_preferred[slot]);
 	}
 
 	int slot = random_in_range(mm->mm_preferred_cnt);
 
 	return (vdev_mirror_dva_select(zio, slot));
 }
 
 /*
  * Report whether the child can be read.  Children whose top-level vdev is a
  * dRAID are checked with vdev_draid_readable() at the child's offset; all
  * others use vdev_readable().
  */
 static boolean_t
 vdev_mirror_child_readable(mirror_child_t *mc)
 {
 	vdev_t *vd = mc->mc_vd;
 	vdev_t *top = vd->vdev_top;
 
 	if (top == NULL || top->vdev_ops != &vdev_draid_ops)
 		return (vdev_readable(vd));
 
 	return (vdev_draid_readable(vd, mc->mc_offset));
 }
 
 /*
  * Report whether the range [txg, txg + size) is missing from the child.
  * Children whose top-level vdev is a dRAID are checked with
  * vdev_draid_missing() at the child's offset; all others consult the
  * child's DTL_MISSING.
  */
 static boolean_t
 vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
 {
 	vdev_t *vd = mc->mc_vd;
 	vdev_t *top = vd->vdev_top;
 
 	if (top == NULL || top->vdev_ops != &vdev_draid_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
 }
 
 /*
  * Try to find a vdev whose DTL doesn't contain the block we want to read
  * preferring vdevs based on determined load. If we can't, try the read on
  * any vdev we haven't already tried.
  *
  * Distributed spares are an exception to the above load rule. They are
  * always preferred in order to detect gaps in the distributed spare which
  * are created when another disk in the dRAID fails. In order to restore
  * redundancy those gaps must be read to trigger the required repair IO.
  *
  * Returns the index of the chosen child in mm_child[], or -1 when every
  * child has already been tried.  As a side effect, unreadable and
  * DTL-missing children are marked in the map and mm_preferred is rebuilt.
  */
 static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	uint64_t txg = zio->io_txg;
 	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
 
 	lowest_load = INT_MAX;
 	mm->mm_preferred_cnt = 0;
 	for (c = 0; c < mm->mm_children; c++) {
 		mirror_child_t *mc;
 
 		mc = &mm->mm_child[c];
 		/* Already handled by an earlier pass. */
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
 
 		if (mc->mc_vd == NULL ||
 		    !vdev_mirror_child_readable(mc)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
 
 		/*
 		 * The DTL says the data is missing.  Skip the child for now
 		 * but leave mc_tried clear so the fallback loop below can
 		 * still pick it.
 		 */
 		if (vdev_mirror_child_missing(mc, txg, 1)) {
 			mc->mc_error = SET_ERROR(ESTALE);
 			mc->mc_skipped = 1;
 			mc->mc_speculative = 1;
 			continue;
 		}
 
 		/* A distributed spare wins outright, regardless of load. */
 		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
 			mm->mm_preferred[0] = c;
 			mm->mm_preferred_cnt = 1;
 			break;
 		}
 
 		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
 		if (mc->mc_load > lowest_load)
 			continue;
 
 		/* New minimum: restart the list of preferred children. */
 		if (mc->mc_load < lowest_load) {
 			lowest_load = mc->mc_load;
 			mm->mm_preferred_cnt = 0;
 		}
 		mm->mm_preferred[mm->mm_preferred_cnt] = c;
 		mm->mm_preferred_cnt++;
 	}
 
 	if (mm->mm_preferred_cnt == 1) {
 		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
 		return (mm->mm_preferred[0]);
 	}
 
 	/* Several children tie for the lowest load: pick one of them. */
 	if (mm->mm_preferred_cnt > 1) {
 		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
 		return (vdev_mirror_preferred_child_randomize(zio));
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
 	for (c = 0; c < mm->mm_children; c++) {
 		if (!mm->mm_child[c].mc_tried)
 			return (c);
 	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
 	 */
 	return (-1);
 }
 
 /*
  * Start a mirror zio.
  *
  * The mirror map is built and attached to the zio first.  Then:
  *  - scrub reads, unless the map is marked resilvering, are issued to
  *    every readable child;
  *  - all other reads go to the one child picked by
  *    vdev_mirror_child_select(), or to none if it returns -1;
  *  - writes go to every child, except that rebuild repair writes skip
  *    children which are not being rebuilt.
  *
  * When no map could be built, the zio is simply executed; the error was
  * already set by vdev_mirror_map_init().
  */
 static void
 vdev_mirror_io_start(zio_t *zio)
 {
 	mirror_map_t *mm;
 	mirror_child_t *mc;
 	int c, children;
 
 	mm = vdev_mirror_map_init(zio);
 	zio->io_vsd = mm;
 	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
 
 	if (mm == NULL) {
 		ASSERT(!spa_trust_config(zio->io_spa));
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		zio_execute(zio);
 		return;
 	}
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
 			 * For scrubbing reads we need to issue reads to all
 			 * children.  One child can reuse parent buffer, but
 			 * for others we have to allocate separate ones to
 			 * verify checksums if io_bp is non-NULL, or compare
 			 * them in vdev_mirror_io_done() otherwise.
 			 */
 			boolean_t first = B_TRUE;
 			for (c = 0; c < mm->mm_children; c++) {
 				mc = &mm->mm_child[c];
 
 				/* Don't issue ZIOs to offline children */
 				if (!vdev_mirror_child_readable(mc)) {
 					mc->mc_error = SET_ERROR(ENXIO);
 					mc->mc_tried = 1;
 					mc->mc_skipped = 1;
 					continue;
 				}
 
 				/* First readable child reads into io_abd. */
 				mc->mc_abd = first ? zio->io_abd :
 				    abd_alloc_sametype(zio->io_abd,
 				    zio->io_size);
 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 				    mc->mc_vd, mc->mc_offset, mc->mc_abd,
 				    zio->io_size, zio->io_type,
 				    zio->io_priority, 0,
 				    vdev_mirror_child_done, mc));
 				first = B_FALSE;
 			}
 			zio_execute(zio);
 			return;
 		}
 		/*
 		 * For normal reads just pick one child.
 		 */
 		c = vdev_mirror_child_select(zio);
 		/* One child to issue to if one was found, otherwise none. */
 		children = (c >= 0);
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 		/*
 		 * Writes go to all children.
 		 */
 		c = 0;
 		children = mm->mm_children;
 	}
 
 	/*
 	 * c is the next child to issue to: the selected child for a read,
 	 * or each child in turn for a write.
 	 */
 	while (children--) {
 		mc = &mm->mm_child[c];
 		c++;
 
 		/*
 		 * When sequentially resilvering only issue write repair
 		 * IOs to the vdev which is being rebuilt since performance
 		 * is limited by the slowest child.  This is an issue for
 		 * faster replacement devices such as distributed spares.
 		 */
 		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
 		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
 		    mm->mm_rebuilding && !mc->mc_rebuilding) {
 			continue;
 		}
 
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Fold the children's errors into one value with zio_worst_error().
  * Speculative errors (mc_speculative set) are tracked separately and are
  * returned only when no non-speculative error was recorded.
  */
 static int
 vdev_mirror_worst_error(mirror_map_t *mm)
 {
 	/* worst[0]: non-speculative errors, worst[1]: speculative ones. */
 	int worst[2] = { 0, 0 };
 
 	for (int c = 0; c < mm->mm_children; c++) {
 		mirror_child_t *child = &mm->mm_child[c];
 		int *slot = &worst[child->mc_speculative];
 
 		*slot = zio_worst_error(*slot, child->mc_error);
 	}
 
 	if (worst[0])
 		return (worst[0]);
 	return (worst[1]);
 }
 
 /*
  * Completion handler for a mirror zio.
  *
  * Writes fail only if no child succeeded or, for ditto blocks
  * (io_vd == NULL), if any child failed.
  *
  * Reads are retried on another child while no good copy exists.  Scrub
  * reads then settle on one buffer, copy it into the parent buffer if needed
  * and free the extra per-child buffers.  Finally, if a good copy is in hand
  * and the pool is writeable, repair writes are issued to the children that
  * failed or are known to be stale.
  *
  * Nothing is done when the zio has no mirror map.
  */
 static void
 vdev_mirror_io_done(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	mirror_child_t *mc;
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
 	int last_good_copy = -1;
 
 	if (mm == NULL)
 		return;
 
 	/*
 	 * Tally the outcome: errors on children that were actually issued
 	 * I/O are unexpected; children tried without error are good copies.
 	 */
 	for (c = 0; c < mm->mm_children; c++) {
 		mc = &mm->mm_child[c];
 
 		if (mc->mc_error) {
 			if (!mc->mc_skipped)
 				unexpected_errors++;
 		} else if (mc->mc_tried) {
 			last_good_copy = c;
 			good_copies++;
 		}
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as success.
 		 *
 		 * Now that we support write reallocation, it would be better
 		 * to treat partial failure as real failure unless there are
 		 * no non-degraded top-level vdevs left, and not update DTLs
 		 * if we intend to reallocate.
 		 */
 		if (good_copies != mm->mm_children) {
 			/*
 			 * Always require at least one good copy.
 			 *
 			 * For ditto blocks (io_vd == NULL), require
 			 * all copies to be good.
 			 *
 			 * XXX -- for replacing vdevs, there's no great answer.
 			 * If the old device is really dead, we may not even
 			 * be able to access it -- so we only want to
 			 * require good writes to the new device.  But if
 			 * the new device turns out to be flaky, we want
 			 * to be able to detach it -- which requires all
 			 * writes to the old device to have succeeded.
 			 */
 			if (good_copies == 0 || zio->io_vd == NULL)
 				zio->io_error = vdev_mirror_worst_error(mm);
 		}
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place. The checksum verify error
 	 * will be reported to the top-level Mirror VDEV.
 	 *
 	 * There will be no attempt at reading any additional data copies. If
 	 * the buffer is still being manipulated while attempting to read from
 	 * another child, there exists a possibility that the checksum could be
 	 * verified as valid. However, the buffer contents could again get
 	 * manipulated after verifying the checksum. This would lead to bad data
 	 * being written out during self healing.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
 	    (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
 		zio_dio_chksum_verify_error_report(zio);
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT3U(zio->io_error, ==, ECKSUM);
 		return;
 	}
 
 	/*
 	 * If we don't have a good copy yet, keep trying other children.
 	 */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
 		ASSERT(c >= 0 && c < mm->mm_children);
 		mc = &mm->mm_child[c];
 		zio_vdev_io_redone(zio);
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 		return;
 	}
 
 	if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
 		/* Start from the last good copy; NULL if there is none. */
 		abd_t *best_abd = NULL;
 		if (last_good_copy >= 0)
 			best_abd = mm->mm_child[last_good_copy].mc_abd;
 
 		/*
 		 * If we're scrubbing but don't have a BP available (because
 		 * this vdev is under a raidz or draid vdev) then the best we
 		 * can do is compare all of the copies read.  If they're not
 		 * identical then return a checksum error and the most likely
 		 * correct data.  The raidz code will issue a repair I/O if
 		 * possible.
 		 */
 		if (zio->io_bp == NULL) {
 			ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
 			    zio->io_vd->vdev_ops == &vdev_spare_ops);
 
 			abd_t *pref_abd = NULL;
 			for (c = 0; c < last_good_copy; c++) {
 				mc = &mm->mm_child[c];
 				if (mc->mc_error || !mc->mc_tried)
 					continue;
 
 				if (abd_cmp(mc->mc_abd, best_abd) != 0)
 					zio->io_error = SET_ERROR(ECKSUM);
 
 				/*
 				 * The distributed spare is always preferred
 				 * by vdev_mirror_child_select() so it's
 				 * considered to be the best candidate.
 				 */
 				if (pref_abd == NULL &&
 				    mc->mc_vd->vdev_ops ==
 				    &vdev_draid_spare_ops)
 					pref_abd = mc->mc_abd;
 
 				/*
 				 * In the absence of a preferred copy, use
 				 * the parent pointer to avoid a memory copy.
 				 */
 				if (mc->mc_abd == zio->io_abd)
 					best_abd = mc->mc_abd;
 			}
 			if (pref_abd)
 				best_abd = pref_abd;
 		} else {
 
 			/*
 			 * If we have a BP available, then checksums are
 			 * already verified and we just need a buffer
 			 * with valid data, preferring parent one to
 			 * avoid a memory copy.
 			 */
 			for (c = 0; c < last_good_copy; c++) {
 				mc = &mm->mm_child[c];
 				if (mc->mc_error || !mc->mc_tried)
 					continue;
 				if (mc->mc_abd == zio->io_abd) {
 					best_abd = mc->mc_abd;
 					break;
 				}
 			}
 		}
 
 		/*
 		 * Make the parent buffer hold the chosen data, then free
 		 * every child buffer that is not the parent buffer itself.
 		 */
 		if (best_abd && best_abd != zio->io_abd)
 			abd_copy(zio->io_abd, best_abd, zio->io_size);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			if (mc->mc_abd != zio->io_abd)
 				abd_free(mc->mc_abd);
 			mc->mc_abd = NULL;
 		}
 	}
 
 	if (good_copies == 0) {
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT(zio->io_error != 0);
 	}
 
 	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
 	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (c = 0; c < mm->mm_children; c++) {
 			/*
 			 * Don't rewrite known good children.
 			 * Not only is it unnecessary, it could
 			 * actually be harmful: if the system lost
 			 * power while rewriting the only good copy,
 			 * there would be no good copies left!
 			 */
 			mc = &mm->mm_child[c];
 
 			if (mc->mc_error == 0) {
 				vdev_ops_t *ops = mc->mc_vd->vdev_ops;
 
 				if (mc->mc_tried)
 					continue;
 				/*
 				 * We didn't try this child.  We need to
 				 * repair it if:
 				 * 1. it's a scrub (in which case we have
 				 * tried everything that was healthy)
 				 *  - or -
 				 * 2. it's an indirect or distributed spare
 				 * vdev (in which case it could point to any
 				 * other vdev, which might have a bad DTL)
 				 *  - or -
 				 * 3. the DTL indicates that this data is
 				 * missing from this vdev
 				 */
 				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
 				    ops != &vdev_indirect_ops &&
 				    ops != &vdev_draid_spare_ops &&
 				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = SET_ERROR(ESTALE);
 			}
 
 			/*
 			 * Repair write: keep the rebuild priority if the
 			 * zio has it, and flag the write as self-healing
 			 * when it is driven by unexpected errors.
 			 */
 			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
 
 /*
  * Derive the mirror's state from the number of faulted and degraded
  * children:
  *  - all children faulted: OFFLINE if they are all offline, else CANT_OPEN;
  *  - at least one faulted or degraded child: DEGRADED;
  *  - otherwise: HEALTHY.
  */
 static void
 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	if (faulted != vd->vdev_children) {
 		/* At least one child is still usable. */
 		if (degraded + faulted != 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 		} else {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY,
 			    VDEV_AUX_NONE);
 		}
 		return;
 	}
 
 	if (vdev_children_are_offline(vd)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
 		    VDEV_AUX_CHILDREN_OFFLINE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 	    VDEV_AUX_NO_REPLICAS);
 }
 
 /*
  * Return the maximum asize for a rebuild zio in the provided range:
  * max_segment rounded up to the vdev's ashift and capped at
  * SPA_MAXBLOCKSIZE, converted to an allocated size, and never more than
  * the asize passed in.  The start offset is not used.
  */
 static uint64_t
 vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
     uint64_t max_segment)
 {
 	(void) start;
 
 	uint64_t psize = P2ROUNDUP(max_segment, 1 << vd->vdev_ashift);
 
 	if (psize > SPA_MAXBLOCKSIZE)
 		psize = SPA_MAXBLOCKSIZE;
 
 	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
 }
 
 /*
  * Operations vector for mirror vdevs.  Open, close, I/O and state-change
  * handling are implemented in this file; the asize, min_asize, xlate and
  * need_resilver callbacks use the vdev_default_* implementations.
  */
 vdev_ops_t vdev_mirror_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Operations vector for replacing vdevs.  It uses the same callbacks as
  * vdev_mirror_ops and differs only in vdev_op_type.
  */
 vdev_ops_t vdev_replacing_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Operations vector for spare vdevs.  It uses the same callbacks as
  * vdev_mirror_ops and differs only in vdev_op_type.
  */
 vdev_ops_t vdev_spare_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Module parameters for the load increments that vdev_mirror_load() adds
  * for rotating media.
  */
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
 	"Rotating media load increment for non-seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
 	ZMOD_RW, "Rotating media load increment for seeking I/Os");
 
-/* BEGIN CSTYLED */
 /* Distance threshold below which vdev_mirror_load() halves the seek cost. */
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
 	ZMOD_RW,
 	"Offset in bytes from the last I/O which triggers "
 	"a reduced rotating media seek increment");
-/* END CSTYLED */
 
 /*
  * Module parameters for the load increments that vdev_mirror_load() adds
  * for non-rotating media.
  */
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
 	ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
 	ZMOD_RW, "Non-rotating media load increment for seeking I/Os");
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 045395549577..e4487c485075 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1,5124 +1,5122 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_tx.h>
 #include <sys/abd.h>
 #include <sys/zfs_rlock.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/dsl_scan.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
 #endif
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,
  * we use a simple XOR of all the data columns. For double or triple parity,
  * we use a special case of Reed-Solomon coding. This extends the
  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  * former is also based. The latter is designed to provide higher performance
  * for writes.
  *
  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  * amended six years later identifying a critical flaw that invalidates its
  * claims. Nevertheless, the technique can be adapted to work for up to
  * triple parity. For additional parity, the amendment "Note: Correction to
  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  * is viable, but the additional complexity means that write performance will
  * suffer.
  *
  * All of the methods above operate on a Galois field, defined over the
  * integers mod 2^N. In our case we choose N=8, i.e. GF(2^8), so that all
  * elements can be expressed with a single byte. Briefly, the operations on
  * the field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
  *
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
  *	(A * 2)_4 = A_3 + A_7
  *	(A * 2)_3 = A_2 + A_7
  *	(A * 2)_2 = A_1 + A_7
  *	(A * 2)_1 = A_0
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  * As an aside, this multiplication is derived from the error correcting
  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  * than field addition). The inverse of a field element A (A^-1) is therefore
  * A ^ (255 - 1) = A^254.
  *
  * The up-to-three parity columns, P, Q, R over several data columns,
  * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  * independent coefficients. (There are no additional coefficients that have
  * this property which is why the uncorrected Plank method breaks down.)
  *
  * See the reconstruction code below for how P, Q and R can be used
  * individually or in concert to recover missing data columns.
  */
 
 /*
  * Identifiers for the P, Q and R parity columns.
  * NOTE(review): presumably used as column indices into a row's parity
  * columns -- confirm at the use sites.
  */
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
 
 /*
  * Field multiplication of a single byte value by 2 and by 4.
  *
  * The shift is not masked: when bit 7 of x is set, the result also has
  * bit 8 set, so the value is only a valid field element once it has been
  * narrowed to 8 bits (e.g. by storing it in a uint8_t).  The argument is
  * evaluated twice by MUL_2 (four times by MUL_4), so it must be free of
  * side effects.
  */
 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
 /*
  * We provide a mechanism to perform the field multiplication operation on a
  * 64-bit value all at once rather than a byte at a time. This works by
  * creating a mask from the top bit in each byte and using that to
  * conditionally apply the XOR of 0x1d.
  *
  * VDEV_RAIDZ_64MUL_2(x, mask) multiplies each of the eight bytes packed in
  * x by 2, in place.  mask is a caller-supplied scratch lvalue (expected to
  * be an unsigned 64-bit variable) that is overwritten:
  *
  *   1. mask keeps only bit 7 of every byte of x.
  *   2. (mask << 1) - (mask >> 7) turns each 0x80 byte of mask into 0xff
  *      (0x100 - 0x01 within that byte lane) and leaves the other bytes 0.
  *   3. x is shifted left by one; the 0xfe.. constant clears the bit that
  *      crossed over from the byte below, and 0x1d is XORed into exactly
  *      the bytes selected by mask.
  *
  * VDEV_RAIDZ_64MUL_4 applies the multiply-by-2 step twice.
  *
  * Both macros expand to a bare { } block rather than an expression, and
  * they reference their arguments several times, so the arguments should be
  * plain variables without side effects.
  */
 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
 { \
 	(mask) = (x) & 0x8080808080808080ULL; \
 	(mask) = ((mask) << 1) - ((mask) >> 7); \
 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 }
 
 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
 { \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
 
 /*
  * Big Theory Statement for how a RAIDZ VDEV is expanded
  *
  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
  * that have been previously expanded can be expanded again.
  *
  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
  * the VDEV) when an expansion starts.  And the expansion will pause if any
  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
  * operations on the pool can continue while an expansion is in progress (e.g.
  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
  * and zpool initialize which can't be run during an expansion.  Following a
  * reboot or export/import, the expansion resumes where it left off.
  *
  * == Reflowing the Data ==
  *
  * The expansion involves reflowing (copying) the data from the current set
  * of disks to spread it across the new set which now has one more disk. This
  * reflow operation is similar to reflowing text when the column width of a
  * text editor window is expanded. The text doesn’t change but the location of
  * the text changes to accommodate the new width. An example reflow result for
  * a 4-wide RAIDZ1 to a 5-wide is shown below.
  *
  *                            Reflow End State
  *            Each letter indicates a parity group (logical stripe)
  *
  *         Before expansion                         After Expansion
  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
  *  +------+------+------+------+         +------+------+------+------+------+
  *
  * This reflow approach has several advantages. There is no need to read or
  * modify the block pointers or recompute any block checksums.  The reflow
  * doesn’t need to know where the parity sectors reside. We can read and write
  * data sequentially and the copy can occur in a background thread in open
  * context. The design also allows for fast discovery of what data to copy.
  *
  * The VDEV metaslabs are processed, one at a time, to copy the block data to
  * have it flow across all the disks. The metaslab is disabled for allocations
  * during the copy. As an optimization, we only copy the allocated data which
  * can be determined by looking at the metaslab range tree. During the copy we
  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
  * need to be able to survive losing parity count disks).  This means we
  * cannot overwrite data during the reflow that would be needed if a disk is
  * lost.
  *
  * After the reflow completes, all newly-written blocks will have the new
  * layout, i.e., they will have the parity to data ratio implied by the new
  * number of disks in the RAIDZ group.  Even though the reflow copies all of
  * the allocated space (data and parity), it is only rearranged, not changed.
  *
  * This act of reflowing the data has a few implications about blocks
  * that were written before the reflow completes:
  *
  *  - Old blocks will still use the same amount of space (i.e., they will have
  *    the parity to data ratio implied by the old number of disks in the RAIDZ
  *    group).
  *  - Reading old blocks will be slightly slower than before the reflow, for
  *    two reasons. First, we will have to read from all disks in the RAIDZ
  *    VDEV, rather than being able to skip the children that contain only
  *    parity of this block (because the data of a single block is now spread
  *    out across all the disks).  Second, in most cases there will be an extra
  *    bcopy, needed to rearrange the data back to its original layout in memory.
  *
  * == Scratch Area ==
  *
  * As we copy the block data, we can only progress to the point that writes
  * will not overlap with blocks whose progress has not yet been recorded on
  * disk.  Since partially-copied rows are always read from the old location,
  * we need to stop one row before the sector-wise overlap, to prevent any
  * row-wise overlap. For example, in the diagram above, when we reflow sector
  * B6 it will overwrite the original location for B5.
  *
  * To get around this, a scratch space is used so that we can start copying
  * without risking data loss by overlapping the row. As an added benefit, it
  * improves performance at the beginning of the reflow, but that small perf
  * boost wouldn't be worth the complexity on its own.
  *
  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
  * the widths will likely be single digits so we can get a substantial chunk
  * size using only a few MB of scratch per disk.
  *
  * The scratch area is persisted to disk which holds a large amount of reflowed
  * state. We can always read the partially written stripes when a disk fails or
  * the copy is interrupted (crash) during the initial copying phase and also
  * get past a small chunk size restriction.  At a minimum, the scratch space
  * must be large enough to get us to the point that one row does not overlap
  * itself when moved (i.e new_width^2).  But going larger is even better. We
  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
  * as our scratch space to handle overwriting the initial part of the VDEV.
  *
  *	0     256K   512K                    4M
  *	+------+------+-----------------------+-----------------------------
  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
  *	+------+------+-----------------------+-------------------------------
  *                        Scratch Area
  *
  * == Reflow Progress Updates ==
  * After the initial scratch-based reflow, the expansion process works
  * similarly to device removal. We create a new open context thread which
  * reflows the data, and periodically kicks off sync tasks to update logical
  * state. In this case, state is the committed progress (offset of next data
  * to copy). We need to persist the completed offset on disk, so that if we
  * crash we know which format each VDEV offset is in.
  *
  * == Time Dependent Geometry ==
  *
  * In non-expanded RAIDZ, blocks are read from disk in a column by column
  * fashion. For a multi-row block, the second sector is in the first column
  * not in the second column. This allows us to issue full reads for each
  * column directly into the request buffer. The block data is thus laid out
  * sequentially in a column-by-column fashion.
  *
  * For example, in the before expansion diagram above, one logical block might
  * be sectors G19-H26. The parity is in G19,H23; and the data is in
  * G20,H24,G21,H25,G22,H26.
  *
  * After a block is reflowed, the sectors that were all in the original column
  * data can now reside in different columns. When reading from an expanded
  * VDEV, we need to know the logical stripe width for each block so we can
  * reconstitute the block’s data after the reads are completed. Likewise,
  * when we perform the combinatorial reconstruction we need to know the
  * original width so we can retry combinations from the past layouts.
  *
  * Time dependent geometry is what we call having blocks with different layouts
  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
  * block’s birth time (+ the time expansion ended) to establish the correct
  * width for a given block. After an expansion completes, we record the time
  * for blocks written with a particular width (geometry).
  *
  * == On Disk Format Changes ==
  *
  * New pool feature flag, 'raidz_expansion' whose reference count is the number
  * of RAIDZ VDEVs that have been expanded.
  *
  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
  *
  * The uberblock can point to arbitrary blocks, which might be on the
  * expanding RAIDZ, and might or might not have been expanded, so we need to
  * know which way a block is laid out before reading it. This info is the next
  * offset that needs to be reflowed and we persist that in the uberblock, in
  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
  * After the expansion is complete, we then use the raidz_expand_txgs array
  * (see below) to determine how to read a block and the ub_raidz_reflow_info
  * field is no longer required.
  *
  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
  * state (i.e., active or not) which is also required before reading a block
  * during the initial phase of reflowing the data.
  *
  * The top-level RAIDZ VDEV has two new entries in the nvlist:
  *
  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
  *                            and used after the expansion is complete to
  *                            determine how to read a raidz block
  * 'raidz_expanding' boolean: present during reflow and removed after completion
  *                            used during a spa import to resume an unfinished
  *                            expansion
  *
  * And finally the VDEVs top zap adds the following informational entries:
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
  */
 
 /*
  * For testing only: pause the raidz expansion after reflowing this amount.
  * (accessed by ZTS and ztest)
  *
  * The variable has internal linkage only in _KERNEL builds; in other builds
  * it is a global so that code outside this file can reference it.
  * NOTE(review): 0 presumably means "no limit" -- confirm at the use site.
  */
 #ifdef	_KERNEL
 static
 #endif	/* _KERNEL */
 unsigned long raidz_expand_max_reflow_bytes = 0;
 
 /*
  * For testing only: pause the raidz expansion at a certain point.
  * Non-static, so it is visible outside this file in every build.
  */
 uint_t raidz_expand_pause_point = 0;
 
 /*
  * Maximum amount of copy io's outstanding at once, in bytes.  The default
  * is one SPA_MAXBLOCKSIZE when _ILP32 is defined and ten times that
  * otherwise.
  */
 #ifdef _ILP32
 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
 #else
 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
 #endif
 
 /*
  * Apply raidz map abds aggregation if the number of rows in the map is equal
  * or greater than the value below.
  */
 static unsigned long raidz_io_aggregate_rows = 4;
 
 /*
  * Automatically start a pool scrub when a RAIDZ expansion completes in
  * order to verify the checksums of all blocks which have been copied
  * during the expansion.  Automatic scrubbing is enabled by default and
  * is strongly recommended.
  */
 static int zfs_scrub_after_expand = 1;
 
 /*
  * Free a raidz row together with the buffers its columns own.
  *
  * For each of the first rr_cols columns, rc_abd is released when the column
  * has a non-zero rc_size and rc_orig_data is released when it is set.  The
  * row's rr_abd_empty buffer is released if present.  The row structure was
  * sized for rr_scols columns, so that count is used when returning its
  * memory.
  */
 static void
 vdev_raidz_row_free(raidz_row_t *rr)
 {
 	int idx = 0;
 
 	while (idx < rr->rr_cols) {
 		raidz_col_t *col = &rr->rr_col[idx];
 
 		if (col->rc_size != 0) {
 			abd_free(col->rc_abd);
 		}
 		if (col->rc_orig_data != NULL) {
 			abd_free(col->rc_orig_data);
 		}
 		idx++;
 	}
 
 	if (rr->rr_abd_empty != NULL) {
 		abd_free(rr->rr_abd_empty);
 	}
 
 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
 }
 
 /*
  * Free a raidz map: every row it references, the optional array of
  * physical columns (and any buffers attached to them), and finally the map
  * structure itself.  The map must no longer have rm_lr set.
  */
 void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	for (int row = 0; row < rm->rm_nrows; row++) {
 		vdev_raidz_row_free(rm->rm_row[row]);
 	}
 
 	if (rm->rm_nphys_cols != 0) {
 		for (int pc = 0; pc < rm->rm_nphys_cols; pc++) {
 			abd_t *buf = rm->rm_phys_col[pc].rc_abd;
 
 			if (buf != NULL)
 				abd_free(buf);
 		}
 
 		kmem_free(rm->rm_phys_col,
 		    sizeof (raidz_col_t) * rm->rm_nphys_cols);
 	}
 
 	ASSERT3P(rm->rm_lr, ==, NULL);
 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 }
 
 /*
  * vsd_free callback: release the raidz map stored in the zio's io_vsd.
  */
 static void
 vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	vdev_raidz_map_free(zio->io_vsd);
 }
 
 /*
  * Comparator for reflow_node_t entries: orders two nodes by their re_txg
  * field using TREE_CMP.
  */
 static int
 vdev_raidz_reflow_compare(const void *x1, const void *x2)
 {
 	const reflow_node_t *lhs = x1, *rhs = x2;
 
 	return (TREE_CMP(lhs->re_txg, rhs->re_txg));
 }
 
 /*
  * zio vsd operations for RAID-Z.  The free callback releases the
  * raidz_map_t that is stored in zio->io_vsd.
  */
 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	.vsd_free = vdev_raidz_map_free_vsd,
 };
 
 /*
  * Allocate a zero-filled raidz row with room for `cols` columns.
  *
  * Both rr_cols and rr_scols start out equal to `cols`.  Every column gets
  * its shadow device index and shadow offset set to the "none" sentinels
  * (INT_MAX / UINT64_MAX), and repair is enabled on it unless the zio is a
  * Direct I/O read.
  *
  * Returns the new row.
  */
 raidz_row_t *
 vdev_raidz_row_alloc(int cols, zio_t *zio)
 {
 	raidz_row_t *row;
 
 	row = kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
 	row->rr_cols = cols;
 	row->rr_scols = cols;
 
 	for (int idx = 0; idx < cols; idx++) {
 		raidz_col_t *col = &row->rr_col[idx];
 
 		col->rc_shadow_devidx = INT_MAX;
 		col->rc_shadow_offset = UINT64_MAX;
 
 		/*
 		 * Self healing must not happen for Direct I/O reads: nothing
 		 * prevents the caller's buffer from being modified while the
 		 * I/O is in flight, so a buffer that passed checksum
 		 * verification could be changed afterwards and the altered
 		 * contents written out as a "repair".
 		 */
 		if (zio->io_flags & ZIO_FLAG_DIO_READ)
 			continue;
 		col->rc_allow_repair = 1;
 	}
 
 	return (row);
 }
 
 /*
  * Attach buffers to the columns of a single-row write map.
  *
  * Parity columns get freshly allocated linear ABDs; data columns reference
  * successive slices of zio->io_abd.  Skip sectors are folded into the
  * column I/Os where possible: a parity column is allocated one sector
  * larger (zero-filled tail), and a data column becomes a gang ABD of its
  * data plus a zero sector.  rc_size is never changed by this padding.
  */
 static void
 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
 {
 	raidz_row_t *row = rm->rm_row[0];
 	const uint64_t sector = 1ULL << ashift;
 	uint64_t data_off = 0;
 	int wrapped = 0;
 	int col;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/*
 	 * Work out how many skip sectors land on parity columns, either
 	 * because skipping starts at column zero or because the skip range
 	 * wraps around past the last column.
 	 */
 	if (rm->rm_skipstart < row->rr_firstdatacol) {
 		ASSERT0(rm->rm_skipstart);
 		wrapped = rm->rm_nskip;
 	} else if (row->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
 		wrapped = (rm->rm_skipstart + rm->rm_nskip) % row->rr_scols;
 	}
 
 	/*
 	 * Columns beyond rr_cols (rc_size == 0) are optional single skip
 	 * sectors; they are counted here and dealt with in
 	 * vdev_raidz_io_start_write().
 	 */
 	int nskipped = row->rr_scols - row->rr_cols;
 
 	/*
 	 * Parity columns.  Linear ABDs are used because the parity code
 	 * computes directly into the buffer, which avoids a memcpy back into
 	 * the ABD afterwards.  Issuing the skip sector together with the
 	 * parity column reduces contention on the child vdev queue locks
 	 * (vq_lock).
 	 */
 	for (col = 0; col < row->rr_firstdatacol; col++) {
 		raidz_col_t *pc = &row->rr_col[col];
 
 		if (col >= wrapped) {
 			pc->rc_abd = abd_alloc_linear(pc->rc_size, B_FALSE);
 			continue;
 		}
 
 		pc->rc_abd = abd_alloc_linear(pc->rc_size + sector, B_FALSE);
 		abd_zero_off(pc->rc_abd, pc->rc_size, sector);
 		nskipped++;
 	}
 
 	/* Data columns: `col` carries on from the first data column. */
 	for (; col < row->rr_cols; col++) {
 		raidz_col_t *dc = &row->rr_col[col];
 		abd_t *slice = abd_get_offset_struct(&dc->rc_abdstruct,
 		    zio->io_abd, data_off, dc->rc_size);
 
 		if (col < rm->rm_skipstart || nskipped >= rm->rm_nskip) {
 			dc->rc_abd = slice;
 		} else {
 			/*
 			 * Emit the skip sector with the data as one gang ABD
 			 * so a single child I/O covers both; this improves
 			 * aggregation continuity and reduces vq_lock
 			 * contention.
 			 *
 			 * rc_size deliberately stays at the data size:
 			 * vdev_raidz_generate_parity_row() walks the ABDs by
 			 * rc_size, and the zeroed skip sector must not take
 			 * part in parity because it is not used during
 			 * reconstruction.
 			 */
 			dc->rc_abd = abd_alloc_gang();
 			abd_gang_add(dc->rc_abd, slice, B_TRUE);
 			abd_gang_add(dc->rc_abd, abd_get_zeros(sector),
 			    B_TRUE);
 			nskipped++;
 		}
 
 		data_off += dc->rc_size;
 	}
 
 	ASSERT3U(data_off, ==, zio->io_size);
 	ASSERT3S(nskipped, ==, rm->rm_nskip);
 }
 
 /*
  * Attach buffers to the columns of a single-row read map: parity columns
  * get newly allocated linear ABDs of their own size, and data columns
  * reference consecutive slices of zio->io_abd.
  */
 static void
 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
 {
 	raidz_row_t *row = rm->rm_row[0];
 	uint64_t data_off = 0;
 	int col = 0;
 
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/* Parity columns come first. */
 	while (col < row->rr_firstdatacol) {
 		raidz_col_t *pc = &row->rr_col[col];
 
 		pc->rc_abd = abd_alloc_linear(pc->rc_size, B_FALSE);
 		col++;
 	}
 
 	/* The remaining columns map straight onto the caller's buffer. */
 	while (col < row->rr_cols) {
 		raidz_col_t *dc = &row->rr_col[col];
 
 		dc->rc_abd = abd_get_offset_struct(&dc->rc_abdstruct,
 		    zio->io_abd, data_off, dc->rc_size);
 		data_off += dc->rc_size;
 		col++;
 	}
 }
 
 /*
  * Divides the IO evenly across all child vdevs; usually, dcols is
  * the number of children in the target vdev.
  *
  * Parameters:
  *   zio     - the I/O being mapped; its offset, size, type and abd are used
  *   ashift  - log2 of the sector size used for the geometry calculations
  *   dcols   - number of columns (children) the I/O is spread across
  *   nparity - number of parity columns
  *
  * Returns a newly allocated map containing exactly one row, with column
  * device indices, offsets, sizes and buffers filled in and rm_ops set.
  *
  * Avoid inlining the function to keep vdev_raidz_io_start(), which
  * is this function's only caller, as small as possible on the stack.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_row_t *rr;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = zio->io_offset >> ashift;
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = zio->io_size >> ashift;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << ashift;
 	uint64_t acols, scols;
 
 	/* The map is sized for a single row. */
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 	rm->rm_nrows = 1;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = s / (dcols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (dcols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/*
 	 * acols: The columns that will be accessed.
 	 * scols: The columns that will be accessed or skipped.
 	 */
 	if (q == 0) {
 		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
 		acols = dcols;
 		scols = dcols;
 	}
 
 	ASSERT3U(acols, <=, scols);
 	/*
 	 * The row is allocated with scols columns (which also sets rr_scols);
 	 * rr_cols is then narrowed to the columns actually accessed.
 	 */
 	rr = vdev_raidz_row_alloc(scols, zio);
 	rm->rm_row[0] = rr;
 	rr->rr_cols = acols;
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = zio->io_offset;
 	rr->rr_size = zio->io_size;
 #endif
 
 	uint64_t asize = 0;
 
 	/*
 	 * Assign each column its child index, child offset and size.  Columns
 	 * start at child f; once f + c reaches dcols they wrap around to the
 	 * first child, one sector further into each child.
 	 */
 	for (uint64_t c = 0; c < scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		uint64_t col = f + c;
 		uint64_t coff = o;
 		if (col >= dcols) {
 			col -= dcols;
 			coff += 1ULL << ashift;
 		}
 		rc->rc_devidx = col;
 		rc->rc_offset = coff;
 
 		/*
 		 * Skip-only columns carry no data, "big" columns carry one
 		 * extra sector, all others carry q sectors.
 		 */
 		if (c >= acols)
 			rc->rc_size = 0;
 		else if (c < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	/* Pad the total sector count up to a multiple of (nparity + 1). */
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
 	 * will always be on the same device and, since parity isn't read
 	 * during normal operation, that device's I/O bandwidth won't be
 	 * used effectively. We therefore switch the parity every 1MB.
 	 *
 	 * ... at least that was, ostensibly, the theory. As a practical
 	 * matter unless we juggle the parity between all devices evenly, we
 	 * won't see any benefit. Further, occasional writes that aren't a
 	 * multiple of the LCM of the number of children and the minimum
 	 * stripe width are sufficient to avoid pessimal behavior.
 	 * Unfortunately, this decision created an implicit on-disk format
 	 * requirement that we need to support for all eternity, but only
 	 * for single-parity RAID-Z.
 	 *
 	 * If we intend to skip a sector in the zeroth column for padding
 	 * we must make sure to note this swap. We will never intend to
 	 * skip the first column since at least one data and one parity
 	 * column must appear in each row.
 	 */
 	ASSERT(rr->rr_cols >= 2);
 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 	/*
 	 * Single parity only: swap the device and offset of columns 0 and 1
 	 * when bit 20 (the 1MB bit) of the I/O offset is set.  `o` is reused
 	 * as the temporary for the offset.
 	 */
 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 		uint64_t devidx = rr->rr_col[0].rc_devidx;
 		o = rr->rr_col[0].rc_offset;
 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 		rr->rr_col[1].rc_devidx = devidx;
 		rr->rr_col[1].rc_offset = o;
 		if (rm->rm_skipstart == 0)
 			rm->rm_skipstart = 1;
 	}
 
 	/* Attach buffers; every non-write I/O type takes the read path. */
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_map_alloc_write(zio, rm, ashift);
 	} else {
 		vdev_raidz_map_alloc_read(zio, rm);
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Everything before reflow_offset_synced should have been moved to the new
  * location (read and write completed).  However, this may not yet be reflected
  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
  * uberblock has not yet been written). If reflow is not in progress,
  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
  * entirely before reflow_offset_synced, it will come from the new location.
  * Otherwise this row will come from the old location.  Therefore, rows that
  * straddle the reflow_offset_synced will come from the old location.
  *
  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
  * been copied, but not yet reflected in the on-disk progress
  * (reflow_offset_synced), it will also be written to the new (already copied)
  * offset.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc_expanded(zio_t *zio,
     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
     uint64_t nparity, uint64_t reflow_offset_synced,
     uint64_t reflow_offset_next, boolean_t use_scratch)
 {
 	abd_t *abd = zio->io_abd;
 	uint64_t offset = zio->io_offset;
 	uint64_t size = zio->io_size;
 
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 * AKA "full rows"
 	 */
 	uint64_t q = s / (logical_cols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (logical_cols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/* How many rows contain data (not skip) */
 	uint64_t rows = howmany(tot, logical_cols);
 	int cols = MIN(tot, logical_cols);
 
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
 	    KM_SLEEP);
 	rm->rm_nrows = rows;
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 	uint64_t asize = 0;
 
 	for (uint64_t row = 0; row < rows; row++) {
 		boolean_t row_use_scratch = B_FALSE;
 		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
 		rm->rm_row[row] = rr;
 
 		/* The starting RAIDZ (parent) vdev sector of the row. */
 		uint64_t b = (offset >> ashift) + row * logical_cols;
 
 		/*
 		 * If we are in the middle of a reflow, and the copying has
 		 * not yet completed for any part of this row, then use the
 		 * old location of this row.  Note that reflow_offset_synced
 		 * reflects the i/o that's been completed, because it's
 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
 		 * This is sufficient for our check, even if that progress
 		 * has not yet been recorded to disk (reflected in
 		 * spa_ubsync).  Also note that we consider the last row to
 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
 		 * this calculation. This causes a tiny bit of unnecessary
 		 * double-writes but is safe and simpler to calculate.
 		 */
 		int row_phys_cols = physical_cols;
 		if (b + cols > reflow_offset_synced >> ashift)
 			row_phys_cols--;
 		else if (use_scratch)
 			row_use_scratch = B_TRUE;
 
 		/* starting child of this row */
 		uint64_t child_id = b % row_phys_cols;
 		/* The starting byte offset on each child vdev. */
 		uint64_t child_offset = (b / row_phys_cols) << ashift;
 
 		/*
 		 * Note, rr_cols is the entire width of the block, even
 		 * if this row is shorter.  This is needed because parity
 		 * generation (for Q and R) needs to know the entire width,
 		 * because it treats the short row as though it was
 		 * full-width (and the "phantom" sectors were zero-filled).
 		 *
 		 * Another approach to this would be to set cols shorter
 		 * (to just the number of columns that we might do i/o to)
 		 * and have another mechanism to tell the parity generation
 		 * about the "entire width".  Reconstruction (at least
 		 * vdev_raidz_reconstruct_general()) would also need to
 		 * know about the "entire width".
 		 */
 		rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 		/*
 		 * note: rr_size is PSIZE, not ASIZE
 		 */
 		rr->rr_offset = b << ashift;
 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
 #endif
 
 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
 			if (child_id >= row_phys_cols) {
 				child_id -= row_phys_cols;
 				child_offset += 1ULL << ashift;
 			}
 			raidz_col_t *rc = &rr->rr_col[c];
 			rc->rc_devidx = child_id;
 			rc->rc_offset = child_offset;
 
 			/*
 			 * Get this from the scratch space if appropriate.
 			 * This only happens if we crashed in the middle of
 			 * raidz_reflow_scratch_sync() (while it's running,
 			 * the rangelock prevents us from doing concurrent
 			 * io), and even then only during zpool import or
 			 * when the pool is imported readonly.
 			 */
 			if (row_use_scratch)
 				rc->rc_offset -= VDEV_BOOT_SIZE;
 
 			uint64_t dc = c - rr->rr_firstdatacol;
 			if (c < rr->rr_firstdatacol) {
 				rc->rc_size = 1ULL << ashift;
 
 				/*
 				 * Parity sectors' rc_abd's are set below
 				 * after determining if this is an aggregation.
 				 */
 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
 				/*
 				 * Past the end of the block (even including
 				 * skip sectors).  This sector is part of the
 				 * map so that we have full rows for p/q parity
 				 * generation.
 				 */
 				rc->rc_size = 0;
 				rc->rc_abd = NULL;
 			} else {
 				/* "data column" (col excluding parity) */
 				uint64_t off;
 
 				if (c < bc || r == 0) {
 					off = dc * rows + row;
 				} else {
 					off = r * rows +
 					    (dc - r) * (rows - 1) + row;
 				}
 				rc->rc_size = 1ULL << ashift;
 				rc->rc_abd = abd_get_offset_struct(
 				    &rc->rc_abdstruct, abd, off << ashift,
 				    rc->rc_size);
 			}
 
 			if (rc->rc_size == 0)
 				continue;
 
 			/*
 			 * If any part of this row is in both old and new
 			 * locations, the primary location is the old
 			 * location. If this sector was already copied to the
 			 * new location, we need to also write to the new,
 			 * "shadow" location.
 			 *
 			 * Note, `row_phys_cols != physical_cols` indicates
 			 * that the primary location is the old location.
 			 * `b+c < reflow_offset_next` indicates that the copy
 			 * to the new location has been initiated. We know
 			 * that the copy has completed because we have the
 			 * rangelock, which is held exclusively while the
 			 * copy is in progress.
 			 */
 			if (row_use_scratch ||
 			    (row_phys_cols != physical_cols &&
 			    b + c < reflow_offset_next >> ashift)) {
 				rc->rc_shadow_devidx = (b + c) % physical_cols;
 				rc->rc_shadow_offset =
 				    ((b + c) / physical_cols) << ashift;
 				if (row_use_scratch)
 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
 			}
 
 			asize += rc->rc_size;
 		}
 
 		/*
 		 * See comment in vdev_raidz_map_alloc()
 		 */
 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
 		    (offset & (1ULL << 20))) {
 			ASSERT(rr->rr_cols >= 2);
 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 			int devidx0 = rr->rr_col[0].rc_devidx;
 			uint64_t offset0 = rr->rr_col[0].rc_offset;
 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
 			uint64_t shadow_offset0 =
 			    rr->rr_col[0].rc_shadow_offset;
 
 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 			rr->rr_col[0].rc_shadow_devidx =
 			    rr->rr_col[1].rc_shadow_devidx;
 			rr->rr_col[0].rc_shadow_offset =
 			    rr->rr_col[1].rc_shadow_offset;
 
 			rr->rr_col[1].rc_devidx = devidx0;
 			rr->rr_col[1].rc_offset = offset0;
 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
 		}
 	}
 	ASSERT3U(asize, ==, tot << ashift);
 
 	/*
 	 * Determine if the block is contiguous, in which case we can use
 	 * an aggregation.
 	 */
 	if (rows >= raidz_io_aggregate_rows) {
 		rm->rm_nphys_cols = physical_cols;
 		rm->rm_phys_col =
 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
 		    KM_SLEEP);
 
 		/*
 		 * Determine the aggregate io's offset and size, and check
 		 * that the io is contiguous.
 		 */
 		for (int i = 0;
 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 
 				if (rc->rc_size == 0)
 					continue;
 
 				if (prc->rc_size == 0) {
 					ASSERT0(prc->rc_offset);
 					prc->rc_offset = rc->rc_offset;
 				} else if (prc->rc_offset + prc->rc_size !=
 				    rc->rc_offset) {
 					/*
 					 * This block is not contiguous and
 					 * therefore can't be aggregated.
 					 * This is expected to be rare, so
 					 * the cost of allocating and then
 					 * freeing rm_phys_col is not
 					 * significant.
 					 */
 					kmem_free(rm->rm_phys_col,
 					    sizeof (raidz_col_t) *
 					    rm->rm_nphys_cols);
 					rm->rm_phys_col = NULL;
 					rm->rm_nphys_cols = 0;
 					break;
 				}
 				prc->rc_size += rc->rc_size;
 			}
 		}
 	}
 	if (rm->rm_phys_col != NULL) {
 		/*
 		 * Allocate aggregate ABD's.
 		 */
 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
 			raidz_col_t *prc = &rm->rm_phys_col[i];
 
 			prc->rc_devidx = i;
 
 			if (prc->rc_size == 0)
 				continue;
 
 			prc->rc_abd =
 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
 			    B_FALSE);
 		}
 
 		/*
 		 * Point the parity abd's into the aggregate abd's.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 				rc->rc_abd =
 				    abd_get_offset_struct(&rc->rc_abdstruct,
 				    prc->rc_abd,
 				    rc->rc_offset - prc->rc_offset,
 				    rc->rc_size);
 			}
 		}
 	} else {
 		/*
 		 * Allocate new abd's for the parity sectors.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				rc->rc_abd =
 				    abd_alloc_linear(rc->rc_size,
 				    B_TRUE);
 			}
 		}
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Iteration state shared with the parity-generation callbacks
  * vdev_raidz_p_func(), vdev_raidz_pq_func() and vdev_raidz_pqr_func().
  * Each member is a cursor into the corresponding parity buffer; a callback
  * advances the cursors it uses by one word per source word consumed.
  * Cursors for parity levels that are not being generated must be NULL
  * (the callbacks assert this).
  */
 struct pqr_struct {
 	uint64_t *p;
 	uint64_t *q;
 	uint64_t *r;
 };
 
 /*
  * abd_iterate_func() callback for P-only parity: XOR each 64-bit word of
  * the source chunk into the P buffer, advancing the P cursor held in the
  * pqr_struct passed as `private`.  Always returns 0.
  */
 static int
 vdev_raidz_p_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *word = buf;
 	int nwords = size / sizeof (word[0]);
 
 	ASSERT(pqr->p && !pqr->q && !pqr->r);
 
 	for (int w = 0; w < nwords; w++) {
 		*pqr->p ^= word[w];
 		pqr->p++;
 	}
 
 	return (0);
 }
 
 /*
  * abd_iterate_func() callback for P+Q parity.  For every 64-bit source
  * word: XOR it into P, then apply VDEV_RAIDZ_64MUL_2() to the current Q
  * word and XOR the source word into it.  Both cursors in the pqr_struct
  * passed as `private` are advanced.  Always returns 0.
  */
 static int
 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *word = buf;
 	int nwords = size / sizeof (word[0]);
 	uint64_t mask;
 
 	ASSERT(pqr->p && pqr->q && !pqr->r);
 
 	for (int w = 0; w < nwords; w++) {
 		/* P: plain XOR accumulation. */
 		*pqr->p ^= word[w];
 		pqr->p++;
 
 		/* Q: scale the running value, then fold in the data. */
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= word[w];
 		pqr->q++;
 	}
 
 	return (0);
 }
 
 /*
  * abd_iterate_func() callback for P+Q+R parity.  For every 64-bit source
  * word: XOR it into P; apply VDEV_RAIDZ_64MUL_2() to Q and XOR the word
  * in; apply VDEV_RAIDZ_64MUL_4() to R and XOR the word in.  All three
  * cursors in the pqr_struct passed as `private` are advanced.  Always
  * returns 0.
  */
 static int
 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *word = buf;
 	int nwords = size / sizeof (word[0]);
 	uint64_t mask;
 
 	ASSERT(pqr->p && pqr->q && pqr->r);
 
 	for (int w = 0; w < nwords; w++) {
 		*pqr->p ^= word[w];
 		pqr->p++;
 
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= word[w];
 		pqr->q++;
 
 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 		*pqr->r ^= word[w];
 		pqr->r++;
 	}
 
 	return (0);
 }
 
 /*
  * Compute the P parity column of a row: copy the first data column into
  * the P buffer, then XOR every remaining data column into it via
  * vdev_raidz_p_func().
  */
 static void
 vdev_raidz_generate_parity_p(raidz_row_t *rr)
 {
 	uint64_t *parity = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 
 		if (c != rr->rr_firstdatacol) {
 			/* Later data columns are folded into P. */
 			struct pqr_struct pqr = { parity, NULL, NULL };
 
 			(void) abd_iterate_func(col->rc_abd, 0, col->rc_size,
 			    vdev_raidz_p_func, &pqr);
 			continue;
 		}
 
 		/* The first data column seeds P. */
 		abd_copy_to_buf(parity, col->rc_abd, col->rc_size);
 	}
 }
 
 /*
  * Compute the P and Q parity columns of a row.  The first data column
  * seeds both buffers (any words beyond its length are zeroed); every
  * later column is folded in with vdev_raidz_pq_func().  Columns shorter
  * than the parity columns are treated as zero-filled, which only
  * requires the VDEV_RAIDZ_64MUL_2() step on the remaining Q words.
  */
 static void
 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 		uint64_t ccnt = col->rc_size / sizeof (p[0]);
 
 		if (c != rr->rr_firstdatacol) {
 			struct pqr_struct pqr = { p, q, NULL };
 			uint64_t mask;
 
 			ASSERT(ccnt <= pcnt);
 			(void) abd_iterate_func(col->rc_abd, 0, col->rc_size,
 			    vdev_raidz_pq_func, &pqr);
 
 			/*
 			 * A short column behaves as if padded with zeros:
 			 * P is unaffected, Q still needs its scaling step.
 			 */
 			for (uint64_t w = ccnt; w < pcnt; w++) {
 				VDEV_RAIDZ_64MUL_2(q[w], mask);
 			}
 			continue;
 		}
 
 		/* First data column: seed P and Q, zero any tail. */
 		ASSERT(ccnt == pcnt || ccnt == 0);
 		abd_copy_to_buf(p, col->rc_abd, col->rc_size);
 		(void) memcpy(q, p, col->rc_size);
 
 		for (uint64_t w = ccnt; w < pcnt; w++) {
 			p[w] = 0;
 			q[w] = 0;
 		}
 	}
 }
 
 /*
  * Compute the P, Q and R parity columns of a row.  The first data column
  * seeds all three buffers (any words beyond its length are zeroed);
  * every later column is folded in with vdev_raidz_pqr_func().  Columns
  * shorter than the parity columns are treated as zero-filled, which only
  * requires the VDEV_RAIDZ_64MUL_2()/VDEV_RAIDZ_64MUL_4() steps on the
  * remaining Q and R words.
  */
 static void
 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 		uint64_t ccnt = col->rc_size / sizeof (p[0]);
 
 		if (c != rr->rr_firstdatacol) {
 			struct pqr_struct pqr = { p, q, r };
 			uint64_t mask;
 
 			ASSERT(ccnt <= pcnt);
 			(void) abd_iterate_func(col->rc_abd, 0, col->rc_size,
 			    vdev_raidz_pqr_func, &pqr);
 
 			/*
 			 * A short column behaves as if padded with zeros:
 			 * P is unaffected, Q and R still need scaling.
 			 */
 			for (uint64_t w = ccnt; w < pcnt; w++) {
 				VDEV_RAIDZ_64MUL_2(q[w], mask);
 				VDEV_RAIDZ_64MUL_4(r[w], mask);
 			}
 			continue;
 		}
 
 		/* First data column: seed P, Q and R, zero any tail. */
 		ASSERT(ccnt == pcnt || ccnt == 0);
 		abd_copy_to_buf(p, col->rc_abd, col->rc_size);
 		(void) memcpy(q, p, col->rc_size);
 		(void) memcpy(r, p, col->rc_size);
 
 		for (uint64_t w = ccnt; w < pcnt; w++) {
 			p[w] = 0;
 			q[w] = 0;
 			r[w] = 0;
 		}
 	}
 }
 
 /*
  * Generate the parity for one row into its leading (parity) columns.
  * The number of parity columns is rr_firstdatacol.  The accelerated math
  * implementation is tried first; only when it reports
  * RAIDZ_ORIGINAL_IMPL do we fall back to the generators in this file.
  */
 void
 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
 {
 	/*
 	 * A row with no columns is a pad-only row.  This happens when the
 	 * block is handled one row at a time because its logical and
 	 * physical widths differ (RAIDZ expansion); such a row has no
 	 * parity to generate.
 	 */
 	if (rr->rr_cols == 0)
 		return;
 
 	/* Done if the new math implementation handled the row. */
 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	if (rr->rr_firstdatacol == 1)
 		vdev_raidz_generate_parity_p(rr);
 	else if (rr->rr_firstdatacol == 2)
 		vdev_raidz_generate_parity_pq(rr);
 	else if (rr->rr_firstdatacol == 3)
 		vdev_raidz_generate_parity_pqr(rr);
 	else
 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 }
 
 /*
  * Generate parity for every row of the map, one row at a time.
  */
 void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
 	for (int row = 0; row < rm->rm_nrows; row++)
 		vdev_raidz_generate_parity_row(rm, rm->rm_row[row]);
 }
 
 /*
  * abd_iterate_func2() callback: XOR `size` bytes of sbuf into dbuf, one
  * 64-bit word at a time (a trailing partial word is not processed).
  * `private` is unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 {
 	(void) private;
 	const uint64_t *from = sbuf;
 	uint64_t *to = dbuf;
 	int nwords = size / sizeof (from[0]);
 
 	for (int w = 0; w < nwords; w++, to++, from++)
 		*to ^= *from;
 
 	return (0);
 }
 
 /*
  * abd_iterate_func2() callback used while rebuilding a column from Q:
  * for each 64-bit word, apply VDEV_RAIDZ_64MUL_2() to the destination
  * word and then XOR the matching source word into it.  `private` is
  * unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
     void *private)
 {
 	(void) private;
 	uint64_t *acc = dbuf;
 	const uint64_t *data = sbuf;
 	uint64_t mask;
 	int nwords = size / sizeof (acc[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		VDEV_RAIDZ_64MUL_2(acc[w], mask);
 		acc[w] ^= data[w];
 	}
 
 	return (0);
 }
 
 /*
  * abd_iterate_func() callback for the part of the destination that has
  * no matching source data: performs only the VDEV_RAIDZ_64MUL_2() step
  * that vdev_raidz_reconst_q_pre_func() applies to the destination.
  * `private` is unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *acc = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (acc[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		VDEV_RAIDZ_64MUL_2(acc[w], mask);
 	}
 
 	return (0);
 }
 
 /*
  * State for vdev_raidz_reconst_q_post_func(): q is a cursor into the Q
  * parity buffer (advanced one word per destination word processed) and
  * exp is the exponent handed to vdev_raidz_exp2() for every byte of the
  * destination.
  */
 struct reconst_q_struct {
 	uint64_t *q;
 	int exp;
 };
 
 /*
  * abd_iterate_func() callback that finishes a Q-based reconstruction.
  * Each 64-bit destination word is XORed with the next Q parity word, and
  * then each of its 8 bytes is passed through vdev_raidz_exp2() with the
  * exponent stored in the reconst_q_struct given as `private`.  The Q
  * cursor in that struct is advanced.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 {
 	struct reconst_q_struct *state = private;
 	uint64_t *word = buf;
 	int nwords = size / sizeof (word[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		uint8_t *bytes = (uint8_t *)&word[w];
 
 		word[w] ^= *state->q;
 		state->q++;
 
 		for (int k = 0; k < 8; k++)
 			bytes[k] = vdev_raidz_exp2(bytes[k], state->exp);
 	}
 
 	return (0);
 }
 
 /*
  * State for vdev_raidz_reconst_pq_func() and
  * vdev_raidz_reconst_pq_tail_func().  p/q and pxy/qxy are byte cursors
  * into two sets of P and Q parity buffers (vdev_raidz_reconstruct_pq()
  * passes the saved on-disk parity as p/q and the parity regenerated
  * without the two target columns as pxy/qxy).  aexp and bexp are the
  * exponents handed to vdev_raidz_exp2().  All four cursors advance one
  * byte per byte processed.
  */
 struct reconst_pq_struct {
 	uint8_t *p;
 	uint8_t *q;
 	uint8_t *pxy;
 	uint8_t *qxy;
 	int aexp;
 	int bexp;
 };
 
 /*
  * abd_iterate_func2() callback that rebuilds two columns at once.  For
  * each byte:
  *	x = exp2(p ^ pxy, aexp) ^ exp2(q ^ qxy, bexp)
  *	y = (p ^ pxy) ^ x
  * where p, q, pxy, qxy, aexp and bexp come from the reconst_pq_struct
  * passed as `private`; its four cursors are advanced by `size` bytes.
  * The loop index is a size_t so that it matches the type of `size`.
  * Always returns 0.
  */
 static int
 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 	uint8_t *yd = ybuf;
 
 	for (size_t i = 0; i < size; i++) {
 		uint8_t pdiff = *rpq->p ^ *rpq->pxy;
 		uint8_t qdiff = *rpq->q ^ *rpq->qxy;
 
 		xd[i] = vdev_raidz_exp2(pdiff, rpq->aexp) ^
 		    vdev_raidz_exp2(qdiff, rpq->bexp);
 		yd[i] = pdiff ^ xd[i];
 
 		rpq->p++;
 		rpq->q++;
 		rpq->pxy++;
 		rpq->qxy++;
 	}
 
 	return (0);
 }
 
 /*
  * abd_iterate_func() callback for the part of column x that extends
  * past the end of column y.  Performs the same computation on x as
  * vdev_raidz_reconst_pq_func():
  *	x = exp2(p ^ pxy, aexp) ^ exp2(q ^ qxy, bexp)
  * and advances the four cursors in the reconst_pq_struct passed as
  * `private`.  The loop index is a size_t so that it matches the type of
  * `size`.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 
 	for (size_t i = 0; i < size; i++) {
 		uint8_t pdiff = *rpq->p ^ *rpq->pxy;
 		uint8_t qdiff = *rpq->q ^ *rpq->qxy;
 
 		xd[i] = vdev_raidz_exp2(pdiff, rpq->aexp) ^
 		    vdev_raidz_exp2(qdiff, rpq->bexp);
 
 		rpq->p++;
 		rpq->q++;
 		rpq->pxy++;
 		rpq->qxy++;
 	}
 
 	return (0);
 }
 
 /*
  * Rebuild a single data column, tgts[0], from the P parity column:
  * start from a copy of P and XOR in every other data column (each
  * limited to the shorter of its own size and the target's size).
  */
 static void
 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
 
 	ASSERT3U(ntgts, ==, 1);
 	ASSERT3U(x, >=, rr->rr_firstdatacol);
 	ASSERT3U(x, <, rr->rr_cols);
 
 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
 
 	raidz_col_t *target = &rr->rr_col[x];
 	abd_t *parity = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 
 	/* Seed the target with the parity ... */
 	abd_copy_from_buf(target->rc_abd, abd_to_buf(parity),
 	    target->rc_size);
 
 	/* ... and cancel out the contribution of every surviving column. */
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (c == x)
 			continue;
 
 		raidz_col_t *other = &rr->rr_col[c];
 		uint64_t len = MIN(target->rc_size, other->rc_size);
 
 		(void) abd_iterate_func2(target->rc_abd, other->rc_abd, 0, 0,
 		    len, vdev_raidz_reconst_p_func, NULL);
 	}
 }
 
 /*
  * Rebuild a single data column, tgts[0], from the Q parity column.
  * The data columns are first accumulated into the target buffer as if
  * generating Q with the target column contributing nothing; the result
  * is then combined with the real Q and passed through
  * vdev_raidz_exp2() by vdev_raidz_reconst_q_post_func().
  */
 static void
 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
 
 	ASSERT(ntgts == 1);
 
 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	abd_t *dst = rr->rr_col[x].rc_abd;
 	uint64_t xsize = rr->rr_col[x].rc_size;
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 		uint64_t size = 0;
 
 		/* The target column itself contributes no data. */
 		if (c != x)
 			size = MIN(xsize, rr->rr_col[c].rc_size);
 
 		if (c != rr->rr_firstdatacol) {
 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
 			(void) abd_iterate_func2(dst, src, 0, 0, size,
 			    vdev_raidz_reconst_q_pre_func, NULL);
 			(void) abd_iterate_func(dst, size, xsize - size,
 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
 			continue;
 		}
 
 		/* First data column: seed the buffer, zero the rest. */
 		abd_copy(dst, src, size);
 		if (xsize > size)
 			abd_zero_off(dst, size, xsize - size);
 	}
 
 	int exp = 255 - (rr->rr_cols - 1 - x);
 	struct reconst_q_struct rq = {
 	    abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd), exp
 	};
 
 	(void) abd_iterate_func(dst, 0, xsize,
 	    vdev_raidz_reconst_q_post_func, &rq);
 }
 
 /*
  * Rebuild two data columns, x = tgts[0] and y = tgts[1], from the P and Q
  * parity columns.  Requires x < y, both within the data columns, and
  * column x at least as large as column y (all asserted).
  *
  * The row's P and Q abds are temporarily swapped for freshly allocated
  * buffers and the sizes of columns x and y are temporarily set to zero
  * while vdev_raidz_generate_parity_pq() runs; the original abds and
  * sizes are restored before returning and the temporary buffers freed.
  */
 static void
 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
 	abd_t *pdata, *qdata;
 	uint64_t xsize, ysize;
 	int x = tgts[0];
 	int y = tgts[1];
 	abd_t *xd, *yd;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rr->rr_firstdatacol);
 	ASSERT(y < rr->rr_cols);
 
 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
 
 	/*
 	 * Move the parity data aside -- we're going to compute parity as
 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 	 * reuse the parity generation mechanism without trashing the actual
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 	xsize = rr->rr_col[x].rc_size;
 	ysize = rr->rr_col[y].rc_size;
 
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 	rr->rr_col[x].rc_size = 0;
 	rr->rr_col[y].rc_size = 0;
 
 	vdev_raidz_generate_parity_pq(rr);
 
 	/* Restore the real column sizes now that Pxy/Qxy are computed. */
 	rr->rr_col[x].rc_size = xsize;
 	rr->rr_col[y].rc_size = ysize;
 
 	/* p/q: saved original parity; pxy/qxy: parity without x and y. */
 	p = abd_to_buf(pdata);
 	q = abd_to_buf(qdata);
 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	xd = rr->rr_col[x].rc_abd;
 	yd = rr->rr_col[y].rc_abd;
 
 	/*
 	 * We now have:
 	 *	Pxy = P + D_x + D_y
 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 	 *
 	 * We can then solve for D_x:
 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
 	 * where
 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
 	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 	 *
 	 * With D_x in hand, we can easily solve for D_y:
 	 *	D_y = P + Pxy + D_x
 	 *
 	 * Below, `a` is 2^(x - y), `b` is 2^-(ndevs - 1 - x) (negative
 	 * exponents are expressed by indexing vdev_raidz_pow2[] at 255
 	 * minus the exponent), and `tmp` is the log of (a + 1)^-1, so that
 	 * aexp and bexp end up as the logs of A and B.
 	 */
 
 	a = vdev_raidz_pow2[255 + x - y];
 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
 	tmp = 255 - vdev_raidz_log2[a ^ 1];
 
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
 	ASSERT3U(xsize, >=, ysize);
 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
 
 	/*
 	 * Solve for both columns over the first ysize bytes, then for x
 	 * alone over any remaining bytes; rpq's cursors carry over from
 	 * the first call to the second.
 	 */
 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
 	    vdev_raidz_reconst_pq_func, &rpq);
 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
 	    vdev_raidz_reconst_pq_tail_func, &rpq);
 
 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 
 	/*
 	 * Restore the saved parity data.
 	 */
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 }
 
 /*
  * In the general case of reconstruction, we must solve the system of linear
  * equations defined by the coefficients used to generate parity as well as
  * the contents of the data and parity disks. This can be expressed with
  * vectors for the original data (D) and the actual data (d) and parity (p)
  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  *
  *            __   __                     __     __
  *            |     |         __     __   |  p_0  |
  *            |  V  |         |  D_0  |   | p_m-1 |
  *            |     |    x    |   :   | = |  d_0  |
  *            |  I  |         | D_n-1 |   |   :   |
  *            |     |         ~~     ~~   | d_n-1 |
  *            ~~   ~~                     ~~     ~~
  *
  * I is simply a square identity matrix of size n, and V is a Vandermonde
  * matrix defined by the coefficients we chose for the various parity columns
  * (1, 2, 4). Note that these values were chosen for simplicity and speedy
  * computation as well as linear separability.
  *
  *      __               __               __     __
  *      |   1   ..  1 1 1 |               |  p_0  |
  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  *      |   :       : : : |   |   :   |   |  d_2  |
  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  *      |   0   ..  0 0 1 |               | d_n-1 |
  *      ~~               ~~               ~~     ~~
  *
  * Note that I, V, d, and p are known. To compute D, we must invert the
  * matrix and use the known data and parity values to reconstruct the unknown
  * data values. We begin by removing the rows in V|I and d|p that correspond
  * to failed or missing columns; we then make V|I square (n x n) and d|p
  * sized n by removing rows corresponding to unused parity from the bottom up
  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  *           |  19 205 116  29  64  16  4   1  |      / /
  *           |  1   0   0   0   0   0   0   0  |     / /
  *           |  0   1   0   0   0   0   0   0  | <--' /
  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *
  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  * matrix is not singular.
  * __                                                                 __
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  *                   __                               __
  *                   |  0   0   1   0   0   0   0   0  |
  *                   | 167 100  5   41 159 169 217 208 |
  *                   | 166 100  4   40 158 168 216 209 |
  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  *                   |  0   0   0   0   1   0   0   0  |
  *                   |  0   0   0   0   0   1   0   0  |
  *                   |  0   0   0   0   0   0   1   0  |
  *                   |  0   0   0   0   0   0   0   1  |
  *                   ~~                               ~~
  *
  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  * of the missing data.
  *
  * As is apparent from the example above, the only non-trivial rows in the
  * inverse matrix correspond to the data disks that we're trying to
  * reconstruct. Indeed, those are the only rows we need as the others would
  * only be useful for reconstructing data known or assumed to be valid. For
  * that reason, we only build the coefficients in the rows that correspond to
  * targeted columns.
  */
 
 /*
  * Fill in the rows of the dispersal matrix that are of interest.  For
  * each i in [0, nmap), map[i] (0, 1 or 2) selects a parity row, and
  * rows[i][j] is set to vdev_raidz_pow2[] indexed by
  * map[i] * (n - 1 - j) reduced modulo 255, for j in [0, n).  n must
  * equal the number of data columns in the row.
  */
 static void
 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
     uint8_t **rows)
 {
 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
 
 	for (int i = 0; i < nmap; i++) {
 		ASSERT3S(0, <=, map[i]);
 		ASSERT3S(map[i], <=, 2);
 
 		/* Start one step above the exponent of the first entry. */
 		int pow = map[i] * n;
 
 		if (pow > 255)
 			pow -= 255;
 		ASSERT(pow <= 255);
 
 		/* Walk the exponent down by map[i] per column, mod 255. */
 		for (int j = 0; j < n; j++) {
 			pow -= map[i];
 			pow += (pow < 0) ? 255 : 0;
 			rows[i][j] = vdev_raidz_pow2[pow];
 		}
 	}
 }
 
 /*
  * Compute, by Gauss-Jordan elimination, the rows of the inverse matrix
  * needed to rebuild the missing data columns.
  *
  *	n		number of entries in each row / in used[]
  *	nmissing	number of rows in rows[] and invrows[]
  *	missing		for each row, the index (relative to the first data
  *			column) of the missing data column it will solve for
  *	rows		coefficient rows (modified in place; reduced to
  *			rows of an identity matrix on return)
  *	invrows		output: the corresponding rows of the inverse
  *	used		columns that supply data: the first nmissing are
  *			parity columns, the rest are data columns
  */
 static void
 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, ii, jj;
 	uint8_t log;
 
 	/*
 	 * Assert that the first nmissing entries from the array of used
 	 * columns correspond to parity columns and that subsequent entries
 	 * correspond to data columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
 	}
 	for (; i < n; i++) {
 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
 	}
 
 	/*
 	 * First initialize the storage where we'll compute the inverse rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			invrows[i][j] = (i == j) ? 1 : 0;
 		}
 	}
 
 	/*
 	 * Subtract all trivial rows from the rows of consequence.
 	 * For every surviving data column, its coefficient is moved from
 	 * rows[] into the matching slot of invrows[] and cleared in rows[].
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = nmissing; j < n; j++) {
 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
 			jj = used[j] - rr->rr_firstdatacol;
 			ASSERT3S(jj, <, n);
 			invrows[i][j] = rows[i][jj];
 			rows[i][jj] = 0;
 		}
 	}
 
 	/*
 	 * For each of the rows of interest, we must normalize it and subtract
 	 * a multiple of it from the other rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < missing[i]; j++) {
 			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
 		/*
 		 * Compute the inverse of the first element and multiply each
 		 * element in the row by that value.
 		 */
 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
 		for (j = 0; j < n; j++) {
 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 		}
 
 		/*
 		 * Eliminate column missing[i] from every other row by adding
 		 * (XORing) the appropriate multiple of the normalized row i.
 		 */
 		for (ii = 0; ii < nmissing; ii++) {
 			if (i == ii)
 				continue;
 
 			ASSERT3U(rows[ii][missing[i]], !=, 0);
 
 			log = vdev_raidz_log2[rows[ii][missing[i]]];
 
 			for (j = 0; j < n; j++) {
 				rows[ii][j] ^=
 				    vdev_raidz_exp2(rows[i][j], log);
 				invrows[ii][j] ^=
 				    vdev_raidz_exp2(invrows[i][j], log);
 			}
 		}
 	}
 
 	/*
 	 * Verify that the data that is left in the rows are properly part of
 	 * an identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
 				ASSERT0(rows[i][j]);
 			}
 		}
 	}
 }
 
 /*
  * Rebuild the missing data columns by multiplying the inverse rows with
  * the contents of the used columns.
  *
  *	n		number of used columns (entries in used[] and in
  *			each row of invrows[])
  *	nmissing	number of columns being rebuilt
  *	missing		indices of the columns being rebuilt, relative to
  *			the first data column
  *	invrows		inverse-matrix rows from vdev_raidz_matrix_invert()
  *	used		indices (within rr_col[]) of the source columns
  *
  * The multiplication is done byte by byte using the vdev_raidz_log2[] /
  * vdev_raidz_pow2[] tables: logs are added (wrapping at 255) and the sum
  * is looked up in vdev_raidz_pow2[].
  */
 static void
 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
     int *missing, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, x, cc, c;
 	uint8_t *src;
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
 	uint8_t log = 0;
 	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 
 	/*
 	 * One allocation holds all nmissing rows of n log values; invlog[i]
 	 * points at the start of row i within it.
 	 */
 	psize = sizeof (invlog[0][0]) * n * nmissing;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing; i++) {
 		invlog[i] = pp;
 		pp += n;
 	}
 
 	/* Precompute the log of every inverse coefficient. */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			ASSERT3U(invrows[i][j], !=, 0);
 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 		}
 	}
 
 	/* Accumulate the contribution of each used (source) column. */
 	for (i = 0; i < n; i++) {
 		c = used[i];
 		ASSERT3U(c, <, rr->rr_cols);
 
 		ccount = rr->rr_col[c].rc_size;
 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
 		if (ccount == 0)
 			continue;
 		src = abd_to_buf(rr->rr_col[c].rc_abd);
 		/* Look up the buffer and size of every column being rebuilt. */
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rr->rr_firstdatacol;
 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
 			ASSERT3U(cc, <, rr->rr_cols);
 			ASSERT3U(cc, !=, c);
 
 			dcount[j] = rr->rr_col[cc].rc_size;
 			if (dcount[j] != 0)
 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
 		}
 
 		for (x = 0; x < ccount; x++, src++) {
 			/* log is only meaningful (and only used) if *src != 0 */
 			if (*src != 0)
 				log = vdev_raidz_log2[*src];
 
 			for (cc = 0; cc < nmissing; cc++) {
 				/* Skip bytes past the end of a short target. */
 				if (x >= dcount[cc])
 					continue;
 
 				if (*src == 0) {
 					val = 0;
 				} else {
 					if ((ll = log + invlog[cc][i]) >= 255)
 						ll -= 255;
 					val = vdev_raidz_pow2[ll];
 				}
 
 				/*
 				 * The first source column initializes the
 				 * target; later ones are XORed in.
 				 */
 				if (i == 0)
 					dst[cc][x] = val;
 				else
 					dst[cc][x] ^= val;
 			}
 		}
 	}
 
 	kmem_free(p, psize);
 }
 
 /*
  * Reconstruct the missing data columns of a row with the general
  * matrix-based method: build the rows of the matrix for the parity columns
  * being used, invert it, and apply the inverse to the surviving columns.
  *
  * rr    - row to reconstruct
  * tgts  - column indices of the targets (parity and/or data)
  * ntgts - number of entries in tgts
  *
  * Only targets at or beyond rr_firstdatacol are rebuilt; targeted parity
  * columns are just excluded from the parity columns used as sources.
  * NOTE(review): the parity-selection loop walks tgts[] in step with an
  * ascending column counter, so it appears to assume tgts[] is sorted in
  * ascending order -- confirm against callers.
  */
 static void
 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int i, c, t, tt;
 	unsigned int n;
 	unsigned int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
 	/* Non-NULL only while temporary linear ABDs are swapped in. */
 	abd_t **bufs = NULL;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
 	/*
 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
 	 * temporary linear ABDs if any non-linear ABDs are found.
 	 */
 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
 		ASSERT(rr->rr_col[i].rc_abd != NULL);
 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
 			    KM_PUSHPAGE);
 
 			/*
 			 * One non-linear column is enough to swap every data
 			 * column; the originals are remembered in bufs[].
 			 */
 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 				raidz_col_t *col = &rr->rr_col[c];
 
 				bufs[c] = col->rc_abd;
 				if (bufs[c] != NULL) {
 					col->rc_abd = abd_alloc_linear(
 					    col->rc_size, B_TRUE);
 					abd_copy(col->rc_abd, bufs[c],
 					    col->rc_size);
 				}
 			}
 
 			break;
 		}
 	}
 
 	/* n is the number of data columns in the row. */
 	n = rr->rr_cols - rr->rr_firstdatacol;
 
 	/*
 	 * Figure out which data columns are missing.
 	 */
 	nmissing_rows = 0;
 	for (t = 0; t < ntgts; t++) {
 		if (tgts[t] >= rr->rr_firstdatacol) {
 			missing_rows[nmissing_rows++] =
 			    tgts[t] - rr->rr_firstdatacol;
 		}
 	}
 
 	/*
 	 * Figure out which parity columns to use to help generate the missing
 	 * data columns.
 	 */
 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 		ASSERT(tt < ntgts);
 		ASSERT(c < rr->rr_firstdatacol);
 
 		/*
 		 * Skip any targeted parity columns.
 		 */
 		if (c == tgts[tt]) {
 			tt++;
 			continue;
 		}
 
 		parity_map[i] = c;
 		i++;
 	}
 
 	/*
 	 * One allocation backs rows[], invrows[] (nmissing_rows x n bytes
 	 * each) and used[] (n bytes), carved up below.
 	 */
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing_rows; i++) {
 		rows[i] = pp;
 		pp += n;
 		invrows[i] = pp;
 		pp += n;
 	}
 	used = pp;
 
 	/* used[] starts with the chosen parity columns ... */
 	for (i = 0; i < nmissing_rows; i++) {
 		used[i] = parity_map[i];
 	}
 
 	/* ... followed by every data column that is not missing. */
 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (tt < nmissing_rows &&
 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
 			tt++;
 			continue;
 		}
 
 		ASSERT3S(i, <, n);
 		used[i] = c;
 		i++;
 	}
 
 	/*
 	 * Initialize the interesting rows of the matrix.
 	 */
 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
 
 	/*
 	 * Invert the matrix.
 	 */
 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
 	    invrows, used);
 
 	/*
 	 * Reconstruct the missing data using the generated matrix.
 	 */
 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
 	    invrows, used);
 
 	kmem_free(p, psize);
 
 	/*
 	 * copy back from temporary linear abds and free them
 	 */
 	if (bufs) {
 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 			raidz_col_t *col = &rr->rr_col[c];
 
 			if (bufs[c] != NULL) {
 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
 				abd_free(col->rc_abd);
 			}
 			/* Put the caller's original ABD pointer back. */
 			col->rc_abd = bufs[c];
 		}
 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
 	}
 }
 
 /*
  * Reconstruct the bad columns of a single row.
  *
  * The targets are the columns named in t[] (nt entries, consumed in step
  * with the ascending column scan) plus every column that already has an
  * error recorded.  vdev_raidz_math_reconstruct() gets the first try; only
  * when it answers RAIDZ_ORIGINAL_IMPL do we fall back to the P, Q or PQ
  * routines (if the parity they need is valid) or, failing that, to the
  * general matrix method.
  */
 static void
 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
     const int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
 	int ntgts = 0;
 	/* Start pessimistic: everything is bad until shown otherwise. */
 	int nbadparity = rr->rr_firstdatacol;
 	int nbaddata = rr->rr_cols - nbadparity;
 	int ret;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
 		    (int)rr->rr_missingparity);
 	}
 
 	for (int c = 0, next = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		boolean_t is_parity = (c < rr->rr_firstdatacol);
 
 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
 			    "offset=%llx error=%u)",
 			    rr, c, (int)rc->rc_devidx,
 			    (long long)rc->rc_offset,
 			    (int)rc->rc_error);
 		}
 		if (is_parity)
 			parity_valid[c] = B_FALSE;
 
 		if (next < nt && c == t[next]) {
 			/* Explicitly requested target. */
 			tgts[ntgts++] = c;
 			next++;
 		} else if (rc->rc_error != 0) {
 			/* Column already known to be bad. */
 			tgts[ntgts++] = c;
 		} else if (!is_parity) {
 			nbaddata--;
 		} else {
 			parity_valid[c] = B_TRUE;
 			nbadparity--;
 		}
 	}
 
 	ASSERT(ntgts >= nt);
 	ASSERT(nbaddata >= 0);
 	ASSERT(nbaddata + nbadparity == ntgts);
 
 	/* The data targets follow the parity targets in tgts[]. */
 	dt = &tgts[nbadparity];
 
 	/* Reconstruct using the new math implementation */
 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
 	if (ret != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	/*
 	 * Try the specialized routines before resorting to the general one.
 	 */
 	if (nbaddata == 1) {
 		if (parity_valid[VDEV_RAIDZ_P]) {
 			vdev_raidz_reconstruct_p(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_q(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 	} else if (nbaddata == 2) {
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_P] &&
 		    parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_pq(rr, dt, 2);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 	}
 
 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
 }
 
 /*
  * Open a RAIDZ vdev: validate the parity configuration, open all children,
  * and derive the size and ashift outputs from the children that opened.
  *
  * Returns EINVAL (with vs_aux = VDEV_AUX_BAD_LABEL) when the parity level is
  * out of range or there are too few children; the last child open error
  * (with vs_aux = VDEV_AUX_NO_REPLICAS) when more children failed to open
  * than there are parity devices; 0 otherwise.
  */
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t nparity = vdrz->vd_nparity;
 	int numerrors = 0;
 	int lasterror = 0;
 	int c;
 
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	/*
 	 * First pass: count failures and fold the sizes and logical ashift
 	 * of the children that did open.
 	 */
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error == 0) {
 			*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 			*max_asize =
 			    MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 			*logical_ashift =
 			    MAX(*logical_ashift, cvd->vdev_ashift);
 		} else {
 			lasterror = cvd->vdev_open_error;
 			numerrors++;
 		}
 	}
 
 	/*
 	 * Second pass: the physical ashift is chosen against the final
 	 * logical ashift, so it cannot be merged into the loop above.
 	 */
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error == 0) {
 			*physical_ashift = vdev_best_ashift(*logical_ashift,
 			    *physical_ashift, cvd->vdev_physical_ashift);
 		}
 	}
 
 	/*
 	 * Scale the per-child size by the number of contributing children.
 	 * NOTE(review): while vdev_rz_expanding is set one child is left
 	 * out of the count -- presumably the device being attached; confirm.
 	 */
 	uint64_t width = vd->vdev_children;
 	if (vd->vdev_rz_expanding)
 		width--;
 
 	*asize *= width;
 	*max_asize *= width;
 
 	if (vd->vdev_rz_expanding)
 		vd->vdev_min_asize = *asize;
 
 	if (numerrors > nparity) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
 	}
 
 	return (0);
 }
 
 /*
  * Close every child vdev that is present.
  */
 static void
 vdev_raidz_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd == NULL)
 			continue;
 		vdev_close(cvd);
 	}
 }
 
 /*
  * Return the logical width that applies to an allocation made in the given
  * txg.  BP_GET_BIRTH() is usually the txg in which the BP was allocated.
  * Remapped BP's (relocated by device removal, see remap_blkptr_cb()) carry a
  * more recent physical birth reflecting the relocation, but those can be
  * ignored here because they can't be on RAIDZ (device removal doesn't
  * support RAIDZ).
  *
  * The width comes from the vd_expand_txgs entry matching txg exactly, or
  * else from the nearest entry before it; if there is no such entry the
  * original width is used.  The lookup runs under vd_expand_lock.
  */
 static uint64_t
 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
 {
 	reflow_node_t lookup = {
 		.re_txg = txg,
 	};
 	avl_index_t where;
 	uint64_t width;
 
 	mutex_enter(&vdrz->vd_expand_lock);
 
 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
 	if (re == NULL)
 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
 
 	width = (re != NULL) ?
 	    re->re_logical_width : vdrz->vd_original_width;
 
 	mutex_exit(&vdrz->vd_expand_lock);
 
 	return (width);
 }
 
 /*
  * Convert a physical size into the space allocated for it on this RAIDZ
  * vdev, using the logical width in effect at the given txg.
  *
  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
  * more space due to the lower data-to-parity ratio, so passing the correct
  * txg matters.  vdev_gang_header_asize() relies on a constant asize for
  * psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE regardless of txg; that holds
  * because a single data sector always allocates P+1 sectors whatever the
  * width ("cols", which is at least P+1).
  */
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t nparity = vdrz->vd_nparity;
 	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
 	uint64_t data_cols = cols - nparity;
 
 	/*
 	 * Data sectors, plus nparity sectors for every (possibly partial)
 	 * stripe of data_cols sectors, rounded up to a multiple of
 	 * nparity + 1 and converted back to bytes.
 	 */
 	uint64_t asize = ((psize - 1) >> ashift) + 1;
 	asize += nparity * ((asize + data_cols - 1) / data_cols);
 	asize = roundup(asize, nparity + 1) << ashift;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * The same computation at the current physical width must never
 	 * need more space than the one at the logical width.
 	 */
 	uint64_t ncols_new = vdrz->vd_physical_width;
 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
 	    (ncols_new - nparity));
 	asize_new = roundup(asize_new, nparity + 1) << ashift;
 	VERIFY3U(asize_new, <=, asize);
 #endif
 
 	return (asize);
 }
 
 /*
  * A raidz vdev can allocate N * sizeof(smallest child), so each of the N
  * children has to supply at least 1/Nth of the vdev's minimum asize
  * (rounded up).
  */
 static uint64_t
 vdev_raidz_min_asize(vdev_t *vd)
 {
 	uint64_t nchildren = vd->vdev_children;
 
 	return ((vd->vdev_min_asize + nchildren - 1) / nchildren);
 }
 
 /*
  * Completion callback for a column's child zio (the column is passed in
  * io_private): record the I/O result on the column and mark it as tried
  * and not skipped.
  */
 void
 vdev_raidz_child_done(zio_t *zio)
 {
 	raidz_col_t *rc = zio->io_private;
 
 	ASSERT3P(rc->rc_abd, !=, NULL);
 
 	rc->rc_skipped = 0;
 	rc->rc_tried = 1;
 	rc->rc_error = zio->io_error;
 }
 
 /*
  * Completion callback for a write to a column's shadow location: only the
  * shadow error of the column (passed in io_private) is updated.
  */
 static void
 vdev_raidz_shadow_child_done(zio_t *zio)
 {
 	raidz_col_t *col = zio->io_private;
 
 	col->rc_shadow_error = zio->io_error;
 }
 
 /*
  * Debug-only check that the given column's offset and size agree with what
  * vdev_xlate() reports for the row's logical range on that column's child.
  * Compiles to nothing unless ZFS_DEBUG is defined.
  */
 static void
 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 {
 	(void) rm;
 #ifdef ZFS_DEBUG
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
 	range_seg64_t logical_rs, physical_rs, remain_rs;
 
 	/* The logical range is the whole allocation backing this row. */
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = rr->rr_offset +
 	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
 	    BP_GET_BIRTH(zio->io_bp));
 
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 
 	/*
 	 * In the middle of an expansion the physical->logical mapping is
 	 * changing, so vdev_xlate() can't give a reliable answer; there is
 	 * nothing to check in that case.
 	 */
 	if (vdev_xlate_is_empty(&physical_rs))
 		return;
 
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 
 	/*
 	 * rs_end normally equals rc_offset + rc_size, but an optional I/O
 	 * at the end is not accounted for in rc_size; in that case the
 	 * range may extend by exactly one sector.
 	 */
 	if (physical_rs.rs_end <= rc->rc_offset + rc->rc_size) {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
 	} else {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
 	}
 #endif
 }
 
 /*
  * Issue the child writes for one row: generate the row's parity, then
  * write every non-empty column to its child vdev.  A column that also has
  * a shadow location (rc_shadow_devidx != INT_MAX) gets a second write of
  * the same buffer to that location.
  */
 static void
 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	vdev_t *vd = zio->io_vd;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/* Verify physical to logical translation */
 		vdev_raidz_io_verify(zio, rm, rr, c);
 
 		/* Empty columns have nothing to write. */
 		if (rc->rc_size == 0)
 			continue;
 
 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 		ASSERT3P(rc->rc_abd, !=, NULL);
 
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 		    rc->rc_offset, rc->rc_abd,
 		    abd_get_size(rc->rc_abd), zio->io_type,
 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
 
 		/* No shadow location: done with this column. */
 		if (rc->rc_shadow_devidx == INT_MAX)
 			continue;
 
 		vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
 
 		ASSERT3U(
 		    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
 		    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
 
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
 		    rc->rc_shadow_offset, rc->rc_abd,
 		    abd_get_size(rc->rc_abd),
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_shadow_child_done, rc));
 	}
 }
 
 /*
  * Issue optional, data-less I/Os covering the skip sectors so that
  * aggregation sees contiguous ranges.  Only valid for maps built by
  * vdev_raidz_map_alloc() (not _expanded()), which have exactly one row.
  */
 static void
 raidz_start_skip_writes(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	raidz_map_t *rm = zio->io_vsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 
 	ASSERT3U(rm->rm_nrows, ==, 1);
 	raidz_row_t *rr = rm->rm_row[0];
 
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/* Only zero-sized columns are skip sectors. */
 		if (rc->rc_size == 0) {
 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 			ASSERT3P(rc->rc_abd, ==, NULL);
 
 			ASSERT3U(rc->rc_offset, <,
 			    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 			/* One sector, no payload, droppable by the pipeline. */
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, NULL, 1ULL << ashift,
 			    zio->io_type, zio->io_priority,
 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 		}
 	}
 }
 
 /*
  * Issue the child reads for one row.  Columns whose child is unreadable or
  * is missing data for this txg (per its DTL) are marked with an error and
  * counted as missing instead of being read.  Data columns are always read;
  * parity columns are read only when forceparity is set, when data is
  * already known to be missing, or for scrub/resilver I/O.
  */
 static void
 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/*
 	 * Walk the columns back to front so that parity is visited last:
 	 * any data column found missing on the way forces the parity reads.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0)
 			continue;
 
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		boolean_t is_data = (c >= rr->rr_firstdatacol);
 
 		if (!vdev_readable(cvd)) {
 			if (is_data)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;	/* don't even try */
 			rc->rc_skipped = 1;
 		} else if (vdev_dtl_contains(cvd, DTL_MISSING,
 		    zio->io_txg, 1)) {
 			if (is_data)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 		} else if (forceparity || is_data ||
 		    rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 }
 
 /*
  * Issue one child read per non-empty physical column of the map.  A column
  * whose child is unreadable, or is missing data for this txg per its DTL,
  * is marked with an error and skipped instead.
  */
 static void
 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
 {
 	vdev_t *vd = zio->io_vd;
 
 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
 		raidz_col_t *prc = &rm->rm_phys_col[i];
 
 		if (prc->rc_size == 0)
 			continue;
 
 		/* Physical column i always lives on child i. */
 		ASSERT3U(prc->rc_devidx, ==, i);
 		vdev_t *cvd = vd->vdev_child[i];
 
 		if (!vdev_readable(cvd)) {
 			prc->rc_error = SET_ERROR(ENXIO);
 			prc->rc_tried = 1;	/* don't even try */
 			prc->rc_skipped = 1;
 		} else if (vdev_dtl_contains(cvd, DTL_MISSING,
 		    zio->io_txg, 1)) {
 			prc->rc_error = SET_ERROR(ESTALE);
 			prc->rc_skipped = 1;
 		} else {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    prc->rc_offset, prc->rc_abd, prc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, prc));
 		}
 	}
 }
 
 /*
  * Start the child reads for a map: through the physical columns when the
  * map has them, otherwise row by row.
  */
 static void
 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
 {
 	/*
 	 * With more than one row every disk gets hit anyway, so read the
 	 * parity too and keep the reads in decent size chunks.
 	 */
 	boolean_t forceparity = rm->rm_nrows > 1;
 
 	if (rm->rm_phys_col) {
 		vdev_raidz_io_start_read_phys_cols(zio, rm);
 		return;
 	}
 
 	for (int r = 0; r < rm->rm_nrows; r++)
 		vdev_raidz_io_start_read_row(zio, rm->rm_row[r], forceparity);
 }
 
 /*
  * Start an IO operation on a RAIDZ VDev
  *
  * Outline:
  * - Build the raidz map for the zio.  If the logical width in effect when
  *   the BP was born differs from the current physical width, the expanded
  *   map layout is used (and, while the expansion is scanning, a reader
  *   range lock is taken over the I/O's range and stored in the map).
  * - For write operations:
  *   1. Generate the parity data
  *   2. Create child zio write operations to each column's vdev, for both
  *      data and parity.
  *   3. If the column skips any sectors for padding, create optional dummy
  *      write zio children for those areas to improve aggregation continuity.
  * - For read operations:
  *   1. Create child zio read operations to each data column's vdev to read
  *      the range of data required for zio.
  *   2. If this is a scrub or resilver operation, or if any of the data
  *      vdevs have had errors, then create zio read operations to the parity
  *      columns' VDevs as well.
  */
 static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	raidz_map_t *rm;
 
 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
 	    BP_GET_BIRTH(zio->io_bp));
 	if (logical_width != vdrz->vd_physical_width) {
 		/*
 		 * Defaults used when no expansion is currently scanning:
 		 * no range lock, both offsets at UINT64_MAX, no scratch.
 		 */
 		zfs_locked_range_t *lr = NULL;
 		uint64_t synced_offset = UINT64_MAX;
 		uint64_t next_offset = UINT64_MAX;
 		boolean_t use_scratch = B_FALSE;
 		/*
 		 * Note: when the expansion is completing, we set
 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
 		 * in a later txg than when we last update spa_ubsync's state
 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
 		 * may see vre_state!=SCANNING before
 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
 		 * on disk, but the copying progress has been synced to disk
 		 * (and reflected in spa_ubsync).  In this case it's fine to
 		 * treat the expansion as completed, since if we crash there's
 		 * no additional copying to do.
 		 */
 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
 			    &vdrz->vn_vre);
 			/*
 			 * The lock is handed to the map (rm_lr) below; it is
 			 * not released in this function.
 			 */
 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
 			    zio->io_offset, zio->io_size, RL_READER);
 			use_scratch =
 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
 			    RRSS_SCRATCH_VALID);
 			synced_offset =
 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
 			next_offset = vdrz->vn_vre.vre_offset;
 			/*
 			 * If we haven't resumed expanding since importing the
 			 * pool, vre_offset won't have been set yet.  In
 			 * this case the next offset to be copied is the same
 			 * as what was synced.
 			 */
 			if (next_offset == UINT64_MAX) {
 				next_offset = synced_offset;
 			}
 		}
 		if (use_scratch) {
 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
 			    "%lld next_offset=%lld use_scratch=%u",
 			    zio,
 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
 			    (long long)zio->io_offset,
 			    (long long)synced_offset,
 			    (long long)next_offset,
 			    use_scratch);
 		}
 
 		rm = vdev_raidz_map_alloc_expanded(zio,
 		    tvd->vdev_ashift, vdrz->vd_physical_width,
 		    logical_width, vdrz->vd_nparity,
 		    synced_offset, next_offset, use_scratch);
 		rm->rm_lr = lr;
 	} else {
 		rm = vdev_raidz_map_alloc(zio,
 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
 	}
 	rm->rm_original_width = vdrz->vd_original_width;
 
 	/* Attach the map to the zio before issuing any child I/O. */
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
 		}
 
 		/* Skip writes only apply to maps that are not expanded. */
 		if (logical_width == vdrz->vd_physical_width) {
 			raidz_start_skip_writes(zio);
 		}
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		vdev_raidz_io_start_read(zio, rm);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Report a checksum error against the child vdev backing the given column
  * of a RAID-Z device: bump the child's checksum error count and post a
  * checksum ereport.  Nothing is reported for speculative I/O or for I/O at
  * rebuild priority.
  */
 void
 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 {
 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
 	    zio->io_priority == ZIO_PRIORITY_REBUILD)
 		return;
 
 	raidz_map_t *rm = zio->io_vsd;
 	zio_bad_cksum_t zbc;
 
 	/* Carry over whether any checksum error on this map was injected. */
 	zbc.zbc_injected = rm->rm_ecksuminjected;
 	zbc.zbc_has_cksum = 0;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	(void) zfs_ereport_post_checksum(zio->io_spa, vd,
 	    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
 	    rc->rc_abd, bad_data, &zbc);
 }
 
 /*
  * Verify the zio's checksum and return the result of zio_checksum_error().
  * If the failure was injected, that is remembered on the map so that any
  * ereports we generate can note it.
  *
  * A checksum failure on a Direct I/O read is handled here and reported as
  * 0 to the caller; see below.
  */
 static int
 raidz_checksum_verify(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	zio_bad_cksum_t zbc = {0};
 	int ret = zio_checksum_error(zio, &zbc);
 
 	boolean_t dio_cksum_failed =
 	    (zio->io_flags & ZIO_FLAG_DIO_READ) && ret == ECKSUM;
 
 	if (!dio_cksum_failed) {
 		if (ret != 0 && zbc.zbc_injected != 0)
 			rm->rm_ecksuminjected = 1;
 		return (ret);
 	}
 
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious, as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place.  The checksum verify
 	 * error is reported to the top-level RAIDZ VDEV.
 	 */
 	zio->io_error = ret;
 	zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 	zio_dio_chksum_verify_error_report(zio);
 	zio_checksum_verified(zio);
 
 	return (0);
 }
 
 /*
  * Regenerate the row's parity from its data columns and compare it with
  * the parity that was actually read.  Only parity columns that were tried
  * and read without error are compared; each mismatch fires a checksum
  * error, marks the column with ECKSUM and is counted.  Returns the number
  * of such failures (plus whatever vdev_draid_map_verify_empty() reports
  * for empty sectors).  Nothing is checked for ZIO_CHECKSUM_NOPARITY.
  */
 static int
 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
 {
 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
 	raidz_map_t *rm = zio->io_vsd;
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 	int nfailed = 0;
 
 	if (bp == NULL)
 		checksum = zio->io_prop.zp_checksum;
 	else if (BP_IS_GANG(bp))
 		checksum = ZIO_CHECKSUM_GANG_HEADER;
 	else
 		checksum = BP_GET_CHECKSUM(bp);
 
 	if (checksum == ZIO_CHECKSUM_NOPARITY)
 		return (nfailed);
 
 	/*
 	 * Set aside the parity we read and give each such column a fresh
 	 * buffer for the regenerated parity to land in.
 	 */
 	for (int c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_tried && rc->rc_error == 0) {
 			orig[c] = rc->rc_abd;
 			ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 		}
 	}
 
 	/*
 	 * Verify any empty sectors are zero filled to ensure the parity
 	 * is calculated correctly even if these non-data sectors are damaged.
 	 */
 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
 		nfailed += vdev_draid_map_verify_empty(zio, rr);
 
 	/*
 	 * This regenerates parity even for the !tried || rc_error != 0
 	 * columns.  That isn't harmful, but it does have the side effect of
 	 * fixing stuff we didn't realize was necessary (i.e. even if we
 	 * return 0).
 	 */
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		abd_t *read_abd = orig[c];
 
 		if (abd_cmp(read_abd, rc->rc_abd) != 0) {
 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
 			vdev_raidz_checksum_error(zio, rc, read_abd);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			nfailed++;
 		}
 		/* The column keeps the regenerated buffer. */
 		abd_free(read_abd);
 	}
 
 	return (nfailed);
 }
 
 /*
  * Fold the error and shadow error of every column in the row through
  * zio_worst_error() and return the result (0 if the row has no columns).
  */
 static int
 vdev_raidz_worst_error(raidz_row_t *rr)
 {
 	int worst = 0;
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		worst = zio_worst_error(worst, rc->rc_error);
 		worst = zio_worst_error(worst, rc->rc_shadow_error);
 	}
 
 	return (worst);
 }
 
 /*
  * Post-processing for one row of a read zio: tally the column errors,
  * verify parity where there is unused parity to check (or always when
  * resilvering), and issue repair writes for damaged columns and for
  * shadow locations.
  *
  * Repair writes are only issued when the zio itself has no error and the
  * pool is writeable.
  */
 static void
 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 {
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			/* Errors on columns we chose to skip are expected. */
 			if (!rc->rc_skipped)
 				unexpected_errors++;
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 
 		/* A forced repair counts as an error needing a rewrite. */
 		if (rc->rc_force_repair)
 			unexpected_errors++;
 	}
 
 	/*
 	 * If we read more parity disks than were used for
 	 * reconstruction, confirm that the other parity disks produced
 	 * correct data.
 	 *
 	 * Note that we also regenerate parity when resilvering so we
 	 * can write it out to failed devices later.
 	 */
 	if (parity_errors + parity_untried <
 	    rr->rr_firstdatacol - data_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 		int n = raidz_parity_verify(zio, rr);
 		unexpected_errors += n;
 	}
 
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 			/*
 			 * Skip columns that may not be repaired, and, unless
 			 * a repair is forced, columns that are healthy or
 			 * empty.
 			 */
 			if (!rc->rc_allow_repair) {
 				continue;
 			} else if (!rc->rc_force_repair &&
 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
 				continue;
 			}
 			/*
 			 * We do not allow self healing for Direct I/O reads.
 			 * See comment in vdev_raid_row_alloc().
 			 */
 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
 
 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
 			    "offset=%llx",
 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
 
 			/*
 			 * Rebuild-priority reads repair at rebuild priority;
 			 * everything else uses async write priority.
 			 * SELF_HEAL is only set when there were unexpected
 			 * errors.
 			 */
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 
 	/*
 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
 	 * good data.  This ensures that if we've already copied this sector,
 	 * it will be corrected if it was damaged.  This writes more than is
 	 * necessary, but since expansion is paused during scrub/resilver, at
 	 * most a single row will have a shadow location.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 
 			/* INT_MAX marks a column without a shadow location. */
 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
 				continue;
 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
 
 			/*
 			 * Note: We don't want to update the repair stats
 			 * because that would incorrectly indicate that there
 			 * was bad data to repair, which we aren't sure about.
 			 * By clearing the SCAN_THREAD flag, we prevent this
 			 * from happening, despite having the REPAIR flag set.
 			 * We need to set SELF_HEAL so that this i/o can't be
 			 * bypassed by zio_vdev_io_start().
 			 */
 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL);
 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
 			zio_nowait(cio);
 		}
 	}
 }
 
 /*
  * For every column in the map flagged rc_need_orig_restore, copy the saved
  * rc_orig_data back into the column's buffer and clear the flag.
  */
 static void
 raidz_restore_orig_data(raidz_map_t *rm)
 {
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *rr = rm->rm_row[r];
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 
 			if (!rc->rc_need_orig_restore)
 				continue;
 
 			abd_copy(rc->rc_abd, rc->rc_orig_data, rc->rc_size);
 			rc->rc_need_orig_restore = B_FALSE;
 		}
 	}
 }
 
 /*
  * During raidz_reconstruct() on an expanded VDEV, failure simulation needs
  * special consideration.  See the note in raidz_reconstruct() on simulating
  * failure of a pre-expansion device.
  *
  * Treating logical child i as failed, return TRUE if the given column
  * should be treated as failed.  Logical children let us imagine that a
  * disk silently failed before a RAIDZ expansion (reads from it succeed but
  * return the wrong data).  Since the expansion doesn't verify checksums,
  * the incorrect data will have been moved to new locations spread among
  * the children (going diagonally across them).
  *
  * Logical child ids are numbered in bands.  The first physical_width ids
  * stand for a current child failing; the next physical_width-1 ids for a
  * child that failed before the most recent expansion; the next
  * physical_width-2 ids for a child that failed in the expansion before
  * that; and so on down to original_width.  Within the band of width w, the
  * column is hit when its sector id modulo w equals the id's position in
  * that band.
  */
 static boolean_t
 raidz_simulate_failure(int physical_width, int original_width, int ashift,
     int i, raidz_col_t *rc)
 {
 	/* Sector number counted across all children, row by row. */
 	uint64_t sector_id =
 	    physical_width * (rc->rc_offset >> ashift) +
 	    rc->rc_devidx;
 
 	for (int w = physical_width; w >= original_width; w--) {
 		if (i >= w) {
 			/* Not in this band; rebase i and try the next. */
 			i -= w;
 			continue;
 		}
 		return (sector_id % w == i);
 	}
 
 	ASSERT(!"invalid logical child id");
 	return (B_FALSE);
 }
 
 /*
  * Attempt one reconstruction of the whole map, pretending that the logical
  * children listed in ltgts[0 .. ntgts-1] have failed, in addition to any
  * columns that already have rc_error set.  A column whose failure is
  * simulated has its current contents saved in rc_orig_data so that they can
  * be put back if the attempt does not pan out.
  *
  * returns EINVAL if reconstruction of the block will not be possible
  * returns ECKSUM if this specific reconstruction failed
  * returns 0 on successful reconstruction
  */
 static int
 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	/* rm_original_width == 0 is treated as "same as the current width" */
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
 
 	/*
 	 * NOTE(review): this prints ltgts[0..2] regardless of ntgts.  The
 	 * caller in this file passes a buffer large enough for those slots,
 	 * but it only initializes entries up to index ntgts, so for small
 	 * ntgts the trailing values printed here may be stale -- confirm
 	 * that this is acceptable for debug output.
 	 */
 	if (dbgmsg) {
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
 		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
 	}
 
 	/* Reconstruct each row */
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *rr = rm->rm_row[r];
 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
 		int t = 0;		/* number of entries in my_tgts */
 		int dead = 0;		/* known + simulated failures */
 		int dead_data = 0;	/* subset of "dead" with c >= nparity */
 
 		if (dbgmsg)
 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			ASSERT0(rc->rc_need_orig_restore);
 			/* Known failure: counted, but not added to my_tgts. */
 			if (rc->rc_error != 0) {
 				dead++;
 				if (c >= nparity)
 					dead_data++;
 				continue;
 			}
 			if (rc->rc_size == 0)
 				continue;
 			/*
 			 * See whether any of the logical targets maps onto
 			 * this column; the first match is enough.
 			 */
 			for (int lt = 0; lt < ntgts; lt++) {
 				if (raidz_simulate_failure(physical_width,
 				    original_width,
 				    zio->io_vd->vdev_top->vdev_ashift,
 				    ltgts[lt], rc)) {
 					/*
 					 * Save the data as read from disk the
 					 * first time this column is targeted.
 					 */
 					if (rc->rc_orig_data == NULL) {
 						rc->rc_orig_data =
 						    abd_alloc_linear(
 						    rc->rc_size, B_TRUE);
 						abd_copy(rc->rc_orig_data,
 						    rc->rc_abd, rc->rc_size);
 					}
 					rc->rc_need_orig_restore = B_TRUE;
 
 					dead++;
 					if (c >= nparity)
 						dead_data++;
 					/*
 					 * Note: simulating failure of a
 					 * pre-expansion device can hit more
 					 * than one column, in which case we
 					 * might try to simulate more failures
 					 * than can be reconstructed, which is
 					 * also more than the size of my_tgts.
 					 * This check prevents accessing past
 					 * the end of my_tgts.  The "dead >
 					 * nparity" check below will fail this
 					 * reconstruction attempt.
 					 */
 					if (t < VDEV_RAIDZ_MAXPARITY) {
 						my_tgts[t++] = c;
 						if (dbgmsg) {
 							zfs_dbgmsg("simulating "
 							    "failure of col %u "
 							    "devidx %u", c,
 							    (int)rc->rc_devidx);
 						}
 					}
 					break;
 				}
 			}
 		}
 		if (dead > nparity) {
 			/* reconstruction not possible */
 			if (dbgmsg) {
 				zfs_dbgmsg("reconstruction not possible; "
 				    "too many failures");
 			}
 			raidz_restore_orig_data(rm);
 			return (EINVAL);
 		}
 		/* Only rebuild when at least one data column is affected. */
 		if (dead_data > 0)
 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
 	}
 
 	/* Check for success */
 	if (raidz_checksum_verify(zio) == 0) {
 		/*
 		 * With ZIO_FLAG_DIO_CHKSUM_ERR set we return success without
 		 * doing any of the error reporting below.
 		 */
 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 			return (0);
 
 		/* Reconstruction succeeded - report errors */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				if (rc->rc_need_orig_restore) {
 					/*
 					 * Note: if this is a parity column,
 					 * we don't really know if it's wrong.
 					 * We need to let
 					 * vdev_raidz_io_done_verified() check
 					 * it, and if we set rc_error, it will
 					 * think that it is a "known" error
 					 * that doesn't need to be checked
 					 * or corrected.
 					 */
 					if (rc->rc_error == 0 &&
 					    c >= rr->rr_firstdatacol) {
 						vdev_raidz_checksum_error(zio,
 						    rc, rc->rc_orig_data);
 						rc->rc_error =
 						    SET_ERROR(ECKSUM);
 					}
 					rc->rc_need_orig_restore = B_FALSE;
 				}
 			}
 
 			vdev_raidz_io_done_verified(zio, rr);
 		}
 
 		zio_checksum_verified(zio);
 
 		if (dbgmsg) {
 			zfs_dbgmsg("reconstruction successful "
 			    "(checksum verified)");
 		}
 		return (0);
 	}
 
 	/* Reconstruction failed - restore original data */
 	raidz_restore_orig_data(rm);
 	if (dbgmsg) {
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
 		    "failed", zio);
 	}
 	return (ECKSUM);
 }
 
 /*
  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
  * Note that the algorithm below is non-optimal because it doesn't take into
  * account how reconstruction is actually performed. For example, with
  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
  * is targeted as invalid as if columns 1 and 4 are targeted since in both
  * cases we'd only use parity information in column 0.
  *
  * The order that we find the various possible combinations of failed
  * disks is dictated by these rules:
  * - Examine each "slot" (the "i" in tgts[i])
  *   - Try to increment this slot (tgts[i] += 1)
  *   - if we can't increment because it runs into the next slot,
  *     reset our slot to the minimum, and examine the next slot
  *
  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
  *  3 columns to reconstruct), we will generate the following sequence:
  *
  *  STATE        ACTION
  *  0 1 2        special case: skip since these are all parity
  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
  *  0   2 3      first slot: increment to 1
  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
  *  0 1     4    first: reset to 0; middle: increment to 2
  *  0   2   4    first: increment to 1
  *    1 2   4    first: reset to 0; middle: increment to 3
  *  0     3 4    first: increment to 1
  *    1   3 4    first: increment to 2
  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
  *  0 1       5  first: reset to 0; middle: increment to 2
  *  0   2     5  first: increment to 1
  *    1 2     5  first: reset to 0; middle: increment to 3
  *  0     3   5  first: increment to 1
  *    1   3   5  first: increment to 2
  *      2 3   5  first: reset to 0; middle: increment to 4
  *  0       4 5  first: increment to 1
  *    1     4 5  first: increment to 2
  *      2   4 5  first: increment to 3
  *        3 4 5  done
  *
  * This strategy works for dRAID but is less efficient when there are a large
  * number of child vdevs and therefore permutations to check. Furthermore,
  * since the raidz_map_t rows likely do not overlap, reconstruction would be
  * possible as long as there are no more than nparity data errors per row.
  * These additional permutations are not currently checked but could be as
  * a future improvement.
  *
  * Returns 0 on success, ECKSUM on failure.
  */
 static int
 vdev_raidz_combrec(zio_t *zio)
 {
 	int nparity = vdev_get_nparity(zio->io_vd);
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	/* rm_original_width == 0 is treated as "same as the current width" */
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 
 	/*
 	 * If any row already has more known errors than parity columns, no
 	 * combination of simulated failures can help; return that row's
 	 * worst error right away.
 	 */
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 		int total_errors = 0;
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			if (rr->rr_col[c].rc_error)
 				total_errors++;
 		}
 
 		if (total_errors > nparity)
 			return (vdev_raidz_worst_error(rr));
 	}
 
 	/* Try 1 simulated failure, then 2, ... up to nparity. */
 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
 		/*
 		 * tstore holds the num_failures targets plus one sentinel
 		 * slot on each side, which is why ltgts points at tstore[1]:
 		 * ltgts[-1] and ltgts[num_failures] are both valid.
 		 */
 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 		int *ltgts = &tstore[1]; /* value is logical child ID */
 
 
 		/*
 		 * Determine number of logical children, n.  See comment
 		 * above raidz_simulate_failure().
 		 */
 		int n = 0;
 		for (int w = physical_width;
 		    w >= original_width; w--) {
 			n += w;
 		}
 
 		ASSERT3U(num_failures, <=, nparity);
 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
 
 		/* Handle corner cases in combrec logic */
 		ltgts[-1] = -1;
 		/* Start from the lowest combination: 0, 1, ..., k-1. */
 		for (int i = 0; i < num_failures; i++) {
 			ltgts[i] = i;
 		}
 		/* Upper sentinel: one past the largest logical child id. */
 		ltgts[num_failures] = n;
 
 		for (;;) {
 			int err = raidz_reconstruct(zio, ltgts, num_failures,
 			    nparity);
 			if (err == EINVAL) {
 				/*
 				 * Reconstruction not possible with this #
 				 * failures; try more failures.
 				 */
 				break;
 			} else if (err == 0)
 				return (0);
 
 			/* Compute next targets to try */
 			for (int t = 0; ; t++) {
 				ASSERT3U(t, <, num_failures);
 				ltgts[t]++;
 				if (ltgts[t] == n) {
 					/* try more failures */
 					ASSERT3U(t, ==, num_failures - 1);
 					if (zfs_flags &
 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 						zfs_dbgmsg("reconstruction "
 						    "failed for num_failures="
 						    "%u; tried all "
 						    "combinations",
 						    num_failures);
 					}
 					break;
 				}
 
 				ASSERT3U(ltgts[t], <, n);
 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
 				 * Try the next combination.
 				 */
 				if (ltgts[t] != ltgts[t + 1])
 					break; // found next combination
 
 				/*
 				 * Otherwise, reset this tgt to the minimum,
 				 * and move on to the next tgt.
 				 */
 				ltgts[t] = ltgts[t - 1] + 1;
 				ASSERT3U(ltgts[t], ==, t);
 			}
 
 			/* Increase the number of failures and keep trying. */
 			if (ltgts[num_failures - 1] == n)
 				break;
 		}
 	}
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruction failed for all num_failures");
 	return (ECKSUM);
 }
 
 /*
  * Reconstruct every row of the map, passing the same target list
  * (t, nt) to vdev_raidz_reconstruct_row() for each of them.
  */
 void
 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
 {
 	uint64_t r = 0;
 
 	while (r < rm->rm_nrows) {
 		vdev_raidz_reconstruct_row(rm, rm->rm_row[r], t, nt);
 		r++;
 	}
 }
 
 /*
  * Complete a write IO operation on a RAIDZ VDev
  *
  * Outline:
  *   1. Check for errors on the child IOs.
  *   2. Return, setting an error code if too few child VDevs were written
  *      to reconstruct the data later.  Note that partial writes are
  *      considered successful if they can be reconstructed at all.
  */
 static void
 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
 {
 	int failed = 0;		/* columns with rc_error set */
 	int failed_shadow = 0;	/* columns with rc_shadow_error set */
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 
 	/* Count the failed child writes, normal and shadow separately. */
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_shadow_error != 0) {
 			ASSERT(rc->rc_shadow_error != ECKSUM);
 			failed_shadow++;
 		}
 		if (rc->rc_error != 0) {
 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
 			failed++;
 		}
 	}
 
 	/*
 	 * A partial write still counts as a success as long as enough
 	 * columns made it to disk for the data to be reconstructed later.
 	 * During raidz expansion a shadow write may be in flight too, and
 	 * after a crash either the normal (old) or the shadow (new) location
 	 * may end up being the "real" copy of the block, so each location
 	 * must be sufficiently redundant on its own.
 	 *
 	 * Now that write reallocation is supported, it would be better to
 	 * treat a partial failure as a real failure unless no non-degraded
 	 * top-level vdevs remain, and to skip the DTL update when we intend
 	 * to reallocate.
 	 */
 	if (failed <= rr->rr_firstdatacol &&
 	    failed_shadow <= rr->rr_firstdatacol)
 		return;
 
 	zio->io_error = zio_worst_error(zio->io_error,
 	    vdev_raidz_worst_error(rr));
 }
 
 /*
  * Rebuild the data columns of one row that are already known to be bad
  * (rc_error set), provided the parity that was read is enough to do so.
  */
 static void
 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
     raidz_row_t *rr)
 {
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int bad_data = 0;
 	int bad_total = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 
 	/* Tally the failed and the not-yet-read columns of this row. */
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 		boolean_t is_parity = (c < rr->rr_firstdatacol);
 
 		/*
 		 * While scrubbing, a replacing/sparing child may have found
 		 * that its own children do not all hold the same copy of the
 		 * data.  Clear that error so the column is handled like any
 		 * other read, and force a repair to fix the damage.
 		 */
 		if (col->rc_error == ECKSUM) {
 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
 			vdev_raidz_checksum_error(zio, col, col->rc_abd);
 			col->rc_force_repair = 1;
 			col->rc_error = 0;
 		}
 
 		if (col->rc_error != 0) {
 			bad_total++;
 			if (is_parity)
 				parity_errors++;
 			else
 				bad_data++;
 		} else if (is_parity && !col->rc_tried) {
 			parity_untried++;
 		}
 	}
 
 	/*
 	 * Only go on when some data column failed and the number of errors
 	 * seen is correctable, i.e. no larger than the number of parity
 	 * columns that were actually read.
 	 */
 	if (bad_data == 0 ||
 	    bad_total > rr->rr_firstdatacol - parity_untried)
 		return;
 
 	/*
 	 * Parity is read either in full or not at all.  Had it not been
 	 * read we could not be in the correctable case, and there must also
 	 * have been fewer parity errors than parity columns.
 	 */
 	ASSERT(parity_untried == 0);
 	ASSERT(parity_errors < rr->rr_firstdatacol);
 
 	/* Collect the data columns that reported an error. */
 	int tgts[VDEV_RAIDZ_MAXPARITY];
 	int n = 0;
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (rr->rr_col[c].rc_error == 0)
 			continue;
 		ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 		tgts[n++] = c;
 	}
 
 	ASSERT(rr->rr_firstdatacol >= n);
 
 	vdev_raidz_reconstruct_row(rm, rr, tgts, n);
 }
 
 /*
  * Return the number of reads issued.
  */
 static int
 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *parent = zio->io_vd;
 	int issued = 0;
 
 	rr->rr_missingparity = 0;
 	rr->rr_missingdata = 0;
 
 	/*
 	 * A row may contain empty sectors that a normal read does not need.
 	 * Allocate an ABD for them now, if that has not been done yet, so
 	 * that they can be read, verified, and repaired as needed.
 	 */
 	if (rr->rr_abd_empty == NULL && rr->rr_nempty != 0)
 		vdev_draid_map_alloc_empty(zio, rr);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 
 		/* Issue a child i/o for each non-empty, untried column. */
 		if (col->rc_size != 0 && !col->rc_tried) {
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    parent->vdev_child[col->rc_devidx],
 			    col->rc_offset, col->rc_abd, col->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, col));
 			issued++;
 		}
 	}
 
 	return (issued);
 }
 
 /*
  * We're here because either there were too many errors to even attempt
  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
  * failed. In either case, there is enough bad data to prevent reconstruction.
  * Start checksum ereports for all children which haven't failed.
  */
 static void
 vdev_raidz_io_done_unrecoverable(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *row = rm->rm_row[r];
 
 		for (int c = 0; c < row->rr_cols; c++) {
 			raidz_col_t *col = &row->rr_col[c];
 			vdev_t *child =
 			    zio->io_vd->vdev_child[col->rc_devidx];
 
 			/*
 			 * Columns that already carry an error are skipped;
 			 * every other child gets a checksum error counted
 			 * against it and a checksum ereport started.
 			 */
 			if (col->rc_error == 0) {
 				zio_bad_cksum_t zbc;
 
 				zbc.zbc_has_cksum = 0;
 				zbc.zbc_injected = rm->rm_ecksuminjected;
 
 				mutex_enter(&child->vdev_stat_lock);
 				child->vdev_stat.vs_checksum_errors++;
 				mutex_exit(&child->vdev_stat_lock);
 
 				(void) zfs_ereport_start_checksum(zio->io_spa,
 				    child, &zio->io_bookmark, zio,
 				    col->rc_offset, col->rc_size, &zbc);
 			}
 		}
 	}
 }
 
 /*
  * Completion handler for a RAIDZ zio.
  *
  * Writes: each row is checked by vdev_raidz_io_done_write_impl().
  *
  * Reads: if this was an aggregated read (rm_phys_col set), status and data
  * are first copied from the per-device columns into the rows.  Columns that
  * are known to be missing are then reconstructed and the checksum is
  * verified.  If verification fails we either reissue reads for columns that
  * have not been tried yet (returning early, to be called again when they
  * complete) or, once everything has been tried, fall back to combinatorial
  * reconstruction.
  *
  * On every path except that early return, the range lock in rm_lr (if any)
  * is dropped before returning.
  */
 void
 vdev_raidz_io_done(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	ASSERT(zio->io_bp != NULL);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
 		}
 	} else {
 		if (rm->rm_phys_col) {
 			/*
 			 * This is an aggregated read.  Copy the data and status
 			 * from the aggregate abd's to the individual rows.
 			 */
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 
 				for (int c = 0; c < rr->rr_cols; c++) {
 					raidz_col_t *rc = &rr->rr_col[c];
 					if (rc->rc_tried || rc->rc_size == 0)
 						continue;
 
 					/* Per-device column backing this one */
 					raidz_col_t *prc =
 					    &rm->rm_phys_col[rc->rc_devidx];
 					rc->rc_error = prc->rc_error;
 					rc->rc_tried = prc->rc_tried;
 					rc->rc_skipped = prc->rc_skipped;
 					/* Data is copied for data columns only */
 					if (c >= rr->rr_firstdatacol) {
 						/*
 						 * Note: this is slightly faster
 						 * than using abd_copy_off().
 						 */
 						char *physbuf = abd_to_buf(
 						    prc->rc_abd);
 						void *physloc = physbuf +
 						    rc->rc_offset -
 						    prc->rc_offset;
 
 						abd_copy_from_buf(rc->rc_abd,
 						    physloc, rc->rc_size);
 					}
 				}
 			}
 		}
 
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_done_reconstruct_known_missing(zio,
 			    rm, rr);
 		}
 
 		if (raidz_checksum_verify(zio) == 0) {
 			/*
 			 * With ZIO_FLAG_DIO_CHKSUM_ERR set, skip the
 			 * per-row verified handling below.
 			 */
 			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 				goto done;
 
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 				vdev_raidz_io_done_verified(zio, rr);
 			}
 			zio_checksum_verified(zio);
 		} else {
 			/*
 			 * A sequential resilver has no checksum which makes
 			 * combinatorial reconstruction impossible. This code
 			 * path is unreachable since raidz_checksum_verify()
 			 * has no checksum to verify and must succeed.
 			 */
 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
 
 			/*
 			 * This isn't a typical situation -- either we got a
 			 * read error or a child silently returned bad data.
 			 * Read every block so we can try again with as much
 			 * data and parity as we can track down. If we've
 			 * already been through once before, all children will
 			 * be marked as tried so we'll proceed to combinatorial
 			 * reconstruction.
 			 */
 			int nread = 0;
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				nread += vdev_raidz_read_all(zio,
 				    rm->rm_row[i]);
 			}
 			if (nread != 0) {
 				/*
 				 * Normally our stage is VDEV_IO_DONE, but if
 				 * we've already called redone(), it will have
 				 * changed to VDEV_IO_START, in which case we
 				 * don't want to call redone() again.
 				 */
 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
 					zio_vdev_io_redone(zio);
 				/* Note: returns without releasing rm_lr. */
 				return;
 			}
 			/*
 			 * It would be too expensive to try every possible
 			 * combination of failed sectors in every row, so
 			 * instead we try every combination of failed current or
 			 * past physical disk. This means that if the incorrect
 			 * sectors were all on Nparity disks at any point in the
 			 * past, we will find the correct data.  The only known
 			 * case where this is less durable than a non-expanded
 			 * RAIDZ, is if we have a silent failure during
 			 * expansion.  In that case, one block could be
 			 * partially in the old format and partially in the
 			 * new format, so we'd lose some sectors from the old
 			 * format and some from the new format.
 			 *
 			 * e.g. logical_width=4 physical_width=6
 			 * the 15 (6+5+4) possible failed disks are:
 			 * width=6 child=0
 			 * width=6 child=1
 			 * width=6 child=2
 			 * width=6 child=3
 			 * width=6 child=4
 			 * width=6 child=5
 			 * width=5 child=0
 			 * width=5 child=1
 			 * width=5 child=2
 			 * width=5 child=3
 			 * width=5 child=4
 			 * width=4 child=0
 			 * width=4 child=1
 			 * width=4 child=2
 			 * width=4 child=3
 			 * And we will try every combination of Nparity of these
 			 * failing.
 			 *
 			 * As a first pass, we can generate every combo,
 			 * and try reconstructing, ignoring any known
 			 * failures.  If any row has too many known + simulated
 			 * failures, then we bail on reconstructing with this
 			 * number of simulated failures.  As an improvement,
 			 * we could detect the number of whole known failures
 			 * (i.e. we have known failures on these disks for
 			 * every row; the disks never succeeded), and
 			 * subtract that from the max # failures to simulate.
 			 * We could go even further like the current
 			 * combrec code, but that doesn't seem like it
 			 * gains us very much.  If we simulate a failure
 			 * that is also a known failure, that's fine.
 			 */
 			zio->io_error = vdev_raidz_combrec(zio);
 			/* Speculative i/os do not generate ereports. */
 			if (zio->io_error == ECKSUM &&
 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 				vdev_raidz_io_done_unrecoverable(zio);
 			}
 		}
 	}
 done:
 	/* Drop the range lock held in the map, if there is one. */
 	if (rm->rm_lr != NULL) {
 		zfs_rangelock_exit(rm->rm_lr);
 		rm->rm_lr = NULL;
 	}
 }
 
 /*
  * Set the state of a RAIDZ vdev from the number of faulted and degraded
  * children: more faulted children than parity means it cannot be opened,
  * any faulted or degraded child makes it degraded, otherwise it is healthy.
  */
 static void
 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	if (faulted > vdrz->vd_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded + faulted != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
 /*
  * Determine if any portion of the provided block resides on a child vdev
  * with a dirty DTL and therefore needs to be resilvered.  The function
  * assumes that at least one DTL is dirty which implies that full stripe
  * width blocks must be resilvered.
  */
 static boolean_t
 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * During a RAIDZ expansion the block may live in the old location,
 	 * the new one, or both.  Keep it simple and always resilver.
 	 */
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
 		return (B_TRUE);
 
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t width = vd->vdev_children;
 	uint64_t parity = vdrz->vd_nparity;
 	/* First sector of the block on the RAIDZ (parent) vdev. */
 	uint64_t first_sector = DVA_GET_OFFSET(dva) >> ashift;
 	/* Size of the block in minimum-size sectors, rounded up. */
 	uint64_t nsectors = ((psize - 1) >> ashift) + 1;
 	/* Column that holds the first sector of this stripe. */
 	uint64_t first_col = first_sector % width;
 	/* Number of consecutive columns examined: sectors plus parity. */
 	uint64_t ncols = nsectors + parity;
 
 	/* Unreachable by sequential resilver. */
 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
 
 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 		return (B_FALSE);
 
 	/* The block reaches every child, so it always needs resilvering. */
 	if (ncols >= width)
 		return (B_TRUE);
 
 	for (uint64_t c = 0; c < ncols; c++) {
 		vdev_t *cvd = vd->vdev_child[(first_col + c) % width];
 
 		/*
 		 * dsl_scan_need_resilver() has already checked vd with
 		 * vdev_dtl_contains(), so for the child the cheaper
 		 * vdev_dtl_empty() is a good enough approximation.
 		 */
 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Translate a logical range on the RAIDZ vdev into the range it occupies on
  * the child cvd, storing the result in physical_rs.  While an expansion is
  * running, both output ranges are set to empty instead.
  */
 static void
 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	(void) remain_rs;
 
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
 
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 		/*
 		 * An expansion is in progress, so the translation is in
 		 * flux: whatever we answer may be stale by the time we
 		 * return, and the caller could not safely act on it.  Report
 		 * the range as not present on any child.  The only consumers
 		 * are "zpool initialize" and trimming, which are both "best
 		 * effort" anyway.
 		 */
 		physical_rs->rs_end = 0;
 		physical_rs->rs_start = 0;
 		remain_rs->rs_end = 0;
 		remain_rs->rs_start = 0;
 		return;
 	}
 
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 	uint64_t ncols = vdrz->vd_physical_width;
 	uint64_t col = cvd->vdev_id;
 
 	/* make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 	uint64_t first_blk = logical_rs->rs_start >> ashift;
 	uint64_t last_blk = logical_rs->rs_end >> ashift;
 
 	/*
 	 * Row on this child for each end of the range.  The comparison
 	 * against "col" keeps the subtraction from underflowing.
 	 */
 	uint64_t first_row = (first_blk > col) ?
 	    ((first_blk - col - 1) / ncols) + 1 : 0;
 	uint64_t last_row = (last_blk > col) ?
 	    ((last_blk - col - 1) / ncols) + 1 : 0;
 
 	physical_rs->rs_start = first_row << ashift;
 	physical_rs->rs_end = last_row << ashift;
 
 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_rs->rs_end - logical_rs->rs_start);
 }
 
 /*
  * Sync task, scheduled by raidz_reflow_record_progress(), that commits the
  * reflow progress recorded for this txg: the new offset is stored in the
  * reflow info of spa_uberblock, and the running total of bytes copied is
  * written to the top-level vdev's ZAP.
  *
  * arg is the spa_t; tx identifies the syncing txg.
  */
 static void
 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	/* Index of this txg's slot in the per-txg arrays. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/*
 	 * Ensure there are no i/os to the range that is being committed.
 	 */
 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
 
 	mutex_enter(&vre->vre_lock);
 	/* Never commit progress beyond the lowest failed offset. */
 	uint64_t new_offset =
 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
 	/*
 	 * We should not have committed anything that failed.
 	 */
 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
 	mutex_exit(&vre->vre_lock);
 
 	/*
 	 * Taking the range as writer waits for any i/o that holds part of
 	 * [old_offset, new_offset) before the uberblock is updated.
 	 */
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    old_offset, new_offset - old_offset,
 	    RL_WRITER);
 
 	/*
 	 * Update the uberblock that will be written when this txg completes.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
 	/* Zero marks the slot as unused (see raidz_reflow_record_progress) */
 	vre->vre_offset_pertxg[txgoff] = 0;
 	zfs_rangelock_exit(lr);
 
 	/* Fold this txg's byte count into the total and reset the slot. */
 	mutex_enter(&vre->vre_lock);
 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
 	mutex_exit(&vre->vre_lock);
 
 	/* Persist the updated total. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
 }
 
 /*
  * Finalize a completed RAIDZ expansion.  Runs in syncing context (see the
  * comment near the end of the function).  It records the txg from which the
  * new width applies, marks the expansion finished both in memory and in the
  * top-level vdev's ZAP, clears the reflow info in the uberblock, detaches
  * the expansion state from the spa, restarts initialize/trim, and
  * optionally starts a scrub.
  *
  * arg is the spa_t; tx identifies the syncing txg.
  */
 static void
 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	/* All per-txg progress must already have been committed. */
 	for (int i = 0; i < TXG_SIZE; i++)
 		VERIFY0(vre->vre_offset_pertxg[i]);
 
 	/* Record the txg as of which the new logical width is in effect. */
 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
 	re->re_logical_width = vdrz->vd_physical_width;
 	mutex_enter(&vdrz->vd_expand_lock);
 	avl_add(&vdrz->vd_expand_txgs, re);
 	mutex_exit(&vdrz->vd_expand_lock);
 
 	/* Same lookup, with the same arguments, as raidvd above. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/*
 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
 	 * will get written (based on vd_expand_txgs).
 	 */
 	vdev_config_dirty(vd);
 
 	/*
 	 * Before we change vre_state, the on-disk state must reflect that we
 	 * have completed all copying, so that vdev_raidz_io_start() can use
 	 * vre_state to determine if the reflow is in progress.  See also the
 	 * end of spa_raidz_expand_thread().
 	 */
 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
 
 	vre->vre_end_time = gethrestime_sec();
 	vre->vre_state = DSS_FINISHED;
 
 	/* Persist the final state and end time as 64-bit values. */
 	uint64_t state = vre->vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t end_time = vre->vre_end_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
 	    sizeof (end_time), 1, &end_time, tx));
 
 	/* No reflow is in progress any more. */
 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
 
 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)vd->vdev_id,
 	    (unsigned long long)vd->vdev_children);
 
 	/* Detach the expansion state; vre must not be used via spa now. */
 	spa->spa_raidz_expand = NULL;
 	raidvd->vdev_rz_expanding = B_FALSE;
 
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	spa_notify_waiters(spa);
 
 	/*
 	 * While we're in syncing context take the opportunity to
 	 * setup a scrub. All the data has been successfully copied
 	 * but we have not validated any checksums.
 	 */
 	setup_sync_arg_t setup_sync_arg = {
 		.func = POOL_SCAN_SCRUB,
 		.txgstart = 0,
 		.txgend = 0,
 	};
 	if (zfs_scrub_after_expand &&
 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
 		dsl_scan_setup_sync(&setup_sync_arg, tx);
 	}
 }
 
 /*
  * State of one copy batch.
  *
  * One instance is shared, through zio->io_private, by the read and write
  * completion callbacks of the batch.  It is allocated with room for
  * rra_writes trailing pointers in rra_zio[] and is freed by
  * raidz_reflow_write_done() once the last write has completed.
  */
 typedef struct raidz_reflow_arg {
 	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
 	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
 	uint64_t rra_txg;	/* TXG of this batch. */
 	uint_t rra_ashift;	/* Ashift of the vdev. */
 	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
 	uint32_t rra_writes;	/* Number of write ZIOs. */
 	zio_t *rra_zio[];	/* Write ZIO pointers. */
 } raidz_reflow_arg_t;
 
 /*
  * Write of the new location on one child is done.  Once all of them are done
  * we can unlock and free everything.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	/* The write's ABD is no longer needed. */
 	abd_free(zio->io_abd);
 
 	mutex_enter(&vre->vre_lock);
 	if (zio->io_error != 0) {
 		/* Force a reflow pause on errors */
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 	}
 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
 	vre->vre_outstanding_bytes -= zio->io_size;
 	/*
 	 * Count the bytes as copied only if the whole batch lies below the
 	 * lowest failed offset.
 	 */
 	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
 	    vre->vre_failed_offset) {
 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
 		    zio->io_size;
 	}
 	cv_signal(&vre->vre_cv);
 	/* rra_tbd is decremented under vre_lock here. */
 	boolean_t done = (--rra->rra_tbd == 0);
 	mutex_exit(&vre->vre_lock);
 
 	if (!done)
 		return;
 	/*
 	 * This was the last write of the batch: release SCL_STATE, drop the
 	 * range lock and free the batch state.
 	 */
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 	zfs_rangelock_exit(rra->rra_lr);
 	kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
 }
 
 /*
  * Read of the old location on one child is done.  Once all of them are done
  * writes should have all the data and we can issue them.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	/*
 	 * Reads of only one block use the write ABDs, so there is nothing
 	 * to free for them here.  Bigger reads have an ABD of their own,
 	 * which is freed now.
 	 */
 	if (zio->io_size > (1 << rra->rra_ashift))
 		abd_free(zio->io_abd);
 
 	/*
 	 * If the read failed, or if it was done on a vdev that is not fully
 	 * healthy (e.g. a child that has a resilver in progress), we may not
 	 * have the correct data.  Note that it's OK if the write proceeds.
 	 * It may write garbage but the location is otherwise unused and we
 	 * will retry later due to vre_failed_offset.
 	 */
 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_lr->lr_length,
 		    (long long)rra->rra_txg,
 		    zio->io_error,
 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
 		mutex_enter(&vre->vre_lock);
 		/* Force a reflow pause on errors */
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		mutex_exit(&vre->vre_lock);
 	}
 
 	/* Only the completion of the last read goes on to issue writes. */
 	if (atomic_dec_32_nv(&rra->rra_tbd) > 0)
 		return;
 
 	/*
 	 * All reads are done, so issue the writes.  rra_tbd now counts the
 	 * outstanding writes.  As soon as the last write has been issued,
 	 * raidz_reflow_write_done() may run and free rra, so the number of
 	 * writes is cached in a local and rra is not dereferenced after the
 	 * final zio_nowait().
 	 */
 	uint32_t writes = rra->rra_tbd = rra->rra_writes;
 	for (uint64_t i = 0; i < writes; i++)
 		zio_nowait(rra->rra_zio[i]);
 }
 
 /*
  * Record that the reflow has reached "offset" in the txg of tx.  The first
  * time progress is recorded for a given txg slot, raidz_reflow_sync() is
  * scheduled as a sync task for that txg.  An offset of zero is ignored.
  */
 static void
 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
     dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	if (offset != 0) {
 		mutex_enter(&vre->vre_lock);
 		ASSERT3U(vre->vre_offset, <=, offset);
 		vre->vre_offset = offset;
 		mutex_exit(&vre->vre_lock);
 
 		/* A zero slot means no sync task is queued for this txg. */
 		if (vre->vre_offset_pertxg[slot] == 0) {
 			dsl_sync_task_nowait(dmu_tx_pool(tx),
 			    raidz_reflow_sync, spa, tx);
 		}
 		vre->vre_offset_pertxg[slot] = offset;
 	}
 }
 
 /*
  * Return B_TRUE if any child of the given RAIDZ vdev is not a leaf vdev,
  * which is used as a quick indication that a child is being replaced.
  */
 static boolean_t
 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
 {
 	int i = 0;
 
 	while (i < raidz_vd->vdev_children) {
 		vdev_t *child = raidz_vd->vdev_child[i];
 
 		if (!child->vdev_ops->vdev_op_leaf)
 			return (B_TRUE);
 		i++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Issue the copy i/o for the next chunk at the head of the range tree `rt`:
  * the sectors are read from their location in the old layout
  * (vdev_children - 1 wide) and written to their location in the new layout
  * (vdev_children wide).  The copied range is removed from `rt` and the
  * in-memory progress is advanced past it.
  *
  * Returns B_TRUE when no copy was issued and the caller should wait for the
  * txg to sync before calling again: either we have caught up with the
  * progress recorded in spa_ubsync, or a replacing child vdev was found (in
  * which case vre_failed_offset is also set).  Returns B_FALSE otherwise,
  * including when `rt` has no segments.
  */
 static boolean_t
 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
     dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint_t ashift = vd->vdev_top->vdev_ashift;
 
 	/*
 	 * Check the returned segment (not the tree pointer, which has
 	 * already been dereferenced by range_tree_first()): an empty tree
 	 * yields no segment and there is nothing to copy.
 	 */
 	range_seg_t *rs = range_tree_first(rt);
 	if (rs == NULL)
 		return (B_FALSE);
 	uint64_t offset = rs_get_start(rs, rt);
 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
 	uint64_t size = rs_get_end(rs, rt) - offset;
 	ASSERT3U(size, >=, 1 << ashift);
 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
 
 	uint64_t blkid = offset >> ashift;
 	uint_t old_children = vd->vdev_children - 1;
 
 	/*
 	 * We can only progress to the point that writes will not overlap
 	 * with blocks whose progress has not yet been recorded on disk.
 	 * Since partially-copied rows are still read from the old location,
 	 * we need to stop one row before the sector-wise overlap, to prevent
 	 * row-wise overlap.
 	 *
 	 * Note that even if we are skipping over a large unallocated region,
 	 * we can't move the on-disk progress to `offset`, because concurrent
 	 * writes/allocations could still use the currently-unallocated
 	 * region.
 	 */
 	uint64_t ubsync_blkid =
 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
 	uint64_t next_overwrite_blkid = ubsync_blkid +
 	    ubsync_blkid / old_children - old_children;
 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
 	if (blkid >= next_overwrite_blkid) {
 		raidz_reflow_record_progress(vre,
 		    next_overwrite_blkid << ashift, tx);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Bound the chunk: by the copy tunable, by old_children maximum-size
 	 * records, by at least one sector, and by the overwrite limit
 	 * computed above.
 	 */
 	size = MIN(size, raidz_expand_max_copy_bytes);
 	size = MIN(size, (uint64_t)old_children *
 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
 	size = MAX(size, 1 << ashift);
 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
 	size = (uint64_t)blocks << ashift;
 
 	range_tree_remove(rt, offset, size);
 
 	/* At most one read per old child and one write per new child. */
 	uint_t reads = MIN(blocks, old_children);
 	uint_t writes = MIN(blocks, vd->vdev_children);
 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
 	    sizeof (zio_t *) * writes, KM_SLEEP);
 	rra->rra_vre = vre;
 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    offset, size, RL_WRITER);
 	rra->rra_txg = dmu_tx_get_txg(tx);
 	rra->rra_ashift = ashift;
 	rra->rra_tbd = reads;
 	rra->rra_writes = writes;
 
 	raidz_reflow_record_progress(vre, offset + size, tx);
 
 	/*
 	 * SCL_STATE will be released when the read and write are done,
 	 * by raidz_reflow_write_done().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/* check if a replacing vdev was added, if so treat it as an error */
 	if (vdev_raidz_expand_child_replacing(vd)) {
 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
 		    "offset=%llu txg=%llu",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_txg);
 
 		mutex_enter(&vre->vre_lock);
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		cv_signal(&vre->vre_cv);
 		mutex_exit(&vre->vre_lock);
 
 		/* drop everything we acquired */
 		spa_config_exit(spa, SCL_STATE, spa);
 		zfs_rangelock_exit(rra->rra_lr);
 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
 		return (B_TRUE);
 	}
 
 	mutex_enter(&vre->vre_lock);
 	vre->vre_outstanding_bytes += size;
 	mutex_exit(&vre->vre_lock);
 
 	/* Allocate ABD and ZIO for each child we write. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	zio_t *pio = spa->spa_txg_zio[txgoff];
 	uint_t b = blocks / vd->vdev_children;
 	uint_t bb = blocks % vd->vdev_children;
 	for (uint_t i = 0; i < writes; i++) {
 		/* The first `bb` children each carry one extra sector. */
 		uint_t n = b + (i < bb);
 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
 		    ((blkid + i) / vd->vdev_children) << ashift,
 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
 	}
 
 	/*
 	 * Allocate and issue ZIO for each child we read.  For reads of only
 	 * one block we can use respective writer ABDs, since they will also
 	 * have only one block.  For bigger reads create gang ABDs and fill
 	 * them with respective blocks from writer ABDs.
 	 */
 	b = blocks / old_children;
 	bb = blocks % old_children;
 	for (uint_t i = 0; i < reads; i++) {
 		uint_t n = b + (i < bb);
 		abd_t *abd;
 		if (n > 1) {
 			abd = abd_alloc_gang();
 			for (uint_t j = 0; j < n; j++) {
 				/* Index of this sector within the chunk. */
 				uint_t blk = j * old_children + i;
 				zio_t *wzio =
 				    rra->rra_zio[blk % vd->vdev_children];
 				abd_t *cabd = abd_get_offset_size(
 				    wzio->io_abd,
 				    (blk / vd->vdev_children) << ashift,
 				    1 << ashift);
 				abd_gang_add(abd, cabd, B_TRUE);
 			}
 		} else {
 			abd = rra->rra_zio[i]->io_abd;
 		}
 		zio_nowait(zio_vdev_child_io(pio, NULL,
 		    vd->vdev_child[(blkid + i) % old_children],
 		    ((blkid + i) / old_children) << ashift, abd,
 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * For testing (ztest specific): keep calling delay(hz) for as long as
  * raidz_expand_pause_point is set (non-zero) and is not greater than the
  * given pause point.
  */
 static void
 raidz_expand_pause(uint_t pause_point)
 {
 	for (;;) {
 		/* No pause requested at all */
 		if (raidz_expand_pause_point == 0)
 			break;
 		/* The requested pause is at a later point than this one */
 		if (raidz_expand_pause_point > pause_point)
 			break;
 		delay(hz);
 	}
 }
 
 /*
  * Done callback for the scratch-area child zios.  The zio stored in
  * io_private accumulates the worst error seen among the children; the
  * update is made under that zio's io_lock.
  */
 static void
 raidz_scratch_child_done(zio_t *zio)
 {
 	zio_t *parent = zio->io_private;
 
 	mutex_enter(&parent->io_lock);
 	int worst = zio_worst_error(parent->io_error, zio->io_error);
 	parent->io_error = worst;
 	mutex_exit(&parent->io_lock);
 }
 
 /*
  * Reflow the beginning portion of the vdev into an intermediate scratch area
  * in memory and on disk. This operation must be persisted on disk before we
  * proceed to overwrite the beginning portion with the reflowed data.
  *
  * This multi-step task can fail to complete if disk errors are encountered
  * and we can return here after a pause (waiting for disk to become healthy).
  *
  * Steps, in order:
  *   1. read the first portion of each old child (or, if the uberblock
  *      already says the scratch area is valid, read the scratch area and
  *      skip to step 5);
  *   2. reflow the sectors in memory;
  *   3. write the result to the scratch (boot) area and flush;
  *   4. sync an uberblock with state RRSS_SCRATCH_VALID;
  *   5. overwrite the real location with the reflowed data and flush;
  *   6. sync an uberblock with state RRSS_SCRATCH_INVALID_SYNCED;
  *   7. record the progress in vre and call raidz_reflow_sync().
  *
  * On any i/o error the function frees its buffers, drops its locks and
  * returns without advancing vre_offset; the caller detects this by
  * vre_offset still being zero.
  *
  * arg is the vdev_raidz_expand_t of the expanding vdev; tx is the sync
  * task's transaction.
  */
 static void
 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_raidz_expand_t *vre = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zio_t *pio;
 	int error;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	int ashift = raidvd->vdev_ashift;
 	/* Per-child amount written in the new layout (sector aligned). */
 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
 	    uint64_t);
 	uint64_t logical_size = write_size * raidvd->vdev_children;
 	/* Per-child amount read from the old (one child narrower) layout. */
 	uint64_t read_size =
 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
 	    1 << ashift);
 
 	/*
 	 * The scratch space must be large enough to get us to the point
 	 * that one row does not overlap itself when moved.  This is checked
 	 * by vdev_raidz_attach_check().
 	 */
 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
 	VERIFY3U(write_size, <=, read_size);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    0, logical_size, RL_WRITER);
 
 	/* One buffer per child, sized for the larger (read) amount. */
 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
 
 	/*
 	 * If we have already written the scratch area then we must read from
 	 * there, since new writes were redirected there while we were paused
 	 * or the original location may have been partially overwritten with
 	 * reflowed data.
 	 */
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
 		/*
 		 * Read from scratch space.
 		 */
 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (int i = 0; i < raidvd->vdev_children; i++) {
 			/*
 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
 			 * to the offset to calculate the physical offset to
 			 * write to.  Passing in a negative offset makes us
 			 * access the scratch area.
 			 */
 			zio_nowait(zio_vdev_child_io(pio, NULL,
 			    raidvd->vdev_child[i],
 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 		}
 		error = zio_wait(pio);
 		if (error != 0) {
 			zfs_dbgmsg("reflow: error %d reading scratch location",
 			    error);
 			goto io_error_exit;
 		}
 		/* The scratch contents are already reflowed. */
 		goto overwrite;
 	}
 
 	/*
 	 * Read from original location.
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], read_size, ZIO_TYPE_READ,
 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d reading original location", error);
 		/* Common cleanup for every i/o failure in this function. */
 io_error_exit:
 		for (int i = 0; i < raidvd->vdev_children; i++)
 			abd_free(abds[i]);
 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 		zfs_rangelock_exit(lr);
 		spa_config_exit(spa, SCL_STATE, FTAG);
 		return;
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
 
 	/*
 	 * Reflow in memory.  The first (vdev_children - 1) sectors are
 	 * skipped by starting the loop there.
 	 */
 	uint64_t logical_sectors = logical_size >> ashift;
 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
 		int oldchild = i % (raidvd->vdev_children - 1);
 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
 
 		int newchild = i % raidvd->vdev_children;
 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
 
 		/* a single sector should not be copying over itself */
 		ASSERT(!(newchild == oldchild && newoff == oldoff));
 
 		abd_copy_off(abds[newchild], abds[oldchild],
 		    newoff, oldoff, 1 << ashift);
 	}
 
 	/*
 	 * Verify that we filled in everything we intended to (write_size on
 	 * each child).
 	 */
 	VERIFY0(logical_sectors % raidvd->vdev_children);
 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
 	    write_size);
 
 	/*
 	 * Write to scratch location (boot area).
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		/*
 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 		 * the offset to calculate the physical offset to write to.
 		 * Passing in a negative offset lets us access the boot area.
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
 	    (long long)logical_size);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
 
 	/*
 	 * Update uberblock to indicate that scratch space is valid.  This is
 	 * needed because after this point, the real location may be
 	 * overwritten.  If we crash, we need to get the data from the
 	 * scratch space, rather than the real location.
 	 *
 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
 	 * will prefer this uberblock.
 	 *
 	 * VERIFY0 (not ASSERT0) is used because the uberblock write is a
 	 * required side effect and must be executed in every build.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
 
 	/*
 	 * Overwrite with reflow'ed data.
 	 */
 overwrite:
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		/*
 		 * When we exit early here and drop the range lock, new
 		 * writes will go into the scratch area so we'll need to
 		 * read from there when we return after pausing.
 		 */
 		zfs_dbgmsg("reflow: error %d writing real location", error);
 		/*
 		 * Update the uberblock that is written when this txg completes.
 		 */
 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
 		    logical_size);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
 	    (long long)logical_size);
 	for (int i = 0; i < raidvd->vdev_children; i++)
 		abd_free(abds[i]);
 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
 
 	/*
 	 * Update uberblock to indicate that the initial part has been
 	 * reflow'ed.  This is needed because after this point (when we exit
 	 * the rangelock), we allow regular writes to this region, which will
 	 * be written to the new location only (because reflow_offset_next ==
 	 * reflow_offset_synced).  If we crashed and re-copied from the
 	 * scratch space, we would lose the regular writes.
 	 *
 	 * As above, VERIFY0 keeps the uberblock write in every build.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
 	    logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
 
 	/*
 	 * Update progress.
 	 */
 	vre->vre_offset = logical_size;
 	zfs_rangelock_exit(lr);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note - raidz_reflow_sync() will update the uberblock state to
 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
 }
 
 /*
  * Complete the work of raidz_reflow_scratch_sync() after a crash in the
  * middle of it: the uberblock says the scratch area is valid, so copy the
  * scratch contents over the real location and then record that the scratch
  * area is no longer in use.  No other i/o can be in progress, so we don't
  * need the vre_rangelock.
  */
 void
 vdev_raidz_reflow_copy_scratch(spa_t *spa)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	uint64_t nchildren = raidvd->vdev_children;
 	ASSERT0(logical_size % nchildren);
 	uint64_t write_size = logical_size / nchildren;
 	size_t abds_size = nchildren * sizeof (abd_t *);
 
 	/*
 	 * One buffer per child, each holding that child's share of the
 	 * scratch data.
 	 */
 	abd_t **abds = kmem_alloc(abds_size, KM_SLEEP);
 	for (uint64_t c = 0; c < nchildren; c++)
 		abds[c] = abd_alloc_linear(write_size, B_FALSE);
 
 	/*
 	 * Read from scratch space.
 	 *
 	 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to the
 	 * offset to calculate the physical offset.  Passing in a negative
 	 * offset lets us access the boot area.
 	 */
 	zio_t *read_pio = zio_root(spa, NULL, NULL, 0);
 	for (uint64_t c = 0; c < nchildren; c++) {
 		zio_t *cio = zio_vdev_child_io(read_pio, NULL,
 		    raidvd->vdev_child[c],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[c],
 		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, read_pio);
 		zio_nowait(cio);
 	}
 	zio_wait(read_pio);
 
 	/*
 	 * Overwrite real location with reflow'ed data, then flush.
 	 */
 	zio_t *write_pio = zio_root(spa, NULL, NULL, 0);
 	for (uint64_t c = 0; c < nchildren; c++) {
 		zio_t *cio = zio_vdev_child_io(write_pio, NULL,
 		    raidvd->vdev_child[c], 0, abds[c], write_size,
 		    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, write_pio);
 		zio_nowait(cio);
 	}
 	zio_wait(write_pio);
 
 	zio_t *flush_pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(flush_pio, raidvd);
 	zio_wait(flush_pio);
 
 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
 	    "to real location", (long long)logical_size);
 
 	for (uint64_t c = 0; c < nchildren; c++)
 		abd_free(abds[c]);
 	kmem_free(abds, abds_size);
 
 	/*
 	 * Update uberblock: the scratch area is no longer needed.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow recovery: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	/* Record the progress in the pool's first txg. */
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 	    spa_first_txg(spa));
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset = logical_size;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note that raidz_reflow_sync() will update the uberblock once more
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	dmu_tx_commit(tx);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
 
 /*
  * zthr check function: the expansion thread has work to do when the pool
  * has an expansion in progress that is not waiting for a resilver.
  */
 static boolean_t
 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
 {
 	(void) zthr;
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (vre == NULL)
 		return (B_FALSE);
 	if (vre->vre_waiting_for_resilver)
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * RAIDZ expansion background thread
  *
  * Can be called multiple times if the reflow is paused
  *
  * Outline:
  *  - pick the starting offset from spa_ubsync (0 if the scratch area is
  *    still valid);
  *  - if starting from 0, run raidz_reflow_scratch_sync() as a sync task and
  *    pause (vre_waiting_for_resilver) if it made no progress;
  *  - walk the remaining metaslabs, repeatedly calling raidz_reflow_impl()
  *    on the not-yet-copied ranges of each one, until done, cancelled, or a
  *    failure offset has been recorded;
  *  - wait for the txg to sync, then either mark the expansion complete or
  *    log a pause and, after a failure, rewind vre_offset to the failed
  *    offset.
  *
  * arg is the spa_t; zthr is only consulted for cancellation.
  */
 static void
 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/* While the scratch area is valid the first portion must be redone. */
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
 		vre->vre_offset = 0;
 	else
 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
 
 	/* Reflow the beginning portion using the scratch area */
 	if (vre->vre_offset == 0) {
 		VERIFY0(dsl_sync_task(spa_name(spa),
 		    NULL, raidz_reflow_scratch_sync,
 		    vre, 0, ZFS_SPACE_CHECK_NONE));
 
 		/* if we encountered errors then pause */
 		if (vre->vre_offset == 0) {
 			mutex_enter(&vre->vre_lock);
 			vre->vre_waiting_for_resilver = B_TRUE;
 			mutex_exit(&vre->vre_lock);
 			return;
 		}
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/* Saved now for the vdev_online() call made on completion. */
 	uint64_t guid = raidvd->vdev_guid;
 
 	/* Iterate over all the remaining metaslabs */
 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
 	    i < raidvd->vdev_ms_count &&
 	    !zthr_iscancelled(zthr) &&
 	    vre->vre_failed_offset == UINT64_MAX; i++) {
 		metaslab_t *msp = raidvd->vdev_ms[i];
 
 		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * The metaslab may be newly created (for the expanded
 		 * space), in which case its trees won't exist yet,
 		 * so we need to bail out early.
 		 */
 		if (msp->ms_new) {
 			mutex_exit(&msp->ms_lock);
 			metaslab_enable(msp, B_FALSE, B_FALSE);
 			continue;
 		}
 
 		VERIFY0(metaslab_load(msp));
 
 		/*
 		 * We want to copy everything except the free (allocatable)
 		 * space.  Note that there may be a little bit more free
 		 * space (e.g. in ms_defer), and it's fine to copy that too.
 		 */
 		uint64_t shift, start;
 		range_seg_type_t type = metaslab_calculate_range_tree_type(
 		    raidvd, msp, &start, &shift);
 		range_tree_t *rt = range_tree_create(NULL, type, NULL,
 		    start, shift);
 		/* Start with the whole metaslab, then carve out free space. */
 		range_tree_add(rt, msp->ms_start, msp->ms_size);
 		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
 		mutex_exit(&msp->ms_lock);
 
 		/*
 		 * Force the last sector of each metaslab to be copied.  This
 		 * ensures that we advance the on-disk progress to the end of
 		 * this metaslab while the metaslab is disabled.  Otherwise, we
 		 * could move past this metaslab without advancing the on-disk
 		 * progress, and then an allocation to this metaslab would not
 		 * be copied.
 		 */
 		int sectorsz = 1 << raidvd->vdev_ashift;
 		uint64_t ms_last_offset = msp->ms_start +
 		    msp->ms_size - sectorsz;
 		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
 			range_tree_add(rt, ms_last_offset, sectorsz);
 		}
 
 		/*
 		 * When we are resuming from a paused expansion (i.e.
 		 * when importing a pool with a expansion in progress),
 		 * discard any state that we have already processed.
 		 */
 		if (vre->vre_offset > msp->ms_start) {
 			range_tree_clear(rt, msp->ms_start,
 			    vre->vre_offset - msp->ms_start);
 		}
 
 		/* Copy this metaslab's remaining ranges, one chunk per tx. */
 		while (!zthr_iscancelled(zthr) &&
 		    !range_tree_is_empty(rt) &&
 		    vre->vre_failed_offset == UINT64_MAX) {
 
 			/*
 			 * We need to periodically drop the config lock so that
 			 * writers can get in.  Additionally, we can't wait
 			 * for a txg to sync while holding a config lock
 			 * (since a waiting writer could cause a 3-way deadlock
 			 * with the sync thread, which also gets a config
 			 * lock for reader).  So we can't hold the config lock
 			 * while calling dmu_tx_assign().
 			 */
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * If requested, pause the reflow when the amount
 			 * specified by raidz_expand_max_reflow_bytes is reached
 			 *
 			 * This pause is only used during testing or debugging.
 			 */
 			while (raidz_expand_max_reflow_bytes != 0 &&
 			    raidz_expand_max_reflow_bytes <=
 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
 				delay(hz);
 			}
 
 			/*
 			 * Throttle: wait on vre_cv until the outstanding
 			 * copy bytes drop to raidz_expand_max_copy_bytes
 			 * or below.
 			 */
 			mutex_enter(&vre->vre_lock);
 			while (vre->vre_outstanding_bytes >
 			    raidz_expand_max_copy_bytes) {
 				cv_wait(&vre->vre_cv, &vre->vre_lock);
 			}
 			mutex_exit(&vre->vre_lock);
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 
 			/*
 			 * Reacquire the vdev_config lock.  Theoretically, the
 			 * vdev_t that we're expanding may have changed.
 			 */
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 			boolean_t needsync =
 			    raidz_reflow_impl(raidvd, vre, rt, tx);
 
 			dmu_tx_commit(tx);
 
 			/*
 			 * raidz_reflow_impl() could not make progress in this
 			 * txg; wait for it to sync (without the config lock
 			 * held) before trying again.
 			 */
 			if (needsync) {
 				spa_config_exit(spa, SCL_CONFIG, FTAG);
 				txg_wait_synced(spa->spa_dsl_pool, txg);
 				spa_config_enter(spa, SCL_CONFIG, FTAG,
 				    RW_READER);
 			}
 		}
 
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 		metaslab_enable(msp, B_FALSE, B_FALSE);
 		range_tree_vacate(rt, NULL, NULL);
 		range_tree_destroy(rt);
 
 		/* Re-take the config lock and re-look-up the vdev. */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * The txg_wait_synced() here ensures that all reflow zio's have
 	 * completed, and vre_failed_offset has been set if necessary.  It
 	 * also ensures that the progress of the last raidz_reflow_sync() is
 	 * written to disk before raidz_reflow_complete_sync() changes the
 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
 	 * determine if a reflow is in progress, in which case we may need to
 	 * write to both old and new locations.  Therefore we can only change
 	 * vre_state once this is not necessary, which is once the on-disk
 	 * progress (in spa_ubsync) has been set past any possible writes (to
 	 * the end of the last metaslab).
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	/*
 	 * NOTE(review): raidvd is read here after SCL_CONFIG has been
 	 * dropped above -- confirm that this is safe.
 	 */
 	if (!zthr_iscancelled(zthr) &&
 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
 		/*
 		 * We are not being canceled or paused, so the reflow must be
 		 * complete. In that case also mark it as completed on disk.
 		 */
 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 		    raidz_reflow_complete_sync, spa,
 		    0, ZFS_SPACE_CHECK_NONE));
 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
 	} else {
 		/*
 		 * Wait for all copy zio's to complete and for all the
 		 * raidz_reflow_sync() synctasks to be run.
 		 */
 		spa_history_log_internal(spa, "reflow pause",
 		    NULL, "offset=%llu failed_offset=%lld",
 		    (long long)vre->vre_offset,
 		    (long long)vre->vre_failed_offset);
 		mutex_enter(&vre->vre_lock);
 		if (vre->vre_failed_offset != UINT64_MAX) {
 			/*
 			 * Reset progress so that we will retry everything
 			 * after the point that something failed.
 			 */
 			vre->vre_offset = vre->vre_failed_offset;
 			vre->vre_failed_offset = UINT64_MAX;
 			vre->vre_waiting_for_resilver = B_TRUE;
 		}
 		mutex_exit(&vre->vre_lock);
 	}
 }
 
 /*
  * Create the pool's "raidz_expand" zthr, which runs
  * spa_raidz_expand_thread() whenever spa_raidz_expand_thread_check() says
  * there is work to do.  The zthr must not already exist.
  */
 void
 spa_start_raidz_expansion_thread(spa_t *spa)
 {
 	zthr_t *expand_zthr;
 
 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
 
 	expand_zthr = zthr_create("raidz_expand",
 	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
 	    spa, defclsyspri);
 	spa->spa_raidz_expand_zthr = expand_zthr;
 }
 
 /*
  * Notification that the DTL of `vd` has been reassessed.  If an expansion
  * of vd's top-level vdev was paused waiting for a resilver, and none of the
  * top-level vdev's children is being replaced any more, clear the wait flag
  * and wake up the expansion thread.
  */
 void
 raidz_dtl_reassessed(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/* No expansion in progress in this pool */
 	if (vre == NULL)
 		return;
 
 	/*
 	 * we get called often from vdev_dtl_reassess() so make
 	 * sure it's our vdev and any replacing is complete
 	 */
 	if (vd->vdev_top->vdev_id != vre->vre_vdev_id)
 		return;
 	if (vdev_raidz_expand_child_replacing(vd->vdev_top))
 		return;
 
 	mutex_enter(&vre->vre_lock);
 	if (vre->vre_waiting_for_resilver) {
 		vdev_dbgmsg(vd, "DTL reassessed, "
 		    "continuing raidz expansion");
 		vre->vre_waiting_for_resilver = B_FALSE;
 		zthr_wakeup(spa->spa_raidz_expand_zthr);
 	}
 	mutex_exit(&vre->vre_lock);
 }
 
 /*
  * Check whether `new_child` may be attached to its parent raidz vdev for
  * expansion.
  *
  * Returns 0 if the attach is allowed, or EINVAL (via SET_ERROR, like the
  * other error returns in this file) if the boot area is too small to serve
  * as scratch space for the new width.
  */
 int
 vdev_raidz_attach_check(vdev_t *new_child)
 {
 	vdev_t *raidvd = new_child->vdev_parent;
 	uint64_t new_children = raidvd->vdev_children;
 
 	/*
 	 * We use the "boot" space as scratch space to handle overwriting the
 	 * initial part of the vdev.  If it is too small, then this expansion
 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
 	 * >200 children).
 	 */
 	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
 		return (SET_ERROR(EINVAL));
 	}
 	return (0);
 }
 
 /*
  * Sync-context half of attaching a new child to a raidz vdev: bump the
  * feature refcount and physical width, initialize the in-memory expansion
  * state, wake the expansion thread, and persist the initial expansion state
  * and start time in the top-level vdev's ZAP.
  *
  * arg is the newly attached child vdev, which must be the last child of a
  * top-level raidz vdev.
  */
 void
 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *new_child = arg;
 	spa_t *spa = new_child->vdev_spa;
 	vdev_t *raidvd = new_child->vdev_parent;
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 	vdev_raidz_expand_t *vre = &vdrz->vn_vre;
 
 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
 	    new_child);
 
 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
 
 	vdrz->vd_physical_width++;
 
 	/* No reflow may already be recorded in the uberblock. */
 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
 	vre->vre_vdev_id = raidvd->vdev_id;
 	vre->vre_offset = 0;
 	vre->vre_failed_offset = UINT64_MAX;
 	spa->spa_raidz_expand = vre;
 	zthr_wakeup(spa->spa_raidz_expand_zthr);
 
 	/*
 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
 	 * written to the config.
 	 */
 	vdev_config_dirty(raidvd);
 
 	vre->vre_start_time = gethrestime_sec();
 	vre->vre_end_time = 0;
 	vre->vre_state = DSS_SCANNING;
 	vre->vre_bytes_copied = 0;
 
 	/* Persist the state and start time ... */
 	uint64_t state = vre->vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset, raidvd->vdev_top_zap,
 	    VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, sizeof (state), 1, &state, tx));
 
 	uint64_t start_time = vre->vre_start_time;
 	VERIFY0(zap_update(spa->spa_meta_objset, raidvd->vdev_top_zap,
 	    VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, sizeof (start_time), 1,
 	    &start_time, tx));
 
 	/* ... and drop any end time / byte count (failures are ignored). */
 	(void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap,
 	    VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
 	(void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap,
 	    VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
 
 	spa_history_log_internal(spa, "raidz vdev expansion started", tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)raidvd->vdev_id,
 	    (unsigned long long)raidvd->vdev_children);
 }
 
 /*
  * Load the persistent expansion statistics (state, start/end time, bytes
  * copied) of a raidz vdev from its top-level ZAP into vn_vre.  Entries that
  * do not exist keep their defaults (DSS_NONE / 0).
  *
  * Returns 0 on success, or the error of the first ZAP lookup that failed
  * with anything other than ENOENT.
  */
 int
 vdev_raidz_load(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	vdev_raidz_expand_t *vre = &vdrz->vn_vre;
 
 	uint64_t state = DSS_NONE;
 	uint64_t start_time = 0;
 	uint64_t end_time = 0;
 	uint64_t bytes_copied = 0;
 
 	/* ZAP entries to fetch, in lookup order, and where each one lands. */
 	const struct {
 		const char *zl_name;
 		uint64_t *zl_valuep;
 	} lookups[] = {
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, &state },
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, &start_time },
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, &end_time },
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, &bytes_copied },
 	};
 	const size_t nlookups = sizeof (lookups) / sizeof (lookups[0]);
 
 	if (vd->vdev_top_zap != 0) {
 		for (size_t n = 0; n < nlookups; n++) {
 			int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 			    vd->vdev_top_zap, lookups[n].zl_name,
 			    sizeof (uint64_t), 1, lookups[n].zl_valuep);
 
 			/* A missing entry is fine; anything else is fatal. */
 			if (err != 0 && err != ENOENT)
 				return (err);
 		}
 	}
 
 	/*
 	 * If we are in the middle of expansion, vre_state should have
 	 * already been set by vdev_raidz_init().
 	 */
 	EQUIV(vre->vre_state == DSS_SCANNING, state == DSS_SCANNING);
 	vre->vre_state = (dsl_scan_state_t)state;
 	vre->vre_start_time = start_time;
 	vre->vre_end_time = end_time;
 	vre->vre_bytes_copied = bytes_copied;
 
 	return (0);
 }
 
 /*
  * Fill in `pres` with the statistics of the pool's raidz expansion: the one
  * in progress if there is one, otherwise the completed expansion with the
  * latest end time among the top-level raidz vdevs.
  *
  * Returns 0 on success, or ENOENT if no expansion is in progress and none
  * has completed.
  */
 int
 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (vre == NULL) {
 		/* no expansion in progress; find most recent completed */
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			if (tvd->vdev_ops != &vdev_raidz_ops)
 				continue;
 
 			vdev_raidz_t *vdrz = tvd->vdev_tsd;
 			vdev_raidz_expand_t *cand = &vdrz->vn_vre;
 
 			/* An end time of zero means it never completed */
 			if (cand->vre_end_time == 0)
 				continue;
 
 			if (vre == NULL ||
 			    cand->vre_end_time > vre->vre_end_time)
 				vre = cand;
 		}
 	}
 
 	if (vre == NULL) {
 		return (SET_ERROR(ENOENT));
 	}
 
 	pres->pres_state = vre->vre_state;
 	pres->pres_expanding_vdev = vre->vre_vdev_id;
 
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
 
 	/* Synced bytes plus whatever is still accounted per open txg. */
 	mutex_enter(&vre->vre_lock);
 	uint64_t reflowed = vre->vre_bytes_copied;
 	for (int t = 0; t < TXG_SIZE; t++)
 		reflowed += vre->vre_bytes_copied_pertxg[t];
 	pres->pres_reflowed = reflowed;
 	mutex_exit(&vre->vre_lock);
 
 	pres->pres_start_time = vre->vre_start_time;
 	pres->pres_end_time = vre->vre_end_time;
 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
 
 	return (0);
 }
 
 /*
  * Initialize private RAIDZ specific fields from the nvlist.
  *
  * Validates the child array and parity level found in `nv`, allocates the
  * vdev_raidz_t, and reconstructs the expansion history (one reflow_node_t
  * per entry of ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS) and the original width.
  * On success the new vdev_raidz_t is returned through `tsd`.
  *
  * Returns 0 on success or EINVAL if the config is not acceptable.
  */
 static int
 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	uint_t children;
 	nvlist_t **child;
 	uint64_t nparity;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) != 0) {
 		/*
 		 * We require the parity to be specified for SPAs that
 		 * support multiple parity levels.
 		 */
 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Otherwise, we default to 1 parity device for RAID-Z.
 		 */
 		nparity = 1;
 	} else {
 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Previous versions could only support 1 or 2 parity
 		 * device.
 		 */
 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 		if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
 			return (SET_ERROR(EINVAL));
 	}
 
 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
 	vdev_raidz_expand_t *vre = &vdrz->vn_vre;
 
 	vre->vre_vdev_id = -1;
 	vre->vre_offset = UINT64_MAX;
 	vre->vre_failed_offset = UINT64_MAX;
 	mutex_init(&vre->vre_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vre->vre_cv, NULL, CV_DEFAULT, NULL);
 	zfs_rangelock_init(&vre->vre_rangelock, NULL, NULL);
 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
 
 	vdrz->vd_physical_width = children;
 	vdrz->vd_nparity = nparity;
 
 	/* note, the ID does not exist when creating a pool */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &vre->vre_vdev_id);
 
 	boolean_t reflow_in_progress =
 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	if (reflow_in_progress) {
 		spa->spa_raidz_expand = vre;
 		vre->vre_state = DSS_SCANNING;
 	}
 
 	/* Without any recorded expansions the vdev still has its first width */
 	vdrz->vd_original_width = children;
 
 	uint64_t *txgs;
 	unsigned int txgs_size = 0;
 	if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 	    &txgs, &txgs_size) == 0) {
 		/*
 		 * Walk the txg array from its last entry to its first; each
 		 * step back is one child narrower.
 		 */
 		for (int i = 0; i < txgs_size; i++) {
 			reflow_node_t *node =
 			    kmem_zalloc(sizeof (*node), KM_SLEEP);
 
 			node->re_txg = txgs[txgs_size - i - 1];
 			node->re_logical_width = vdrz->vd_physical_width - i;
 
 			/* The child being reflowed does not count yet */
 			if (reflow_in_progress)
 				node->re_logical_width--;
 
 			avl_add(&vdrz->vd_expand_txgs, node);
 		}
 
 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
 	}
 
 	if (reflow_in_progress) {
 		vdrz->vd_original_width--;
 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
 		    children, txgs_size);
 	}
 
 	*tsd = vdrz;
 
 	return (0);
 }
 
 /*
  * Tear down the RAIDZ private state hanging off vd->vdev_tsd: free every
  * node of the expansion-txg AVL tree, destroy the tree, the locks, the
  * condvar and the rangelock, then free the vdev_raidz_t itself.  If the
  * pool's spa_raidz_expand points into this state, it is cleared first.
  */
 static void
 vdev_raidz_fini(vdev_t *vd)
 {
 	vdev_raidz_t *raidz = vd->vdev_tsd;
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t *expand_txgs = &raidz->vd_expand_txgs;
 	void *walk = NULL;
 
 	if (spa->spa_raidz_expand == &raidz->vn_vre)
 		spa->spa_raidz_expand = NULL;
 
 	for (;;) {
 		reflow_node_t *node = avl_destroy_nodes(expand_txgs, &walk);
 
 		if (node == NULL)
 			break;
 		kmem_free(node, sizeof (*node));
 	}
 	avl_destroy(expand_txgs);
 
 	mutex_destroy(&raidz->vd_expand_lock);
 	mutex_destroy(&raidz->vn_vre.vre_lock);
 	cv_destroy(&raidz->vn_vre.vre_cv);
 	zfs_rangelock_fini(&raidz->vn_vre.vre_rangelock);
 	kmem_free(raidz, sizeof (*raidz));
 }
 
 /*
  * Add the RAIDZ-specific entries to a vdev config nvlist: the parity
  * count, a boolean flag while the expansion state is DSS_SCANNING, and
  * (when any are recorded) the array of txgs held in vd_expand_txgs.
  */
 static void
 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
 	vdev_raidz_t *raidz = vd->vdev_tsd;
 	avl_tree_t *expansions = &raidz->vd_expand_txgs;
 
 	/*
 	 * A parity level above 1 is only legal on a pool whose version
 	 * supports it; catch a new-style vdev in an old-style pool.
 	 */
 	ASSERT(raidz->vd_nparity == 1 ||
 	    (raidz->vd_nparity <= 2 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
 	    (raidz->vd_nparity <= 3 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
 
 	/*
 	 * The parity is written unconditionally, even on pools that do
 	 * not strictly need it -- older software simply ignores it.
 	 */
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, raidz->vd_nparity);
 
 	if (raidz->vn_vre.vre_state == DSS_SCANNING)
 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 
 	mutex_enter(&raidz->vd_expand_lock);
 	if (!avl_is_empty(expansions)) {
 		uint64_t ntxgs = avl_numnodes(expansions);
 		uint64_t *txg_list = kmem_alloc(sizeof (uint64_t) * ntxgs,
 		    KM_SLEEP);
 		uint64_t *slot = txg_list;
 		reflow_node_t *node = avl_first(expansions);
 
 		/* Flatten the tree, in AVL order, into the array. */
 		while (node != NULL) {
 			*slot++ = node->re_txg;
 			node = AVL_NEXT(expansions, node);
 		}
 
 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 		    txg_list, ntxgs);
 
 		kmem_free(txg_list, sizeof (uint64_t) * ntxgs);
 	}
 	mutex_exit(&raidz->vd_expand_lock);
 }
 
 /* Return the parity count stored in this vdev's RAIDZ private state. */
 static uint64_t
 vdev_raidz_nparity(vdev_t *vd)
 {
 	return (((vdev_raidz_t *)vd->vdev_tsd)->vd_nparity);
 }
 
 /* Return the number of children of this vdev. */
 static uint64_t
 vdev_raidz_ndisks(vdev_t *vd)
 {
 	uint64_t ndisks = vd->vdev_children;
 
 	return (ndisks);
 }
 
 vdev_ops_t vdev_raidz_ops = {
 	.vdev_op_init = vdev_raidz_init,
 	.vdev_op_fini = vdev_raidz_fini,
 	.vdev_op_open = vdev_raidz_open,
 	.vdev_op_close = vdev_raidz_close,
 	.vdev_op_asize = vdev_raidz_asize,
 	.vdev_op_min_asize = vdev_raidz_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_raidz_io_start,
 	.vdev_op_io_done = vdev_raidz_io_done,
 	.vdev_op_state_change = vdev_raidz_state_change,
 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_raidz_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_raidz_config_generate,
 	.vdev_op_nparity = vdev_raidz_nparity,
 	.vdev_op_ndisks = vdev_raidz_ndisks,
 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
 	"For testing, pause RAIDZ expansion after reflowing this many bytes");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
 	"Max amount of concurrent i/o for RAIDZ expansion");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
 	"For expanded RAIDZ, aggregate reads that have more rows than this");
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
 	"completes");
-/* END CSTYLED */
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index 1249657f9d72..08c85a874803 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1,2568 +1,2566 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_dir.h>
 #include <sys/arc.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/trace_zfs.h>
 
 /*
  * This file contains the necessary logic to remove vdevs from a
  * storage pool.  Currently, the only devices that can be removed
  * are log, cache, and spare devices; and top level vdevs from a pool
  * w/o raidz or mirrors.  (Note that members of a mirror can be removed
  * by the detach operation.)
  *
  * Log vdevs are removed by evacuating them and then turning the vdev
  * into a hole vdev while holding spa config locks.
  *
  * Top level vdevs are removed and converted into an indirect vdev via
  * a multi-step process:
  *
  *  - Disable allocations from this device (spa_vdev_remove_top).
  *
  *  - From a new thread (spa_vdev_remove_thread), copy data from
  *    the removing vdev to a different vdev.  The copy happens in open
  *    context (spa_vdev_copy_impl) and issues a sync task
  *    (vdev_mapping_sync) so the sync thread can update the partial
  *    indirect mappings in core and on disk.
  *
  *  - If a free happens during a removal, it is freed from the
  *    removing vdev, and if it has already been copied, from the new
  *    location as well (free_from_removing_vdev).
  *
  *  - After the removal is completed, the copy thread converts the vdev
  *    into an indirect vdev (vdev_remove_complete) before instructing
  *    the sync thread to destroy the space maps and finish the removal
  *    (spa_finish_removal).
  */
 
 /*
  * Shared state for the copy i/os of a device removal.
  *
  * NOTE(review): the code that reads and writes these fields is outside
  * this view; the per-field notes are inferred from the field names only
  * and should be confirmed against the users.
  */
 typedef struct vdev_copy_arg {
 	/* presumably the metaslab currently being copied -- confirm */
 	metaslab_t	*vca_msp;
 	/* presumably bytes of copy i/o still in flight -- confirm */
 	uint64_t	vca_outstanding_bytes;
 	/* presumably bytes whose read / write failed -- confirm */
 	uint64_t	vca_read_error_bytes;
 	uint64_t	vca_write_error_bytes;
 	/* presumably vca_cv is waited on under vca_lock -- confirm */
 	kcondvar_t	vca_cv;
 	kmutex_t	vca_lock;
 } vdev_copy_arg_t;
 
 /*
  * The maximum amount of memory we can use for outstanding i/o while
  * doing a device removal.  This determines how much i/o we can have
  * in flight concurrently.
  */
 static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
 
 /*
  * The largest contiguous segment that we will attempt to allocate when
  * removing a device.  This can be no larger than SPA_MAXBLOCKSIZE.  If
  * there is a performance problem with attempting to allocate large blocks,
  * consider decreasing this.
  *
  * See also the accessor function spa_remove_max_segment().
  */
 uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
 
 /*
  * Ignore hard I/O errors during device removal.  When set, if a device
  * encounters a hard I/O error during the removal process, the removal
  * will not be cancelled.  This can result in a normally recoverable
  * block becoming permanently damaged and is not recommended.
  */
 static int zfs_removal_ignore_errors = 0;
 
 /*
  * Allow a remap segment to span free chunks of at most this size. The main
  * impact of a larger span is that we will read and write larger, more
  * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
  * for iops.  The value here was chosen to align with
  * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
  * reads (but there's no reason it has to be the same).
  *
  * Additionally, a higher span will have the following relatively minor
  * effects:
  *  - the mapping will be smaller, since one entry can cover more allocated
  *    segments
  *  - more of the fragmentation in the removing device will be preserved
  *  - we'll do larger allocations, which may fail and fall back on smaller
  *    allocations
  */
 uint_t vdev_removal_max_span = 32 * 1024;
 
 /*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
 int zfs_removal_suspend_progress = 0;
 
 #define	VDEV_REMOVAL_ZAP_OBJS	"lzap"
 
 static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg);
 static int spa_vdev_remove_cancel_impl(spa_t *spa);
 
 /*
  * Store the in-core spa_removing_phys under the DMU_POOL_REMOVING entry
  * of the pool directory object, as an array of uint64_t words.  The
  * update must succeed (VERIFY0).
  */
 static void
 spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
 	uint64_t nwords = sizeof (spa->spa_removing_phys) / sizeof (uint64_t);
 
 	VERIFY0(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING,
 	    sizeof (uint64_t), nwords, &spa->spa_removing_phys, tx));
 }
 
 /*
  * Search the first "count" nvlists of nvpp for the one whose
  * ZPOOL_CONFIG_GUID equals target_guid.  Returns that nvlist, or NULL
  * when none matches.
  */
 static nvlist_t *
 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
 {
 	nvlist_t *match = NULL;
 
 	for (int idx = 0; match == NULL && idx < count; idx++) {
 		nvlist_t *candidate = nvpp[idx];
 
 		if (fnvlist_lookup_uint64(candidate, ZPOOL_CONFIG_GUID) ==
 		    target_guid)
 			match = candidate;
 	}
 
 	return (match);
 }
 
 /*
  * Turn allocations back on for a vdev that is currently marked
  * vdev_noalloc: activate its metaslab groups, subtract its space from
  * spa_nonallocating_dspace and clear the vdev_noalloc flag.
  */
 static void
 vdev_activate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t space;
 
 	/* Use the deflated figure when the pool accounts deflated space. */
 	if (spa_deflate(spa))
 		space = vd->vdev_stat.vs_dspace;
 	else
 		space = vd->vdev_stat.vs_space;
 
 	ASSERT(!vd->vdev_islog);
 	ASSERT(vd->vdev_noalloc);
 
 	metaslab_group_activate(vd->vdev_mg);
 	metaslab_group_activate(vd->vdev_log_mg);
 
 	ASSERT3U(spa->spa_nonallocating_dspace, >=, space);
 	spa->spa_nonallocating_dspace -= space;
 	vd->vdev_noalloc = B_FALSE;
 }
 
 static int
 vdev_passivate(vdev_t *vd, uint64_t *txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 
 	ASSERT(!vd->vdev_noalloc);
 
 	vdev_t *rvd = spa->spa_root_vdev;
 	metaslab_group_t *mg = vd->vdev_mg;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	if (mg->mg_class == normal) {
 		/*
 		 * We must check that this is not the only allocating device in
 		 * the pool before passivating, otherwise we will not be able
 		 * to make progress because we can't allocate from any vdevs.
 		 */
 		boolean_t last = B_TRUE;
 		for (uint64_t id = 0; id < rvd->vdev_children; id++) {
 			vdev_t *cvd = rvd->vdev_child[id];
 
 			if (cvd == vd ||
 			    cvd->vdev_ops == &vdev_indirect_ops)
 				continue;
 
 			metaslab_class_t *mc = cvd->vdev_mg->mg_class;
 			if (mc != normal)
 				continue;
 
 			if (!cvd->vdev_noalloc) {
 				last = B_FALSE;
 				break;
 			}
 		}
 		if (last)
 			return (SET_ERROR(EINVAL));
 	}
 
 	metaslab_group_passivate(mg);
 	ASSERT(!vd->vdev_islog);
 	metaslab_group_passivate(vd->vdev_log_mg);
 
 	/*
 	 * Wait for the youngest allocations and frees to sync,
 	 * and then wait for the deferral of those frees to finish.
 	 */
 	spa_vdev_config_exit(spa, NULL,
 	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 	/*
 	 * We must ensure that no "stubby" log blocks are allocated
 	 * on the device to be removed.  These blocks could be
 	 * written at any time, including while we are in the middle
 	 * of copying them.
 	 */
 	error = spa_reset_logs(spa);
 
 	*txg = spa_vdev_config_enter(spa);
 
 	if (error != 0) {
 		metaslab_group_activate(mg);
 		ASSERT(!vd->vdev_islog);
 		if (vd->vdev_log_mg != NULL)
 			metaslab_group_activate(vd->vdev_log_mg);
 		return (error);
 	}
 
 	spa->spa_nonallocating_dspace += spa_deflate(spa) ?
 	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 	vd->vdev_noalloc = B_TRUE;
 
 	return (0);
 }
 
 /*
  * Disable allocations on the top-level vdev identified by guid.
  *
  * This can take a significant amount of time, so the work is done with
  * the spa_vdev_config_[enter/exit] functions, which let us drop and
  * re-take the spa_config_lock while the namespace lock stays held; the
  * configuration is synced out at each step.
  *
  * Returns ENOENT if no vdev has that guid and ZFS_ERR_VDEV_NOTSUP if the
  * vdev has no metaslab group; otherwise the value handed back by
  * spa_vdev_exit().  A vdev that is already non-allocating is left alone
  * (but is still dirtied).
  */
 int
 spa_vdev_noalloc(spa_t *spa, uint64_t guid)
 {
 	int error = 0;
 
 	ASSERT(!MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_writeable(spa));
 
 	uint64_t txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL) {
 		error = SET_ERROR(ENOENT);
 	} else if (vd->vdev_mg == NULL) {
 		error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
 	} else if (!vd->vdev_noalloc) {
 		error = vdev_passivate(vd, &txg);
 	}
 
 	if (error == 0) {
 		vdev_dirty_leaves(vd, VDD_DTL, txg);
 		vdev_config_dirty(vd);
 	}
 
 	return (spa_vdev_exit(spa, NULL, txg, error));
 }
 
 /*
  * Re-enable allocations on the top-level vdev identified by guid, unless
  * that vdev is being removed.
  *
  * Returns ENOENT if no vdev has that guid, ZFS_ERR_VDEV_NOTSUP if the
  * vdev has no metaslab group, and 0 otherwise.  The value returned by
  * spa_vdev_exit() is deliberately ignored.
  */
 int
 spa_vdev_alloc(spa_t *spa, uint64_t guid)
 {
 	int error = 0;
 
 	ASSERT(!MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_writeable(spa));
 
 	uint64_t txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL) {
 		error = SET_ERROR(ENOENT);
 	} else if (vd->vdev_mg == NULL) {
 		error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP);
 	} else if (!vd->vdev_removing) {
 		vdev_activate(vd);
 	}
 
 	if (error == 0) {
 		vdev_dirty_leaves(vd, VDD_DTL, txg);
 		vdev_config_dirty(vd);
 	}
 
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	return (error);
 }
 
 /*
  * Replace the nvlist array "name" in config with a copy of dev[] that
  * leaves out dev_to_remove.
  *
  * dev_to_remove must be one of the "count" entries of dev[]: the
  * replacement array is sized for exactly count - 1 survivors (and is
  * not allocated at all when count <= 1).  The entries are duplicated
  * for the insertion and the duplicates are freed before returning.
  */
 static void
 spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev,
     int count, nvlist_t *dev_to_remove)
 {
 	nvlist_t **newdev = NULL;
 
 	if (count > 1)
 		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
 
 	for (int i = 0, j = 0; i < count; i++) {
 		if (dev[i] == dev_to_remove)
 			continue;
 		/*
 		 * Only count - 1 slots exist.  If dev_to_remove was not
 		 * found in dev[], fail loudly here instead of writing
 		 * past the end of newdev (or through NULL).
 		 */
 		VERIFY3S(j, <, count - 1);
 		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
 	}
 
 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
 	fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev,
 	    count - 1);
 
 	for (int i = 0; i < count - 1; i++)
 		nvlist_free(newdev[i]);
 
 	if (count > 1)
 		kmem_free(newdev, (count - 1) * sizeof (void *));
 }
 
 /*
  * Allocate and initialize the in-core removal state for vd: its lock and
  * condvar, the allocated-segments range tree, and one frees range tree
  * plus one new-segments list per TXG_SIZE slot.
  */
 static spa_vdev_removal_t *
 spa_vdev_removal_create(vdev_t *vd)
 {
 	spa_vdev_removal_t *removal = kmem_zalloc(sizeof (*removal), KM_SLEEP);
 
 	mutex_init(&removal->svr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&removal->svr_cv, NULL, CV_DEFAULT, NULL);
 	removal->svr_allocd_segs =
 	    range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	removal->svr_vdev_id = vd->vdev_id;
 
 	int slot = 0;
 	while (slot < TXG_SIZE) {
 		removal->svr_frees[slot] =
 		    range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 		list_create(&removal->svr_new_segments[slot],
 		    sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
 		slot++;
 	}
 
 	return (removal);
 }
 
 /*
  * Free the in-core removal state.  Every per-txg slot is expected to be
  * idle: no bytes done and no offset waiting to be synced.
  */
 void
 spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
 {
 	int slot = 0;
 
 	while (slot < TXG_SIZE) {
 		ASSERT0(svr->svr_bytes_done[slot]);
 		ASSERT0(svr->svr_max_offset_to_sync[slot]);
 		range_tree_destroy(svr->svr_frees[slot]);
 		list_destroy(&svr->svr_new_segments[slot]);
 		slot++;
 	}
 
 	range_tree_destroy(svr->svr_allocd_segs);
 	mutex_destroy(&svr->svr_lock);
 	cv_destroy(&svr->svr_cv);
 	kmem_free(svr, sizeof (*svr));
 }
 
 /*
  * This is called as a synctask in the txg in which we will mark this vdev
  * as removing (in the config stored in the MOS).
  *
  * It begins the evacuation of a toplevel vdev by:
  * - initializing the spa_removing_phys which tracks this removal
  * - computing the amount of space to remove for accounting purposes
  * - dirtying all dbufs in the spa_config_object
  * - creating the spa_vdev_removal
  * - starting the spa_vdev_remove_thread
  *
  * arg is the id of the top-level vdev, passed by value inside the
  * pointer; tx is the transaction of the syncing txg.
  *
  * NOTE(review): the comments here speak of the "spa_config_object", but
  * the dirtying loop below walks DMU_POOL_DIRECTORY_OBJECT -- confirm
  * which name is intended.
  */
 static void
 vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
 {
 	/* The vdev id travels in the pointer value itself. */
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
 	spa_vdev_removal_t *svr = NULL;
 	/* Only referenced from an ASSERT below, hence __maybe_unused. */
 	uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
 
 	ASSERT0(vdev_get_nparity(vd));
 	svr = spa_vdev_removal_create(vd);
 
 	ASSERT(vd->vdev_removing);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
 		/*
 		 * By activating the OBSOLETE_COUNTS feature, we prevent
 		 * the pool from being downgraded and ensure that the
 		 * refcounts are precise.
 		 */
 		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 		uint64_t one = 1;
 		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
 		    &one, tx));
 		boolean_t are_precise __maybe_unused;
 		ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 		ASSERT3B(are_precise, ==, B_TRUE);
 	}
 
 	/*
 	 * Allocate and open the indirect mapping and births objects for
 	 * this vdev, then reset the on-disk removal state for a new run.
 	 */
 	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
 	vd->vdev_indirect_mapping =
 	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
 	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
 	vd->vdev_indirect_births =
 	    vdev_indirect_births_open(mos, vic->vic_births_object);
 	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
 	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
 	spa->spa_removing_phys.sr_end_time = 0;
 	spa->spa_removing_phys.sr_state = DSS_SCANNING;
 	spa->spa_removing_phys.sr_to_copy = 0;
 	spa->spa_removing_phys.sr_copied = 0;
 
 	/*
 	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
 	 * there may be space in the defer tree, which is free, but still
 	 * counted in vs_alloc.
 	 */
 	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
 		metaslab_t *ms = vd->vdev_ms[i];
 		/* Metaslabs without a space map contribute nothing. */
 		if (ms->ms_sm == NULL)
 			continue;
 
 		spa->spa_removing_phys.sr_to_copy +=
 		    metaslab_allocated_space(ms);
 
 		/*
 		 * Space which we are freeing this txg does not need to
 		 * be copied.
 		 */
 		spa->spa_removing_phys.sr_to_copy -=
 		    range_tree_space(ms->ms_freeing);
 
 		ASSERT0(range_tree_space(ms->ms_freed));
 		for (int t = 0; t < TXG_SIZE; t++)
 			ASSERT0(range_tree_space(ms->ms_allocating[t]));
 	}
 
 	/*
 	 * Sync tasks are called before metaslab_sync(), so there should
 	 * be no already-synced metaslabs in the TXG_CLEAN list.
 	 */
 	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
 
 	spa_sync_removing_state(spa, tx);
 
 	/*
 	 * All blocks that we need to read the most recent mapping must be
 	 * stored on concrete vdevs.  Therefore, we must dirty anything that
 	 * is read before spa_remove_init().  Specifically, the
 	 * spa_config_object.  (Note that although we already modified the
 	 * spa_config_object in spa_sync_removing_state, that may not have
 	 * modified all blocks of the object.)
 	 */
 	dmu_object_info_t doi;
 	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
 	/* Step through the object one dbuf at a time, dirtying each. */
 	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
 		dmu_buf_t *dbuf;
 		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
 		    offset, FTAG, &dbuf, 0));
 		dmu_buf_will_dirty(dbuf, tx);
 		offset += dbuf->db_size;
 		dmu_buf_rele(dbuf, FTAG);
 	}
 
 	/*
 	 * Now that we've allocated the im_object, dirty the vdev to ensure
 	 * that the object gets written to the config on disk.
 	 */
 	vdev_config_dirty(vd);
 
 	zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu "
 	    "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd,
 	    (u_longlong_t)dmu_tx_get_txg(tx),
 	    (u_longlong_t)vic->vic_mapping_object);
 
 	spa_history_log_internal(spa, "vdev remove started", tx,
 	    "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id,
 	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 	/*
 	 * Setting spa_vdev_removal causes subsequent frees to call
 	 * free_from_removing_vdev().  Note that we don't need any locking
 	 * because we are the sync thread, and metaslab_free_impl() is only
 	 * called from syncing context (potentially from a zio taskq thread,
 	 * but in any case only when there are outstanding free i/os, which
 	 * there are not).
 	 */
 	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
 	spa->spa_vdev_removal = svr;
 	svr->svr_thread = thread_create(NULL, 0,
 	    spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
 }
 
 /*
  * When we are opening a pool, we must read the mapping for each
  * indirect vdev in order from most recently removed to least
  * recently removed.  We do this because the blocks for the mapping
  * of older indirect vdevs may be stored on more recently removed vdevs.
  * In order to read each indirect mapping object, we must have
  * initialized all more recently removed vdevs.
  *
  * Returns 0 on success (including when the pool has no DMU_POOL_REMOVING
  * entry), the zap_lookup() error if that entry cannot be read, or EINVAL
  * if the stored state names a top-level vdev that does not exist.
  */
 int
 spa_remove_init(spa_t *spa)
 {
 	int error;
 
 	error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_REMOVING, sizeof (uint64_t),
 	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
 	    &spa->spa_removing_phys);
 
 	if (error == ENOENT) {
 		/* No removal has ever been recorded for this pool. */
 		spa->spa_removing_phys.sr_state = DSS_NONE;
 		spa->spa_removing_phys.sr_removing_vdev = -1;
 		spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
 		spa->spa_indirect_vdevs_loaded = B_TRUE;
 		return (0);
 	} else if (error != 0) {
 		return (error);
 	}
 
 	if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
 		/*
 		 * We are currently removing a vdev.  Create and
 		 * initialize a spa_vdev_removal_t from the bonus
 		 * buffer of the removing vdevs vdev_im_object, and
 		 * initialize its partial mapping.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 		vdev_t *vd = vdev_lookup_top(spa,
 		    spa->spa_removing_phys.sr_removing_vdev);
 
 		if (vd == NULL) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		ASSERT(vdev_is_concrete(vd));
 		spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
 		ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
 		ASSERT(vd->vdev_removing);
 
 		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
 		    spa->spa_meta_objset, vic->vic_mapping_object);
 		vd->vdev_indirect_births = vdev_indirect_births_open(
 		    spa->spa_meta_objset, vic->vic_births_object);
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		spa->spa_vdev_removal = svr;
 	}
 
 	/*
 	 * Walk the chain of previously removed (indirect) vdevs, newest
 	 * first, opening the mapping and births objects of each.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	uint64_t indirect_vdev_id =
 	    spa->spa_removing_phys.sr_prev_indirect_vdev;
 	while (indirect_vdev_id != UINT64_MAX) {
 		vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
 
 		/*
 		 * As for the removing vdev above, an id that does not
 		 * resolve to a top-level vdev is reported as EINVAL
 		 * rather than dereferenced.
 		 */
 		if (vd == NULL) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 
 		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
 		    spa->spa_meta_objset, vic->vic_mapping_object);
 		vd->vdev_indirect_births = vdev_indirect_births_open(
 		    spa->spa_meta_objset, vic->vic_births_object);
 
 		indirect_vdev_id = vic->vic_prev_indirect_vdev;
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	/*
 	 * Now that we've loaded all the indirect mappings, we can allow
 	 * reads from other blocks (e.g. via predictive prefetch).
 	 */
 	spa->spa_indirect_vdevs_loaded = B_TRUE;
 	return (0);
 }
 
 /*
  * Start the removal thread for a removal that is recorded in
  * spa_vdev_removal but has no thread running, provided the pool is
  * writeable.  Does nothing otherwise.
  */
 void
 spa_restart_removal(spa_t *spa)
 {
 	spa_vdev_removal_t *removal = spa->spa_vdev_removal;
 
 	/*
 	 * Bail out when there is no removal, when a thread already
 	 * exists, or when the pool is not writeable.
 	 *
 	 * Normally no removal thread is running when we get here.  The
 	 * exception is spa_import(), where this function is called twice
 	 * [once from spa_import_impl() and spa_async_resume()]; when
 	 * importing a pool with an ongoing removal we must not spawn a
 	 * second thread.
 	 */
 	if (removal == NULL || removal->svr_thread != NULL ||
 	    !spa_writeable(spa))
 		return;
 
 	zfs_dbgmsg("restarting removal of %llu",
 	    (u_longlong_t)removal->svr_vdev_id);
 	removal->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread,
 	    spa, 0, &p0, TS_RUN, minclsyspri);
 }
 
 /*
  * Process freeing from a device which is in the middle of being removed.
  * We must handle this carefully so that we attempt to copy freed data,
  * and we correctly free already-copied data.
  *
  * The freed range [offset, offset + size) on vd is split into up to three
  * parts, handled in order of increasing offset:
  *  - the part below the mapping's synced max offset, which is marked
  *    obsolete and freed from its new location after svr_lock is dropped;
  *  - the part covered by an in-flight copy, which is recorded in the
  *    svr_frees tree of the txg in which that copy will sync;
  *  - the remainder, which is cleared from svr_allocd_segs so that it is
  *    never copied.
  */
 void
 free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t txg = spa_syncing_txg(spa);
 	/* Largest boundary seen so far; used only to assert monotonicity. */
 	uint64_t max_offset_yet = 0;
 
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
 	    vdev_indirect_mapping_object(vim));
 	ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
 
 	mutex_enter(&svr->svr_lock);
 
 	/*
 	 * Remove the segment from the removing vdev's spacemap.  This
 	 * ensures that we will not attempt to copy this space (if the
 	 * removal thread has not yet visited it), and also ensures
 	 * that we know what is actually allocated on the new vdevs
 	 * (needed if we cancel the removal).
 	 *
 	 * Note: we must do the metaslab_free_concrete() with the svr_lock
 	 * held, so that the remove_thread can not load this metaslab and then
 	 * visit this offset between the time that we metaslab_free_concrete()
 	 * and when we check to see if it has been visited.
 	 *
 	 * Note: The checkpoint flag is set to false as having/taking
 	 * a checkpoint and removing a device can't happen at the same
 	 * time.
 	 */
 	ASSERT(!spa_has_checkpoint(spa));
 	metaslab_free_concrete(vd, offset, size, B_FALSE);
 
 	uint64_t synced_size = 0;
 	uint64_t synced_offset = 0;
 	uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
 	if (offset < max_offset_synced) {
 		/*
 		 * The mapping for this offset is already on disk.
 		 * Free from the new location.
 		 *
 		 * The boundary used here is the value returned by
 		 * vdev_indirect_mapping_max_offset(vim).
 		 * NOTE(review): this comment used to say that
 		 * svr_max_synced_offset is used in preference to
 		 * vim_max_offset; this function reads neither name --
 		 * confirm the intended invariant.
 		 *
 		 * This block may be split between a synced entry and an
 		 * in-flight or unvisited entry.  Only process the synced
 		 * portion of it here.
 		 */
 		synced_size = MIN(size, max_offset_synced - offset);
 		synced_offset = offset;
 
 		ASSERT3U(max_offset_yet, <=, max_offset_synced);
 		max_offset_yet = max_offset_synced;
 
 		DTRACE_PROBE3(remove__free__synced,
 		    spa_t *, spa,
 		    uint64_t, offset,
 		    uint64_t, synced_size);
 
 		/* Advance past the synced portion. */
 		size -= synced_size;
 		offset += synced_size;
 	}
 
 	/*
 	 * Look at all in-flight txgs starting from the currently syncing one
 	 * and see if a section of this free is being copied. By starting from
 	 * this txg and iterating forward, we might find that this region
 	 * was copied in two different txgs and handle it appropriately.
 	 */
 	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
 		int txgoff = (txg + i) & TXG_MASK;
 		if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
 			/*
 			 * The mapping for this offset is in flight, and
 			 * will be synced in txg+i.
 			 */
 			uint64_t inflight_size = MIN(size,
 			    svr->svr_max_offset_to_sync[txgoff] - offset);
 
 			DTRACE_PROBE4(remove__free__inflight,
 			    spa_t *, spa,
 			    uint64_t, offset,
 			    uint64_t, inflight_size,
 			    uint64_t, txg + i);
 
 			/*
 			 * We copy data in order of increasing offset.
 			 * Therefore the max_offset_to_sync[] must increase
 			 * (or be zero, indicating that nothing is being
 			 * copied in that txg).
 			 */
 			if (svr->svr_max_offset_to_sync[txgoff] != 0) {
 				ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
 				    >=, max_offset_yet);
 				max_offset_yet =
 				    svr->svr_max_offset_to_sync[txgoff];
 			}
 
 			/*
 			 * We've already committed to copying this segment:
 			 * we have allocated space elsewhere in the pool for
 			 * it and have an IO outstanding to copy the data. We
 			 * cannot free the space before the copy has
 			 * completed, or else the copy IO might overwrite any
 			 * new data. To free that space, we record the
 			 * segment in the appropriate svr_frees tree and free
 			 * the mapped space later, in the txg where we have
 			 * completed the copy and synced the mapping (see
 			 * vdev_mapping_sync).
 			 */
 			range_tree_add(svr->svr_frees[txgoff],
 			    offset, inflight_size);
 			size -= inflight_size;
 			offset += inflight_size;
 
 			/*
 			 * This space is already accounted for as being
 			 * done, because it is being copied in txg+i.
 			 * However, if i!=0, then it is being copied in
 			 * a future txg.  If we crash after this txg
 			 * syncs but before txg+i syncs, then the space
 			 * will be free.  Therefore we must account
 			 * for the space being done in *this* txg
 			 * (when it is freed) rather than the future txg
 			 * (when it will be copied).
 			 */
 			ASSERT3U(svr->svr_bytes_done[txgoff], >=,
 			    inflight_size);
 			svr->svr_bytes_done[txgoff] -= inflight_size;
 			svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
 		}
 	}
 	ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
 
 	if (size > 0) {
 		/*
 		 * The copy thread has not yet visited this offset.  Ensure
 		 * that it doesn't.
 		 */
 
 		DTRACE_PROBE3(remove__free__unvisited,
 		    spa_t *, spa,
 		    uint64_t, offset,
 		    uint64_t, size);
 
 		if (svr->svr_allocd_segs != NULL)
 			range_tree_clear(svr->svr_allocd_segs, offset, size);
 
 		/*
 		 * Since we now do not need to copy this data, for
 		 * accounting purposes we have done our job and can count
 		 * it as completed.
 		 */
 		svr->svr_bytes_done[txg & TXG_MASK] += size;
 	}
 	mutex_exit(&svr->svr_lock);
 
 	/*
 	 * Now that we have dropped svr_lock, process the synced portion
 	 * of this free.
 	 */
 	if (synced_size > 0) {
 		vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
 
 		/*
 		 * Note: this can only be called from syncing context,
 		 * and the vdev_indirect_mapping is only changed from the
 		 * sync thread, so we don't need svr_lock while doing
 		 * metaslab_free_impl_cb.
 		 */
 		boolean_t checkpoint = B_FALSE;
 		vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
 		    metaslab_free_impl_cb, &checkpoint);
 	}
 }
 
 /*
  * End the active removal with the given final state (DSS_FINISHED or
  * DSS_CANCELED): suspend the removal thread, link a finished vdev into
  * the chain of indirect vdevs, record the state and end time, free the
  * in-core removal state, and sync the updated spa_removing_phys.
  */
 static void
 spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	spa_removing_phys_t *srp = &spa->spa_removing_phys;
 
 	ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
 
 	/* The removal thread must be done before svr is freed below. */
 	spa_vdev_remove_suspend(spa);
 
 	ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
 
 	if (state == DSS_FINISHED) {
 		vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
 		if (srp->sr_prev_indirect_vdev != -1) {
 			vdev_t *pvd;
 			pvd = vdev_lookup_top(spa,
 			    srp->sr_prev_indirect_vdev);
 			ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
 		}
 
 		/* Push this vdev onto the head of the indirect chain. */
 		vd->vdev_indirect_config.vic_prev_indirect_vdev =
 		    srp->sr_prev_indirect_vdev;
 		srp->sr_prev_indirect_vdev = vd->vdev_id;
 	}
 	srp->sr_state = state;
 	srp->sr_end_time = gethrestime_sec();
 
 	spa->spa_vdev_removal = NULL;
 	spa_vdev_removal_destroy(svr);
 
 	spa_sync_removing_state(spa, tx);
 	spa_notify_waiters(spa);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Range tree callback (see vdev_mapping_sync()): marks the given range of
  * the vdev passed as "arg" obsolete, then runs metaslab_free_impl_cb over
  * the range's remapping.
  */
 static void
 free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
 {
 	boolean_t is_checkpoint = B_FALSE;
 	vdev_t *removing_vd = arg;
 
 	vdev_indirect_mark_obsolete(removing_vd, offset, size);
 	vdev_indirect_ops.vdev_op_remap(removing_vd, offset, size,
 	    metaslab_free_impl_cb, &is_checkpoint);
 }
 
 /*
  * Sync task run on behalf of the removal thread in every txg in which it
  * made progress.  Appends this txg's new mapping entries to the indirect
  * mapping (on disk and in memory), records the new maximum mapped offset
  * in the births object, and processes frees that arrived while those
  * entries were still in flight.
  */
 static void
 vdev_mapping_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	uint64_t slot = txg & TXG_MASK;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	vdev_indirect_mapping_add_entries(vim, &svr->svr_new_segments[slot],
 	    tx);
 
 	uint64_t mapped_end = vdev_indirect_mapping_max_offset(vim);
 	vdev_indirect_births_add_entry(vd->vdev_indirect_births, mapped_end,
 	    txg, tx);
 
 	/*
 	 * Anything freed while its mapping entry was in flight has been
 	 * copied already; free the copy now.
 	 */
 	mutex_enter(&svr->svr_lock);
 	range_tree_vacate(svr->svr_frees[slot], free_mapped_segment_cb, vd);
 	ASSERT3U(svr->svr_max_offset_to_sync[slot], >=,
 	    vdev_indirect_mapping_max_offset(vim));
 	svr->svr_max_offset_to_sync[slot] = 0;
 	mutex_exit(&svr->svr_lock);
 
 	spa_sync_removing_state(spa, tx);
 }
 
 /*
  * Per-segment state attached to the "null" zio created in
  * spa_vdev_copy_segment() and consumed by spa_vdev_copy_segment_done()
  * (and, through it, unalloc_seg()).
  */
 typedef struct vdev_copy_segment_arg {
 	/* pool handed to zio_free() when releasing unneeded ranges */
 	spa_t *vcsa_spa;
 	/* destination DVA of the copy; points into the mapping entry */
 	dva_t *vcsa_dest_dva;
 	/* txg handed to zio_free() */
 	uint64_t vcsa_txg;
 	/*
 	 * Ranges that were copied but are not actually needed, with
 	 * offsets relative to the start of the copied segment.
 	 */
 	range_tree_t *vcsa_obsolete_segs;
 } vdev_copy_segment_arg_t;
 
 /*
  * Range tree callback used by spa_vdev_copy_segment_done(): frees "size"
  * bytes at offset "start" (relative to the destination DVA of the copy)
  * by building a minimal block pointer for that range and handing it to
  * zio_free().
  */
 static void
 unalloc_seg(void *arg, uint64_t start, uint64_t size)
 {
 	vdev_copy_segment_arg_t *copy_arg = arg;
 	blkptr_t bp = { { { {0} } } };
 	dva_t *dva = &bp.blk_dva[0];
 
 	/* Describe an uncompressed, unchecksummed, level-0 block. */
 	BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
 	BP_SET_LSIZE(&bp, size);
 	BP_SET_PSIZE(&bp, size);
 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(&bp, DMU_OT_NONE);
 	BP_SET_LEVEL(&bp, 0);
 	BP_SET_DEDUP(&bp, 0);
 	BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
 
 	/* Same vdev as the destination, offset shifted by "start". */
 	DVA_SET_VDEV(dva, DVA_GET_VDEV(copy_arg->vcsa_dest_dva));
 	DVA_SET_OFFSET(dva,
 	    DVA_GET_OFFSET(copy_arg->vcsa_dest_dva) + start);
 	DVA_SET_ASIZE(dva, size);
 
 	zio_free(copy_arg->vcsa_spa, copy_arg->vcsa_txg, &bp);
 }
 
 /*
  * Done callback of the "null" zio created by spa_vdev_copy_segment():
  * every read and write belonging to that call has completed.
  */
 static void
 spa_vdev_copy_segment_done(zio_t *zio)
 {
 	vdev_copy_segment_arg_t *copy_arg = zio->io_private;
 	range_tree_t *obsolete = copy_arg->vcsa_obsolete_segs;
 
 	/* Release the destination space of every range that was not needed. */
 	range_tree_vacate(obsolete, unalloc_seg, copy_arg);
 	range_tree_destroy(obsolete);
 	kmem_free(copy_arg, sizeof (*copy_arg));
 
 	/* Drop the SCL_STATE hold taken in spa_vdev_copy_segment(). */
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 }
 
 /*
  * Done callback for the write to the new location: releases the data
  * buffer, updates the copy accounting and wakes a waiter on vca_cv.
  */
 static void
 spa_vdev_copy_segment_write_done(zio_t *zio)
 {
 	vdev_copy_arg_t *copy_state = zio->io_private;
 
 	abd_free(zio->io_abd);
 
 	mutex_enter(&copy_state->vca_lock);
 	if (zio->io_error != 0) {
 		copy_state->vca_write_error_bytes += zio->io_size;
 	}
 	copy_state->vca_outstanding_bytes -= zio->io_size;
 	cv_signal(&copy_state->vca_cv);
 	mutex_exit(&copy_state->vca_lock);
 }
 
 /*
  * Done callback for the read of the old location.  The read's parent is
  * the write to the new location, which is issued from here now that the
  * data is available.
  */
 static void
 spa_vdev_copy_segment_read_done(zio_t *zio)
 {
 	vdev_copy_arg_t *copy_state = zio->io_private;
 
 	if (zio->io_error != 0) {
 		/* Account for the bytes that could not be read. */
 		mutex_enter(&copy_state->vca_lock);
 		copy_state->vca_read_error_bytes += zio->io_size;
 		mutex_exit(&copy_state->vca_lock);
 	}
 
 	zio_t *write_zio = zio_unique_parent(zio);
 	zio_nowait(write_zio);
 }
 
 /*
  * If the old and new vdevs are mirrors, we will read both sides of the old
  * mirror, and write each copy to the corresponding side of the new mirror.
  * If the old and new vdevs have a different number of children, we will do
  * this as best as possible.  Since we aren't verifying checksums, this
  * ensures that as long as there's a good copy of the data, we'll have a
  * good copy after the removal, even if there's silent damage to one side
  * of the mirror. If we're removing a mirror that has some silent damage,
  * we'll have exactly the same damage in the new location (assuming that
  * the new location is also a mirror).
  *
  * We accomplish this by creating a tree of zio_t's, with as many writes as
  * there are "children" of the new vdev (a non-redundant vdev counts as one
  * child, a 2-way mirror has 2 children, etc). Each write has an associated
  * read from a child of the old vdev. Typically there will be the same
  * number of children of the old and new vdevs.  However, if there are more
  * children of the new vdev, some child(ren) of the old vdev will be issued
  * multiple reads.  If there are more children of the old vdev, some copies
  * will be dropped.
  *
  * For example, the tree of zio_t's for a 2-way mirror is:
  *
  *                            null
  *                           /    \
  *    write(new vdev, child 0)      write(new vdev, child 1)
  *      |                             |
  *    read(old vdev, child 0)       read(old vdev, child 1)
  *
  * Child zio's complete before their parents complete.  However, zio's
  * created with zio_vdev_child_io() may be issued before their children
  * complete.  In this case we need to make sure that the children (reads)
  * complete before the parents (writes) are *issued*.  We do this by not
  * calling zio_nowait() on each write until its corresponding read has
  * completed.
  *
  * The spa_config_lock must be held while zio's created by
  * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
  * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
  * zio is needed to release the spa_config_lock after all the reads and
  * writes complete. (Note that we can't grab the config lock for each read,
  * because it is not reentrant - we could deadlock with a thread waiting
  * for a write lock.)
  *
  * spa_vdev_copy_one_child() builds one write/read pair of that tree: a
  * write of "size" bytes to dest_child_vd at dest_offset, created as a
  * child of nzio, fed by a read of "size" bytes at source_offset from
  * source_vd (or from one of its children when source_vd is a mirror).
  * dest_id is the index of dest_child_vd within the destination mirror, or
  * -1 when the destination is not a mirror.  The bytes are added to
  * vca_outstanding_bytes here and subtracted again by the write's done
  * callback.
  */
 static void
 spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
     vdev_t *source_vd, uint64_t source_offset,
     vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
 {
 	ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
 
 	/*
 	 * If the destination child is unwritable then there is no point
 	 * in issuing the source reads which cannot be written.
 	 */
 	if (!vdev_writeable(dest_child_vd))
 		return;
 
 	/* Account for the copy before any zio is created. */
 	mutex_enter(&vca->vca_lock);
 	vca->vca_outstanding_bytes += size;
 	mutex_exit(&vca->vca_lock);
 
 	/* One buffer is shared by the read and the write of this pair. */
 	abd_t *abd = abd_alloc_for_io(size, B_FALSE);
 
 	vdev_t *source_child_vd = NULL;
 	if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
 		/*
 		 * Source and dest are both mirrors.  Copy from the same
 		 * child id as we are copying to (wrapping around if there
 		 * are more dest children than source children).  If the
 		 * preferred source child is unreadable select another.
 		 */
 		for (int i = 0; i < source_vd->vdev_children; i++) {
 			source_child_vd = source_vd->vdev_child[
 			    (dest_id + i) % source_vd->vdev_children];
 			if (vdev_readable(source_child_vd))
 				break;
 		}
 	} else {
 		source_child_vd = source_vd;
 	}
 
 	/*
 	 * There should always be at least one readable source child or
 	 * the pool would be in a suspended state.  Somehow selecting an
 	 * unreadable child would result in IO errors, the removal process
 	 * being cancelled, and the pool reverting to its pre-removal state.
 	 */
 	ASSERT3P(source_child_vd, !=, NULL);
 
 	/*
 	 * The write is created but deliberately not issued here; it is
 	 * issued by spa_vdev_copy_segment_read_done() once the read below
 	 * has completed.
 	 */
 	zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
 	    dest_child_vd, dest_offset, abd, size,
 	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 	    ZIO_FLAG_CANFAIL,
 	    spa_vdev_copy_segment_write_done, vca);
 
 	zio_nowait(zio_vdev_child_io(write_zio, NULL,
 	    source_child_vd, source_offset, abd, size,
 	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 	    ZIO_FLAG_CANFAIL,
 	    spa_vdev_copy_segment_read_done, vca));
 }
 
 /*
  * Allocate a new location for this segment, and create the zio_t's to
  * read from the old location and write to the new location.
  *
  * vd is the vdev being removed.  "segs" holds the allocated ranges still
  * to be copied; the portion handled by this call (starting at the tree's
  * minimum and spanning at most maxalloc bytes) is cleared from it.  txg
  * is the txg the copy and its mapping entry are charged to, vca carries
  * the copy accounting, and zal is the allocation trace list passed to
  * metaslab_alloc_dva().
  *
  * Returns 0 on success, or the error from metaslab_alloc_dva(), in which
  * case "segs" is left unmodified.
  */
 static int
 spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
     uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
 	metaslab_group_t *mg = vd->vdev_mg;
 	spa_t *spa = vd->vdev_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_indirect_mapping_entry_t *entry;
 	dva_t dst = {{ 0 }};
 	uint64_t start = range_tree_min(segs);
 	ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
 
 	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 	ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
 
 	/* By default copy everything from the first to the last segment. */
 	uint64_t size = range_tree_span(segs);
 	if (range_tree_span(segs) > maxalloc) {
 		/*
 		 * We can't allocate all the segments.  Prefer to end
 		 * the allocation at the end of a segment, thus avoiding
 		 * additional split blocks.
 		 */
 		range_seg_max_t search;
 		zfs_btree_index_t where;
 		rs_set_start(&search, segs, start + maxalloc);
 		rs_set_end(&search, segs, start + maxalloc);
 		(void) zfs_btree_find(&segs->rt_root, &search, &where);
 		range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
 		    &where);
 		if (rs != NULL) {
 			size = rs_get_end(rs, segs) - start;
 		} else {
 			/*
 			 * There are no segments that end before maxalloc.
 			 * I.e. the first segment is larger than maxalloc,
 			 * so we must split it.
 			 */
 			size = maxalloc;
 		}
 	}
 	ASSERT3U(size, <=, maxalloc);
 	ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
 
 	/*
 	 * An allocation class might not have any remaining vdevs or space
 	 */
 	metaslab_class_t *mc = mg->mg_class;
 	if (mc->mc_groups == 0)
 		mc = spa_normal_class(spa);
 	int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg,
 	    METASLAB_DONT_THROTTLE, zal, 0);
 	/* Out of space in a non-normal class: retry in the normal class. */
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
 		    &dst, 0, NULL, txg, METASLAB_DONT_THROTTLE, zal, 0);
 	}
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Determine the ranges that are not actually needed.  Offsets are
 	 * relative to the start of the range to be copied (i.e. relative to the
 	 * local variable "start").
 	 */
 	range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
 	    0, 0);
 
 	/*
 	 * Walk the segments inside [start, start + size); every gap between
 	 * two consecutive segments is copied but not needed.
 	 */
 	zfs_btree_index_t where;
 	range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
 	ASSERT3U(rs_get_start(rs, segs), ==, start);
 	uint64_t prev_seg_end = rs_get_end(rs, segs);
 	while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
 		if (rs_get_start(rs, segs) >= start + size) {
 			break;
 		} else {
 			range_tree_add(obsolete_segs,
 			    prev_seg_end - start,
 			    rs_get_start(rs, segs) - prev_seg_end);
 		}
 		prev_seg_end = rs_get_end(rs, segs);
 	}
 	/* We don't end in the middle of an obsolete range */
 	ASSERT3U(start + size, <=, prev_seg_end);
 
 	/* This part of "segs" is now taken care of. */
 	range_tree_clear(segs, start, size);
 
 	/*
 	 * We can't have any padding of the allocated size, otherwise we will
 	 * misunderstand what's allocated, and the size of the mapping. We
 	 * prevent padding by ensuring that all devices in the pool have the
 	 * same ashift, and the allocation size is a multiple of the ashift.
 	 */
 	VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
 
 	/* Build the mapping entry: source offset "start" -> "dst". */
 	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
 	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
 	entry->vime_mapping.vimep_dst = dst;
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
 		entry->vime_obsolete_count = range_tree_space(obsolete_segs);
 	}
 
 	/* Freed by spa_vdev_copy_segment_done(), along with obsolete_segs. */
 	vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
 	vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
 	vcsa->vcsa_obsolete_segs = obsolete_segs;
 	vcsa->vcsa_spa = spa;
 	vcsa->vcsa_txg = txg;
 
 	/*
 	 * See comment before spa_vdev_copy_one_child().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
 	    spa_vdev_copy_segment_done, vcsa, 0);
 	vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
 	if (dest_vd->vdev_ops == &vdev_mirror_ops) {
 		/* One write (and matching read) per destination child. */
 		for (int i = 0; i < dest_vd->vdev_children; i++) {
 			vdev_t *child = dest_vd->vdev_child[i];
 			spa_vdev_copy_one_child(vca, nzio, vd, start,
 			    child, DVA_GET_OFFSET(&dst), i, size);
 		}
 	} else {
 		/* dest_id of -1 tells the callee the dest is not a mirror. */
 		spa_vdev_copy_one_child(vca, nzio, vd, start,
 		    dest_vd, DVA_GET_OFFSET(&dst), -1, size);
 	}
 	zio_nowait(nzio);
 
 	/* Queue the entry for vdev_mapping_sync() to write out in this txg. */
 	list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
 	ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
 	vdev_dirty(vd, 0, NULL, txg);
 
 	return (0);
 }
 
 /*
  * Sync task that completes the removal of a toplevel vdev.  It runs in
  * the same txg that syncs out the new config (to the MOS object)
  * describing this vdev as indirect.
  */
 static void
 vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	nvpair_t *zap_pair;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		ASSERT0(svr->svr_bytes_done[t]);
 	}
 
 	ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
 	    spa->spa_removing_phys.sr_to_copy);
 
 	vdev_destroy_spacemaps(vd, tx);
 
 	/* Destroy every leaf zap that was queued on the svr, if any. */
 	ASSERT3P(svr->svr_zaplist, !=, NULL);
 	zap_pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
 	while (zap_pair != NULL) {
 		vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(zap_pair),
 		    tx);
 		zap_pair = nvlist_next_nvpair(svr->svr_zaplist, zap_pair);
 	}
 	fnvlist_free(svr->svr_zaplist);
 
 	spa_finish_removal(spa, DSS_FINISHED, tx);
 
 	/* vd->vdev_path is not available here */
 	spa_history_log_internal(spa, "vdev remove completed", tx,
 	    "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id);
 }
 
 /*
  * Recursively add the leaf zap object of vd and of all of its descendants
  * to zlist, keyed by VDEV_REMOVAL_ZAP_OBJS plus the object number.
  */
 static void
 vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
 {
 	ASSERT3P(zlist, !=, NULL);
 	ASSERT0(vdev_get_nparity(vd));
 
 	if (vd->vdev_leaf_zap != 0) {
 		char key[32];
 
 		(void) snprintf(key, sizeof (key), "%s-%llu",
 		    VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
 		fnvlist_add_uint64(zlist, key, vd->vdev_leaf_zap);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_remove_enlist_zaps(vd->vdev_child[c], zlist);
 }
 
 /*
  * Put an indirect vdev in the place of vd, signal that the removal thread
  * is done, and schedule vdev_remove_complete_sync() in the given txg.
  */
 static void
 vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 
 	/*
 	 * Collect the leaf zaps that have to be destroyed.  The list is
 	 * handed to the sync context thread, which does the actual
 	 * unlinking.
 	 */
 	svr->svr_zaplist = fnvlist_alloc();
 	vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
 
 	/* Insert the indirect vdev above vd, then detach vd from it. */
 	vdev_t *indirect_vd = vdev_add_parent(vd, &vdev_indirect_ops);
 	indirect_vd->vdev_removing = 0;
 
 	vd->vdev_leaf_zap = 0;
 
 	vdev_remove_child(indirect_vd, vd);
 	vdev_compact_children(indirect_vd);
 
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/* Wake anyone waiting for the removal thread to finish. */
 	mutex_enter(&svr->svr_lock);
 	svr->svr_thread = NULL;
 	cv_broadcast(&svr->svr_cv);
 	mutex_exit(&svr->svr_lock);
 
 	/* After this, we can not use svr. */
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	dsl_sync_task_nowait(spa->spa_dsl_pool,
 	    vdev_remove_complete_sync, svr, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Complete the removal of a toplevel vdev. This is called in open
  * context by the removal thread after we have copied all vdev's data.
  *
  * The work is done in two spa_vdev_enter()/spa_vdev_exit() sections: the
  * first discards the vdev's allocation state and replaces it with an
  * indirect vdev, the second removes the labels of the replaced vdev and
  * dirties the config.
  */
 static void
 vdev_remove_complete(spa_t *spa)
 {
 	uint64_t txg;
 
 	/*
 	 * Wait for any deferred frees to be synced before we call
 	 * vdev_metaslab_fini()
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	txg = spa_vdev_enter(spa);
 	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 	vdev_rebuild_stop_wait(vd);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 	/*
 	 * Capture the vdev's space now; the stats are asserted to be zero
 	 * after the allocation state is discarded below.
 	 */
 	uint64_t vdev_space = spa_deflate(spa) ?
 	    vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
 
 	/* Created now, posted at the very end (if creation succeeded). */
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
 
 	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)txg);
 
 	ASSERT3U(0, !=, vdev_space);
 	ASSERT3U(spa->spa_nonallocating_dspace, >=, vdev_space);
 
 	/* the vdev is no longer part of the dspace */
 	spa->spa_nonallocating_dspace -= vdev_space;
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		metaslab_group_destroy(vd->vdev_log_mg);
 		vd->vdev_log_mg = NULL;
 	}
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 
 	vdev_remove_replace_with_indirect(vd, txg);
 
 	/*
 	 * We now release the locks, allowing spa_sync to run and finish the
 	 * removal via vdev_remove_complete_sync in syncing context.
 	 *
 	 * Note that we hold on to the vdev_t that has been replaced.  Since
 	 * it isn't part of the vdev tree any longer, it can't be concurrently
 	 * manipulated, even while we don't have the config lock.
 	 */
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	/*
 	 * Top ZAP should have been transferred to the indirect vdev in
 	 * vdev_remove_replace_with_indirect.
 	 */
 	ASSERT0(vd->vdev_top_zap);
 
 	/*
 	 * Leaf ZAP was zeroed in vdev_remove_replace_with_indirect, after
 	 * the leaf zap objects were queued on svr_zaplist for destruction.
 	 */
 	ASSERT0(vd->vdev_leaf_zap);
 
 	/*
 	 * Second section: remove the labels of the replaced vdev.  The
 	 * result of vdev_label_init() is deliberately ignored.
 	 */
 	txg = spa_vdev_enter(spa);
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 	/*
 	 * Request to update the config and the config cachefile.
 	 */
 	vdev_config_dirty(spa->spa_root_vdev);
 	/*
 	 * NOTE(review): vd is handed to spa_vdev_exit() here, presumably so
 	 * that the replaced vdev_t is freed there -- confirm against
 	 * spa_vdev_exit().
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	if (ev != NULL)
 		spa_event_post(ev);
 }
 
 /*
  * Evacuates a segment of size at most max_alloc from the vdev
  * via repeated calls to spa_vdev_copy_segment. If an allocation
  * fails, the pool is probably too fragmented to handle such a
  * large size, so decrease max_alloc so that the caller will not try
  * this size again this txg.
  *
  * vd is the vdev being removed and svr its removal state; the segments
  * to copy are taken from svr->svr_allocd_segs under svr_lock.  vca
  * carries the copy accounting shared with the zio callbacks.  *max_alloc
  * is read to bound the chunk and lowered when an allocation fails with
  * ENOSPC.  tx supplies the txg that the copy is charged to.
  */
 static void
 spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
     uint64_t *max_alloc, dmu_tx_t *tx)
 {
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	mutex_enter(&svr->svr_lock);
 
 	/*
 	 * Determine how big of a chunk to copy.  We can allocate up
 	 * to max_alloc bytes, and we can span up to vdev_removal_max_span
 	 * bytes of unallocated space at a time.  "segs" will track the
 	 * allocated segments that we are copying.  We may also be copying
 	 * free segments (of up to vdev_removal_max_span bytes).
 	 */
 	range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
 	for (;;) {
 		range_tree_t *rt = svr->svr_allocd_segs;
 		range_seg_t *rs = range_tree_first(rt);
 
 		/* Nothing left to copy in svr_allocd_segs. */
 		if (rs == NULL)
 			break;
 
 		uint64_t seg_length;
 
 		if (range_tree_is_empty(segs)) {
 			/* need to truncate the first seg based on max_alloc */
 			seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
 			    rt), *max_alloc);
 		} else {
 			if (rs_get_start(rs, rt) - range_tree_max(segs) >
 			    vdev_removal_max_span) {
 				/*
 				 * Including this segment would cause us to
 				 * copy a larger unneeded chunk than is allowed.
 				 */
 				break;
 			} else if (rs_get_end(rs, rt) - range_tree_min(segs) >
 			    *max_alloc) {
 				/*
 				 * This additional segment would extend past
 				 * max_alloc. Rather than splitting this
 				 * segment, leave it for the next mapping.
 				 */
 				break;
 			} else {
 				seg_length = rs_get_end(rs, rt) -
 				    rs_get_start(rs, rt);
 			}
 		}
 
 		/* Move the (possibly truncated) segment into "segs". */
 		range_tree_add(segs, rs_get_start(rs, rt), seg_length);
 		range_tree_remove(svr->svr_allocd_segs,
 		    rs_get_start(rs, rt), seg_length);
 	}
 
 	if (range_tree_is_empty(segs)) {
 		mutex_exit(&svr->svr_lock);
 		range_tree_destroy(segs);
 		return;
 	}
 
 	/*
 	 * Schedule vdev_mapping_sync() once per txg: a nonzero slot means it
 	 * has already been scheduled (the sync task resets the slot to 0).
 	 */
 	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
 		    svr, tx);
 	}
 
 	svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
 
 	/*
 	 * Note: this is the amount of *allocated* space
 	 * that we are taking care of each txg.
 	 */
 	svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
 
 	mutex_exit(&svr->svr_lock);
 
 	/*
 	 * Copy "segs" piece by piece; spa_vdev_copy_segment() clears what it
 	 * handled from the tree, so the loop ends once the tree is empty.
 	 */
 	zio_alloc_list_t zal;
 	metaslab_trace_init(&zal);
 	uint64_t thismax = SPA_MAXBLOCKSIZE;
 	while (!range_tree_is_empty(segs)) {
 		int error = spa_vdev_copy_segment(vd,
 		    segs, thismax, txg, vca, &zal);
 
 		if (error == ENOSPC) {
 			/*
 			 * Cut our segment in half, and don't try this
 			 * segment size again this txg.  Note that the
 			 * allocation size must be aligned to the highest
 			 * ashift in the pool, so that the allocation will
 			 * not be padded out to a multiple of the ashift,
 			 * which could cause us to think that this mapping
 			 * is larger than we intended.
 			 */
 			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
 			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
 			uint64_t attempted =
 			    MIN(range_tree_span(segs), thismax);
 			thismax = P2ROUNDUP(attempted / 2,
 			    1 << spa->spa_max_ashift);
 			/*
 			 * The minimum-size allocation can not fail.
 			 */
 			ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
 			/* Largest aligned size below the one that failed. */
 			*max_alloc = attempted - (1 << spa->spa_max_ashift);
 		} else {
 			/* Any error other than ENOSPC is unexpected. */
 			ASSERT0(error);
 
 			/*
 			 * We've performed an allocation, so reset the
 			 * alloc trace list.
 			 */
 			metaslab_trace_fini(&zal);
 			metaslab_trace_init(&zal);
 		}
 	}
 	metaslab_trace_fini(&zal);
 	range_tree_destroy(segs);
 }
 
 /*
  * Returns the maximum size of a single removal mapping for this pool:
  * the tunable zfs_remove_max_segment rounded up to a multiple of the
  * pool's sector size (1 << ashift), so that individual sectors are never
  * split whatever value the tunable has.  Device removal requires every
  * device to have the same ashift, so spa_min_ashift and spa_max_ashift
  * are interchangeable here.  No other code should read the raw tunable.
  */
 uint64_t
 spa_remove_max_segment(spa_t *spa)
 {
 	uint64_t max_segment =
 	    P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift);
 
 	return (max_segment);
 }
 
 /*
  * The removal thread operates in open context.  It iterates over all
  * allocated space in the vdev, by loading each metaslab's spacemap.
  * For each contiguous segment of allocated space (capping the segment
  * size at SPA_MAXBLOCKSIZE), we:
  *    - Allocate space for it on another vdev.
  *    - Create a new mapping from the old location to the new location
  *      (as a record in svr_new_segments).
  *    - Initiate a physical read zio to get the data off the removing disk.
  *    - In the read zio's done callback, initiate a physical write zio to
  *      write it to the new vdev.
  * Note that all of this will take effect when a particular TXG syncs.
  * The sync thread ensures that all the phys reads and writes for the syncing
  * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
  * (see vdev_mapping_sync()).
  *
  * "arg" is the spa_t of the pool.  The thread never returns: it ends in
  * thread_exit(), after either completing the removal
  * (vdev_remove_complete()) or, when asked to exit or after copy errors,
  * clearing svr_thread and waking any waiters.
  */
 static __attribute__((noreturn)) void
 spa_vdev_remove_thread(void *arg)
 {
 	spa_t *spa = arg;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_copy_arg_t vca;
 	uint64_t max_alloc = spa_remove_max_segment(spa);
 	uint64_t last_txg = 0;
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
 	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_removing);
 	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
 
 	/* Validate the mapping before it is used for the first time. */
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	ASSERT(vim != NULL);
 	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
 
 	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
 	vca.vca_outstanding_bytes = 0;
 	vca.vca_read_error_bytes = 0;
 	vca.vca_write_error_bytes = 0;
 
 	mutex_enter(&svr->svr_lock);
 
 	/*
 	 * Start from vim_max_offset so we pick up where we left off
 	 * if we are restarting the removal after opening the pool.
 	 */
 	uint64_t msi;
 	for (msi = start_offset >> vd->vdev_ms_shift;
 	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
 		/* Check the index before it is used to index vdev_ms[]. */
 		ASSERT3U(msi, <, vd->vdev_ms_count);
 		metaslab_t *msp = vd->vdev_ms[msi];
 
 		ASSERT0(range_tree_space(svr->svr_allocd_segs));
 
 		mutex_enter(&msp->ms_sync_lock);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Assert nothing in flight -- ms_*tree is empty.
 		 */
 		for (int i = 0; i < TXG_SIZE; i++) {
 			ASSERT0(range_tree_space(msp->ms_allocating[i]));
 		}
 
 		/*
 		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
 		 * read the allocated segments from the space map object
 		 * into svr_allocd_segs. Since we do this while holding
 		 * svr_lock and ms_sync_lock, concurrent frees (which
 		 * would have modified the space map) will wait for us
 		 * to finish loading the spacemap, and then take the
 		 * appropriate action (see free_from_removing_vdev()).
 		 */
 		if (msp->ms_sm != NULL) {
 			VERIFY0(space_map_load(msp->ms_sm,
 			    svr->svr_allocd_segs, SM_ALLOC));
 
 			range_tree_walk(msp->ms_unflushed_allocs,
 			    range_tree_add, svr->svr_allocd_segs);
 			range_tree_walk(msp->ms_unflushed_frees,
 			    range_tree_remove, svr->svr_allocd_segs);
 			range_tree_walk(msp->ms_freeing,
 			    range_tree_remove, svr->svr_allocd_segs);
 
 			/*
 			 * When we are resuming from a paused removal (i.e.
 			 * when importing a pool with a removal in progress),
 			 * discard any state that we have already processed.
 			 */
 			range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
 		}
 		mutex_exit(&msp->ms_lock);
 		mutex_exit(&msp->ms_sync_lock);
 
 		vca.vca_msp = msp;
 		zfs_dbgmsg("copying %llu segments for metaslab %llu",
 		    (u_longlong_t)zfs_btree_numnodes(
 		    &svr->svr_allocd_segs->rt_root),
 		    (u_longlong_t)msp->ms_id);
 
 		while (!svr->svr_thread_exit &&
 		    !range_tree_is_empty(svr->svr_allocd_segs)) {
 
 			mutex_exit(&svr->svr_lock);
 
 			/*
 			 * We need to periodically drop the config lock so that
 			 * writers can get in.  Additionally, we can't wait
 			 * for a txg to sync while holding a config lock
 			 * (since a waiting writer could cause a 3-way deadlock
 			 * with the sync thread, which also gets a config
 			 * lock for reader).  So we can't hold the config lock
 			 * while calling dmu_tx_assign().
 			 */
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * This delay will pause the removal around the point
 			 * specified by zfs_removal_suspend_progress. We do this
 			 * solely from the test suite or during debugging.
 			 */
 			while (zfs_removal_suspend_progress &&
 			    !svr->svr_thread_exit)
 				delay(hz);
 
 			/*
 			 * Throttle: wait until the amount of copy I/O in
 			 * flight is back under zfs_remove_max_copy_bytes.
 			 */
 			mutex_enter(&vca.vca_lock);
 			while (vca.vca_outstanding_bytes >
 			    zfs_remove_max_copy_bytes) {
 				cv_wait(&vca.vca_cv, &vca.vca_lock);
 			}
 			mutex_exit(&vca.vca_lock);
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 
 			/*
 			 * Reacquire the vdev_config lock.  The vdev_t
 			 * that we're removing may have changed, e.g. due
 			 * to a vdev_attach or vdev_detach.
 			 */
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 
 			/*
 			 * max_alloc may have been lowered by a failed
 			 * allocation; that only applies within one txg.
 			 */
 			if (txg != last_txg)
 				max_alloc = spa_remove_max_segment(spa);
 			last_txg = txg;
 
 			spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
 
 			dmu_tx_commit(tx);
 			mutex_enter(&svr->svr_lock);
 		}
 
 		/* Stop after this metaslab if any copy I/O has failed. */
 		mutex_enter(&vca.vca_lock);
 		if (zfs_removal_ignore_errors == 0 &&
 		    (vca.vca_read_error_bytes > 0 ||
 		    vca.vca_write_error_bytes > 0)) {
 			svr->svr_thread_exit = B_TRUE;
 		}
 		mutex_exit(&vca.vca_lock);
 	}
 
 	mutex_exit(&svr->svr_lock);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * Wait for all copies to finish before cleaning up the vca.
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	ASSERT0(vca.vca_outstanding_bytes);
 
 	mutex_destroy(&vca.vca_lock);
 	cv_destroy(&vca.vca_cv);
 
 	if (svr->svr_thread_exit) {
 		/* Discard what is left to copy and announce our exit. */
 		mutex_enter(&svr->svr_lock);
 		range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
 		svr->svr_thread = NULL;
 		cv_broadcast(&svr->svr_cv);
 		mutex_exit(&svr->svr_lock);
 
 		/*
 		 * During the removal process an unrecoverable read or write
 		 * error was encountered.  The removal process must be
 		 * cancelled or this damage may become permanent.
 		 */
 		if (zfs_removal_ignore_errors == 0 &&
 		    (vca.vca_read_error_bytes > 0 ||
 		    vca.vca_write_error_bytes > 0)) {
 			zfs_dbgmsg("canceling removal due to IO errors: "
 			    "[read_error_bytes=%llu] [write_error_bytes=%llu]",
 			    (u_longlong_t)vca.vca_read_error_bytes,
 			    (u_longlong_t)vca.vca_write_error_bytes);
 			spa_vdev_remove_cancel_impl(spa);
 		}
 	} else {
 		ASSERT0(range_tree_space(svr->svr_allocd_segs));
 		vdev_remove_complete(spa);
 	}
 
 	thread_exit();
 }
 
 /*
  * If a removal is in progress, ask its thread to exit and wait until it
  * has cleared svr_thread.  The exit request is withdrawn again before
  * returning.  Does nothing when the pool has no removal state.
  */
 void
 spa_vdev_remove_suspend(spa_t *spa)
 {
 	spa_vdev_removal_t *removal = spa->spa_vdev_removal;
 
 	if (removal != NULL) {
 		mutex_enter(&removal->svr_lock);
 		removal->svr_thread_exit = B_TRUE;
 		while (removal->svr_thread != NULL) {
 			cv_wait(&removal->svr_cv, &removal->svr_lock);
 		}
 		removal->svr_thread_exit = B_FALSE;
 		mutex_exit(&removal->svr_lock);
 	}
 }
 
 /*
  * Returns B_TRUE when the vdev's "allocating" property is stored as 0
  * (i.e. "off").  A vdev without a top-level ZAP, or without the
  * property, counts as allocating.
  */
 static boolean_t
 vdev_prop_allocating_off(vdev_t *vd)
 {
 	uint64_t allocating = 1;
 
 	/* no vdev property object => no props */
 	if (vd->vdev_top_zap == 0)
 		return (B_FALSE);
 
 	spa_t *spa = vd->vdev_spa;
 
 	/* A failed lookup leaves "allocating" at its default of 1. */
 	mutex_enter(&spa->spa_props_lock);
 	(void) zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 	    "allocating", sizeof (uint64_t), 1, &allocating);
 	mutex_exit(&spa->spa_props_lock);
 
 	return (allocating == 0);
 }
 
 /*
  * Returns ENOTACTIVE when the pool has no removal in progress and 0
  * otherwise.  "arg" is unused.
  */
 static int
 spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	boolean_t removal_active =
 	    (dmu_tx_pool(tx)->dp_spa->spa_vdev_removal != NULL);
 
 	return (removal_active ? 0 : ENOTACTIVE);
 }
 
 /*
  * Sync task that cancels a removal by freeing all entries from the
  * partial mapping and marking the vdev as no longer being removed.
  *
  * The removal thread must already have exited (svr_thread is asserted
  * to be NULL).  The task argument is unused; the removal state is taken
  * from the pool that "tx" belongs to.
  */
 static void
 spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	(void) arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	objset_t *mos = spa->spa_meta_objset;
 
 	ASSERT3P(svr->svr_thread, ==, NULL);
 
 	spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
 
 	/*
 	 * If the obsolete counts were marked precise, drop that marker
 	 * from the top-level ZAP along with its feature count.
 	 */
 	boolean_t are_precise;
 	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
 	if (are_precise) {
 		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
 	}
 
 	/*
 	 * Free the obsolete space map, if one exists, and remove its
 	 * entry from the top-level ZAP.
 	 */
 	uint64_t obsolete_sm_object;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object != 0) {
 		ASSERT(vd->vdev_obsolete_sm != NULL);
 		ASSERT3U(obsolete_sm_object, ==,
 		    space_map_object(vd->vdev_obsolete_sm));
 
 		space_map_free(vd->vdev_obsolete_sm, tx);
 		VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 		spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 	}
 	/* No txg may have pending segments beyond the synced mapping. */
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(list_is_empty(&svr->svr_new_segments[i]));
 		ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
 		    vdev_indirect_mapping_max_offset(vim));
 	}
 
 	/*
 	 * Walk the metaslabs that start below the mapping's max offset
 	 * and free every segment that had been mapped.
 	 */
 	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
 		metaslab_t *msp = vd->vdev_ms[msi];
 
 		if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
 			break;
 
 		ASSERT0(range_tree_space(svr->svr_allocd_segs));
 
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * Assert nothing in flight -- ms_*tree is empty.
 		 */
 		for (int i = 0; i < TXG_SIZE; i++)
 			ASSERT0(range_tree_space(msp->ms_allocating[i]));
 		for (int i = 0; i < TXG_DEFER_SIZE; i++)
 			ASSERT0(range_tree_space(msp->ms_defer[i]));
 		ASSERT0(range_tree_space(msp->ms_freed));
 
 		if (msp->ms_sm != NULL) {
 			/*
 			 * Rebuild the allocated segments: the space map's
 			 * allocations plus unflushed allocs, minus
 			 * unflushed frees and segments being freed.
 			 */
 			mutex_enter(&svr->svr_lock);
 			VERIFY0(space_map_load(msp->ms_sm,
 			    svr->svr_allocd_segs, SM_ALLOC));
 
 			range_tree_walk(msp->ms_unflushed_allocs,
 			    range_tree_add, svr->svr_allocd_segs);
 			range_tree_walk(msp->ms_unflushed_frees,
 			    range_tree_remove, svr->svr_allocd_segs);
 			range_tree_walk(msp->ms_freeing,
 			    range_tree_remove, svr->svr_allocd_segs);
 
 			/*
 			 * Clear everything past what has been synced,
 			 * because we have not allocated mappings for it yet.
 			 */
 			uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
 			uint64_t sm_end = msp->ms_sm->sm_start +
 			    msp->ms_sm->sm_size;
 			if (sm_end > syncd)
 				range_tree_clear(svr->svr_allocd_segs,
 				    syncd, sm_end - syncd);
 
 			mutex_exit(&svr->svr_lock);
 		}
 		mutex_exit(&msp->ms_lock);
 
 		/* Empty the tree, freeing each mapped segment as we go. */
 		mutex_enter(&svr->svr_lock);
 		range_tree_vacate(svr->svr_allocd_segs,
 		    free_mapped_segment_cb, vd);
 		mutex_exit(&svr->svr_lock);
 	}
 
 	/*
 	 * Note: this must happen after we invoke free_mapped_segment_cb,
 	 * because it adds to the obsolete_segments.
 	 */
 	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
 
 	/*
 	 * Close and free the indirect mapping, clearing both the in-core
 	 * pointer and the object number recorded in the indirect config.
 	 */
 	ASSERT3U(vic->vic_mapping_object, ==,
 	    vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
 	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 	vd->vdev_indirect_mapping = NULL;
 	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
 	vic->vic_mapping_object = 0;
 
 	/* Likewise for the indirect births object. */
 	ASSERT3U(vic->vic_births_object, ==,
 	    vdev_indirect_births_object(vd->vdev_indirect_births));
 	vdev_indirect_births_close(vd->vdev_indirect_births);
 	vd->vdev_indirect_births = NULL;
 	vdev_indirect_births_free(mos, vic->vic_births_object, tx);
 	vic->vic_births_object = 0;
 
 	/*
 	 * We may have processed some frees from the removing vdev in this
 	 * txg, thus increasing svr_bytes_done; discard that here to
 	 * satisfy the assertions in spa_vdev_removal_destroy().
 	 * Note that future txg's can not have any bytes_done, because
 	 * future TXG's are only modified from open context, and we have
 	 * already shut down the copying thread.
 	 */
 	svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
 	spa_finish_removal(spa, DSS_CANCELED, tx);
 
 	vd->vdev_removing = B_FALSE;
 
 	/* Re-activate the vdev unless its "allocating" property is off. */
 	if (!vdev_prop_allocating_off(vd)) {
 		spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
 		vdev_activate(vd);
 		spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
 	}
 
 	vdev_config_dirty(vd);
 
 	zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx));
 	spa_history_log_internal(spa, "vdev remove canceled", tx,
 	    "%s vdev %llu %s", spa_name(spa),
 	    (u_longlong_t)vd->vdev_id,
 	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
 }
 
 /*
  * Run the cancel check/sync pair as a synchronous sync task on the pool
  * and hand back whatever dsl_sync_task() returns.
  */
 static int
 spa_vdev_remove_cancel_impl(spa_t *spa)
 {
 	return (dsl_sync_task(spa->spa_name,
 	    spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync,
 	    NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Cancel the pool's in-progress device removal.  The removal is
  * suspended first; if no removal state exists afterwards ENOTACTIVE is
  * returned, otherwise the result of the cancel sync task.
  */
 int
 spa_vdev_remove_cancel(spa_t *spa)
 {
 	int error = ENOTACTIVE;
 
 	/* always suspend before looking at the removal state */
 	spa_vdev_remove_suspend(spa);
 
 	if (spa->spa_vdev_removal != NULL)
 		error = spa_vdev_remove_cancel_impl(spa);
 
 	return (error);
 }
 
 /*
  * Fold the bytes recorded as done for this txg into the pool's removal
  * progress counter (sr_copied) and call spa_sync_removing_state().
  * Does nothing when no removal is active or this txg recorded no bytes.
  */
 void
 svr_sync(spa_t *spa, dmu_tx_t *tx)
 {
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	const int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	/*
 	 * Only proceed when there is progress to record, so that we do
 	 * not dirty the DIRECTORY_OBJECT via spa_sync_removing_state()
 	 * when there is nothing to do.  Dirtying it every time would
 	 * prevent us from syncing-to-convergence.
 	 */
 	if (svr != NULL && svr->svr_bytes_done[slot] != 0) {
 		/* update progress accounting */
 		spa->spa_removing_phys.sr_copied +=
 		    svr->svr_bytes_done[slot];
 		svr->svr_bytes_done[slot] = 0;
 
 		spa_sync_removing_state(spa, tx);
 	}
 }
 
 /*
  * Free "vd" and install a hole vdev with the same id under the root
  * vdev in its place.  The caller must hold spa_namespace_lock and all
  * config locks as writer.
  */
 static void
 vdev_remove_make_hole_and_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	/* remember the id now; "vd" is gone after vdev_free() */
 	uint64_t slot = vd->vdev_id;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	vdev_free(vd);
 
 	vdev_t *hole = vdev_alloc_common(spa, slot, 0, &vdev_hole_ops);
 	vdev_add_child(root, hole);
 	vdev_config_dirty(root);
 
 	/* reassess the health of our root vdev */
 	vdev_reopen(root);
 }
 
 /*
  * Remove a log device.  The config lock is held for the specified TXG.
  *
  * "*txg" is updated every time the config lock is dropped and re-taken
  * (via spa_vdev_config_exit()/spa_vdev_config_enter()).  The caller must
  * hold spa_namespace_lock throughout.
  *
  * Returns 0 on success.  If spa_reset_logs() fails, the metaslab group
  * is re-activated and that error is returned.
  */
 static int
 spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 {
 	metaslab_group_t *mg = vd->vdev_mg;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3P(vd->vdev_log_mg, ==, NULL);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop allocating from this vdev.
 	 */
 	metaslab_group_passivate(mg);
 
 	/*
 	 * Wait for the youngest allocations and frees to sync,
 	 * and then wait for the deferral of those frees to finish.
 	 */
 	spa_vdev_config_exit(spa, NULL,
 	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 	/*
 	 * Cancel any initialize or TRIM which was in progress.
 	 */
 	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
 	vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
 	vdev_autotrim_stop_wait(vd);
 
 	/*
 	 * Evacuate the device.  We don't hold the config lock as
 	 * writer since we need to do I/O but we do keep the
 	 * spa_namespace_lock held.  Once this completes the device
 	 * should no longer have any blocks allocated on it.
 	 */
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	if (vd->vdev_stat.vs_alloc != 0)
 		error = spa_reset_logs(spa);
 
 	/* Re-take the config lock before acting on the result. */
 	*txg = spa_vdev_config_enter(spa);
 
 	if (error != 0) {
 		/* Evacuation failed: undo the passivation above. */
 		metaslab_group_activate(mg);
 		ASSERT3P(vd->vdev_log_mg, ==, NULL);
 		return (error);
 	}
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * The evacuation succeeded.  Remove any remaining MOS metadata
 	 * associated with this vdev, and wait for these changes to sync.
 	 */
 	vd->vdev_removing = B_TRUE;
 
 	vdev_dirty_leaves(vd, VDD_DTL, *txg);
 	vdev_config_dirty(vd);
 
 	/*
 	 * When the log space map feature is enabled we look at
 	 * the vdev's top_zap to find the on-disk flush data of
 	 * the metaslab we just flushed. Thus, while removing a
 	 * log vdev we make sure to call vdev_metaslab_fini()
 	 * first, which removes all metaslabs of this vdev from
 	 * spa_metaslabs_by_flushed before vdev_remove_empty()
 	 * destroys the top_zap of this log vdev.
 	 *
 	 * This avoids the scenario where we flush a metaslab
 	 * from the log vdev being removed that doesn't have a
 	 * top_zap and end up failing to lookup its on-disk flush
 	 * data.
 	 *
 	 * We don't call metaslab_group_destroy() right away
 	 * though (it will be called in vdev_free() later) as
 	 * during metaslab_sync() of metaslabs from other vdevs
 	 * we may touch the metaslab group of this vdev through
 	 * metaslab_class_histogram_verify()
 	 */
 	vdev_metaslab_fini(vd);
 
 	/* Drop and re-take the config lock so the changes above sync. */
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 	*txg = spa_vdev_config_enter(spa);
 
 	/* Create the event now, while "vd" still exists; post it last. */
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* The top ZAP should have been destroyed by vdev_remove_empty. */
 	ASSERT0(vd->vdev_top_zap);
 	/* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
 	ASSERT0(vd->vdev_leaf_zap);
 
 	/* Label removal is best effort; its result is ignored. */
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/* Take the vdev off the dirty lists before it is freed. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		vdev_state_clean(vd);
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Clean up the vdev namespace.
 	 */
 	vdev_remove_make_hole_and_free(vd);
 
 	if (ev != NULL)
 		spa_event_post(ev);
 
 	return (0);
 }
 
 /*
  * Verify that top-level vdev "vd" is eligible for removal.
  *
  * Returns 0 if every check passes.  Otherwise returns the error of the
  * first failing check, in this order:
  *	ENOTSUP		vd is not a top-level vdev, is not concrete, or
  *			the device_removal feature is not enabled
  *	EALREADY	vd is already being removed
  *	ENOSPC		not enough free space to hold vd's data
  *	EBUSY		another removal is in progress, or vd has
  *			missing/outage DTL entries
  *	EIO		vd is not readable
  *	EINVAL		ashift mismatch, a top-level vdev with parity,
  *			or a mirror with non-leaf children
  */
 static int
 spa_vdev_remove_top_check(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd != vd->vdev_top)
 		return (SET_ERROR(ENOTSUP));
 
 	if (!vdev_is_concrete(vd))
 		return (SET_ERROR(ENOTSUP));
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * This device is already being removed
 	 */
 	if (vd->vdev_removing)
 		return (SET_ERROR(EALREADY));
 
 	metaslab_class_t *mc = vd->vdev_mg->mg_class;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	if (mc != normal) {
 		/*
 		 * Space allocated from the special (or dedup) class is
 		 * included in the DMU's space usage, but it's not included
 		 * in spa_dspace (or dsl_pool_adjustedsize()).  Therefore
 		 * there is always at least as much free space in the normal
 		 * class, as is allocated from the special (and dedup) class.
 		 * As a backup check, we will return ENOSPC if this is
 		 * violated. See also spa_update_dspace().
 		 */
 		uint64_t available = metaslab_class_get_space(normal) -
 		    metaslab_class_get_alloc(normal);
 		ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
 		if (available < vd->vdev_stat.vs_alloc)
 			return (SET_ERROR(ENOSPC));
 	} else if (!vd->vdev_noalloc) {
 		/* available space in the pool's normal class */
 		uint64_t available = dsl_dir_space_available(
 		    spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
 		if (available < vd->vdev_stat.vs_dspace)
 			return (SET_ERROR(ENOSPC));
 	}
 
 	/*
 	 * There can not be a removal in progress.
 	 */
 	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * The device must have all its data.
 	 */
 	if (!vdev_dtl_empty(vd, DTL_MISSING) ||
 	    !vdev_dtl_empty(vd, DTL_OUTAGE))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * The device must be healthy.
 	 */
 	if (!vdev_readable(vd))
 		return (SET_ERROR(EIO));
 
 	/*
 	 * All vdevs in normal class must have the same ashift.
 	 */
 	if (spa->spa_max_ashift != spa->spa_min_ashift) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A removed special/dedup vdev must have same ashift as normal class.
 	 */
 	ASSERT(!vd->vdev_islog);
 	if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
 	    vd->vdev_ashift != spa->spa_max_ashift) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * All vdevs in normal class must have the same ashift
 	 * and not be raidz or draid.
 	 */
 	vdev_t *rvd = spa->spa_root_vdev;
 	for (uint64_t id = 0; id < rvd->vdev_children; id++) {
 		vdev_t *cvd = rvd->vdev_child[id];
 
 		/*
 		 * A removed special/dedup vdev must have the same ashift
 		 * across all vdevs in its class.
 		 */
 		if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
 		    cvd->vdev_alloc_bias == vd->vdev_alloc_bias &&
 		    cvd->vdev_ashift != vd->vdev_ashift) {
 			return (SET_ERROR(EINVAL));
 		}
 		if (cvd->vdev_ashift != 0 &&
 		    cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
 			ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
 		/* The remaining checks apply to concrete vdevs only. */
 		if (!vdev_is_concrete(cvd))
 			continue;
 		/* Reject any top-level vdev that reports parity. */
 		if (vdev_get_nparity(cvd) != 0)
 			return (SET_ERROR(EINVAL));
 		/*
 		 * Need the mirror to be mirror of leaf vdevs only
 		 */
 		if (cvd->vdev_ops == &vdev_mirror_ops) {
 			for (uint64_t cid = 0;
 			    cid < cvd->vdev_children; cid++) {
 				if (!cvd->vdev_child[cid]->vdev_ops->
 				    vdev_op_leaf)
 					return (SET_ERROR(EINVAL));
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Initiate removal of a top-level vdev, reducing the total space in the pool.
  * The config lock is held for the specified TXG.  Once initiated,
  * evacuation of all allocated space (copying it to other vdevs) happens
  * in the background (see spa_vdev_remove_thread()), and can be canceled
  * (see spa_vdev_remove_cancel()).  If successful, the vdev will
  * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
  *
  * "*txg" is updated when the config lock is dropped and re-taken.
  * Returns 0 once the initiate sync task has been queued, or the error
  * from spa_vdev_remove_top_check() / vdev_passivate().
  */
 static int
 spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	/* set only if this call passivated the vdev, so it can be undone */
 	boolean_t set_noalloc = B_FALSE;
 	int error;
 
 	/*
 	 * Check for errors up-front, so that we don't waste time
 	 * passivating the metaslab group and clearing the ZIL if there
 	 * are errors.
 	 */
 	error = spa_vdev_remove_top_check(vd);
 
 	/*
 	 * Stop allocating from this vdev.  Note that we must check
 	 * that this is not the only device in the pool before
 	 * passivating, otherwise we will not be able to make
 	 * progress because we can't allocate from any vdevs.
 	 * The above check for sufficient free space serves this
 	 * purpose.
 	 */
 	if (error == 0 && !vd->vdev_noalloc) {
 		set_noalloc = B_TRUE;
 		error = vdev_passivate(vd, txg);
 	}
 
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We stop any initializing and TRIM that is currently in progress
 	 * but leave the state as "active". This will allow the process to
 	 * resume if the removal is canceled sometime later.
 	 */
 
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
 	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
 	vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
 	vdev_autotrim_stop_wait(vd);
 
 	*txg = spa_vdev_config_enter(spa);
 
 	/*
 	 * Things might have changed while the config lock was dropped
 	 * (e.g. space usage).  Check for errors again.
 	 */
 	error = spa_vdev_remove_top_check(vd);
 
 	if (error != 0) {
 		/*
 		 * Back out: re-activate the vdev if we passivated it and
 		 * request restarts of the activities stopped above.
 		 */
 		if (set_noalloc)
 			vdev_activate(vd);
 		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 		spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 		spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 		return (error);
 	}
 
 	vd->vdev_removing = B_TRUE;
 
 	vdev_dirty_leaves(vd, VDD_DTL, *txg);
 	vdev_config_dirty(vd);
 	/* Queue vdev_remove_initiate_sync() in *txg, passing the vdev id. */
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
 	dsl_sync_task_nowait(spa->spa_dsl_pool,
 	    vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Remove a device from the pool.
  *
  * Removing a device from the vdev namespace requires several steps
  * and can take a significant amount of time.  As a result we use
  * the spa_vdev_config_[enter/exit] functions which allow us to
  * grab and release the spa_config_lock while still holding the namespace
  * lock.  During each step the configuration is synced out.
  *
  * "guid" identifies the device, which may be a hot spare, an l2cache
  * device, a log device or another top-level vdev.  "unspare" permits
  * removing a hot spare even though a vdev with that guid is active in
  * this pool.
  *
  * If the caller already holds spa_namespace_lock, this function does
  * not call spa_vdev_enter()/spa_vdev_exit() itself; only the spare and
  * l2cache paths are expected in that case (the log and top-level paths
  * assert !locked).
  *
  * Returns 0 on success, ZFS_ERR_CHECKPOINT_EXISTS or
  * ZFS_ERR_DISCARDING_CHECKPOINT while the checkpoint feature is active,
  * ENOTSUP/EBUSY for a spare that can not be removed, ENOENT if no device
  * has the given guid, or the error from the log/top-level removal.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
 	int error = 0, error_log;
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 	sysevent_t *ev = NULL;
 	const char *vd_type = NULL;
 	char *vd_path = NULL;
 
 	ASSERT(spa_writeable(spa));
 
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	/* No removal of any kind while the checkpoint feature is active. */
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 
 		if (!locked)
 			return (spa_vdev_exit(spa, NULL, txg, error));
 
 		return (error);
 	}
 
 	/* Look among the pool's active vdevs only (aux search disabled). */
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (spa->spa_spares.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
 		/*
 		 * Only remove the hot spare if it's not currently in use
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
 			const char *type;
 			boolean_t draid_spare = B_FALSE;
 
 			if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
 			    == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
 				draid_spare = B_TRUE;
 
 			/* An inactive dRAID spare can not be removed. */
 			if (vd == NULL && draid_spare) {
 				error = SET_ERROR(ENOTSUP);
 			} else {
 				if (vd == NULL)
 					vd = spa_lookup_by_guid(spa,
 					    guid, B_TRUE);
 				ev = spa_event_create(spa, vd, NULL,
 				    ESC_ZFS_VDEV_REMOVE_AUX);
 
 				/* Copy the path for the history log below. */
 				vd_type = VDEV_TYPE_SPARE;
 				vd_path = spa_strdup(fnvlist_lookup_string(
 				    nv, ZPOOL_CONFIG_PATH));
 				spa_vdev_remove_aux(spa->spa_spares.sav_config,
 				    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 				spa_load_spares(spa);
 				spa->spa_spares.sav_sync = B_TRUE;
 			}
 		} else {
 			error = SET_ERROR(EBUSY);
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
 		vd_type = VDEV_TYPE_L2CACHE;
 		vd_path = spa_strdup(fnvlist_lookup_string(
 		    nv, ZPOOL_CONFIG_PATH));
 		/*
 		 * Cache devices can always be removed.
 		 */
 		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 
 		/*
 		 * Stop trimming the cache device. We need to release the
 		 * config lock to allow the syncing of TRIM transactions
 		 * without releasing the spa_namespace_lock. The same
 		 * strategy is employed in spa_vdev_remove_top().
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 		mutex_enter(&vd->vdev_trim_lock);
 		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
 		mutex_exit(&vd->vdev_trim_lock);
 		txg = spa_vdev_config_enter(spa);
 
 		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		ASSERT(!locked);
 		vd_type = VDEV_TYPE_LOG;
 		vd_path = spa_strdup((vd->vdev_path != NULL) ?
 		    vd->vdev_path : "-");
 		error = spa_vdev_remove_log(vd, &txg);
 	} else if (vd != NULL) {
 		/* No vd_type/vd_path here, so nothing is logged below. */
 		ASSERT(!locked);
 		error = spa_vdev_remove_top(vd, &txg);
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
 		error = SET_ERROR(ENOENT);
 	}
 
 	/* Keep the removal's own result; spa_vdev_exit() may replace it. */
 	error_log = error;
 
 	if (!locked)
 		error = spa_vdev_exit(spa, NULL, txg, error);
 
 	/*
 	 * Logging must be done outside the spa config lock. Otherwise,
 	 * this code path could end up holding the spa config lock while
 	 * waiting for a txg_sync so it can write to the internal log.
 	 * Doing that would prevent the txg sync from actually happening,
 	 * causing a deadlock.
 	 */
 	if (error_log == 0 && vd_type != NULL && vd_path != NULL) {
 		spa_history_log_internal(spa, "vdev remove", NULL,
 		    "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
 	}
 	if (vd_path != NULL)
 		spa_strfree(vd_path);
 
 	if (ev != NULL)
 		spa_event_post(ev);
 
 	return (error);
 }
 
 /*
  * Fill "prs" with the pool's removal statistics.
  *
  * prs_state is always written.  If the state is DSS_NONE, ENOENT is
  * returned and nothing else is filled in.  Otherwise the progress
  * fields are copied from spa_removing_phys and prs_mapping_memory is
  * set to the summed mapping size of the chain of indirect vdevs that
  * starts at sr_prev_indirect_vdev and ends at an id of -1.
  */
 int
 spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
 {
 	prs->prs_state = spa->spa_removing_phys.sr_state;
 	if (prs->prs_state == DSS_NONE)
 		return (SET_ERROR(ENOENT));
 
 	prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
 	prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
 	prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
 	prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
 	prs->prs_copied = spa->spa_removing_phys.sr_copied;
 
 	/* follow the links between previously removed (indirect) vdevs */
 	prs->prs_mapping_memory = 0;
 	uint64_t id = spa->spa_removing_phys.sr_prev_indirect_vdev;
 	while (id != -1) {
 		vdev_t *ivd = spa->spa_root_vdev->vdev_child[id];
 
 		ASSERT3P(ivd->vdev_ops, ==, &vdev_indirect_ops);
 		prs->prs_mapping_memory +=
 		    vdev_indirect_mapping_size(ivd->vdev_indirect_mapping);
 		id = ivd->vdev_indirect_config.vic_prev_indirect_vdev;
 	}
 
 	return (0);
 }
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
 	"Ignore hard IO errors when removing device");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW,
 	"Largest contiguous segment to allocate when removing device");
 
 ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW,
 	"Largest span of free chunks a remap segment can span");
 
-/* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW,
 	"Pause device removal after this many bytes are copied "
 	"(debug use only - causes removal to hang)");
-/* END CSTYLED */
 
 EXPORT_SYMBOL(free_from_removing_vdev);
 EXPORT_SYMBOL(spa_removal_get_stats);
 EXPORT_SYMBOL(spa_remove_init);
 EXPORT_SYMBOL(spa_restart_removal);
 EXPORT_SYMBOL(spa_vdev_removal_destroy);
 EXPORT_SYMBOL(spa_vdev_remove);
 EXPORT_SYMBOL(spa_vdev_remove_cancel);
 EXPORT_SYMBOL(spa_vdev_remove_suspend);
 EXPORT_SYMBOL(svr_sync);
diff --git a/module/zfs/zap.c b/module/zfs/zap.c
index 40e7bcf3ed1f..99fc4ec1928f 100644
--- a/module/zfs/zap.c
+++ b/module/zfs/zap.c
@@ -1,1715 +1,1713 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2023 Alexander Stetsenko <alex.stetsenko@gmail.com>
  * Copyright (c) 2023, Klara Inc.
  */
 
 /*
  * This file contains the top half of the zfs directory structure
  * implementation. The bottom half is in zap_leaf.c.
  *
  * The zdir is an extendable hash data structure. There is a table of
  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
  * each a constant size and hold a variable number of directory entries.
  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
  *
  * The pointer table holds a power of 2 number of pointers
  * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
  * by the pointer at index i in the table holds entries whose hash value
  * has a zd_prefix_len-bit prefix equal to i.
  */
 
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_znode.h>
 #include <sys/fs/zfs.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 
 /*
  * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
  * (all leaf blocks) when we start iterating over it.
  *
  * For zap_cursor_init(), the callers all intend to iterate through all the
  * entries.  There are a few cases where an error (typically i/o error) could
  * cause it to bail out early.
  *
  * For zap_cursor_init_serialized(), there are callers that do the iteration
  * outside of ZFS.  Typically they would iterate over everything, but we
  * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
  * zcp_snapshots_iter(), and other iterators over things in the MOS - these
  * are called by /sbin/zfs and channel programs.  The other example is
  * zfs_readdir() which iterates over directory entries for the getdents()
  * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
  * userland doesn't have to.
  *
  * Given that the ZAP entries aren't returned in a specific order, the only
  * legitimate use cases for partial iteration would be:
  *
  * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
  *    get the first 100 and then wait for the user to hit "next page" (which
  *    they may never do).
  *
  * 2. You want to know if there are more than X entries, without relying on
  *    the zfs-specific implementation of the directory's st_size (which is
  *    the number of entries).
  */
 static int zap_iterate_prefetch = B_TRUE;
 
 /*
  * Enable ZAP shrinking. When enabled, empty sibling leaf blocks will be
  * collapsed into a single block.
  */
 int zap_shrink_enabled = B_TRUE;
 
 /* log2 of the default block size: 1 << 14 bytes */
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
 /*
  * Forward declarations of static helpers; zap_allocate_blocks() is
  * called by the table-growing code below before it is defined.
  */
 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 static int zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx);
 
 /*
  * Byteswap one ZAP block of "size" bytes in place.  The first 64-bit
  * word is the block type, which is compared against ZBT_LEAF in both
  * byte orders: leaf blocks go to zap_leaf_byteswap(), and every other
  * block is swapped as an array of 64-bit words.
  */
 void
 fzap_byteswap(void *vbuf, size_t size)
 {
 	const uint64_t type = *(uint64_t *)vbuf;
 
 	if (type != ZBT_LEAF && type != BSWAP_64(ZBT_LEAF)) {
 		/* it's a ptrtbl block */
 		byteswap_uint64_array(vbuf, size);
 		return;
 	}
 
 	zap_leaf_byteswap(vbuf, size);
 }
 
 /*
  * Turn "zap" into an empty fat ZAP: clear the microzap flag, zero the
  * header block and write a fresh header whose embedded pointer table
  * points every entry at block 1, then initialize block 1 as the first
  * leaf.  "flags" is stored in the header's zap_flags.  The caller must
  * hold zap_rwlock as writer.
  */
 void
 fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
 {
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	zap->zap_ismicro = FALSE;
 
 	zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
 	zap->zap_dbu.dbu_evict_func_async = NULL;
 
 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
 	/* block shift = log2 of the header buffer's size */
 	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
 
 	zap_phys_t *zp = zap_f_phys(zap);
 	/*
 	 * explicitly zero it since it might be coming from an
 	 * initialized microzap
 	 */
 	memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size);
 	zp->zap_block_type = ZBT_HEADER;
 	zp->zap_magic = ZAP_MAGIC;
 
 	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
 
 	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
 	zp->zap_num_leafs = 1;
 	zp->zap_num_entries = 0;
 	/* salt and normalization flags carry over from the in-core zap */
 	zp->zap_salt = zap->zap_salt;
 	zp->zap_normflags = zap->zap_normflags;
 	zp->zap_flags = flags;
 
 	/* block 1 will be the first leaf */
 	for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
 		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
 
 	/*
 	 * set up block 1 - the first leaf
 	 */
 	dmu_buf_t *db;
 	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db, tx);
 
 	/* temporary leaf handle, needed only to run zap_leaf_init() */
 	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	l->l_dbuf = db;
 
 	zap_leaf_init(l, zp->zap_normflags != 0);
 
 	kmem_free(l, sizeof (zap_leaf_t));
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Make sure zap_rwlock is held as writer without ever dropping it.
  * Returns 1 if the lock is (now) write-held, 0 if the upgrade attempt
  * failed.  After a successful upgrade the header buffer is dirtied in
  * "tx".
  */
 static int
 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
 {
 	if (!RW_WRITE_HELD(&zap->zap_rwlock)) {
 		if (!rw_tryupgrade(&zap->zap_rwlock))
 			return (0);
 		dmu_buf_will_dirty(zap->zap_dbuf, tx);
 	}
 	return (1);
 }
 
 /*
  * Generic routines for dealing with the pointer & cookie tables.
  */
 
 /*
  * Perform one step of doubling the table "tbl": copy a single block of
  * the old table into two blocks of the new, twice-as-large table using
  * "transfer_func".  The first call allocates the new blocks; the call
  * that copies the last old block frees the old range and switches the
  * table over to the new location.
  *
  * The caller must hold zap_rwlock as writer.  Returns 0 on success or
  * the error from holding the old block.
  */
 static int
 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
     dmu_tx_t *tx)
 {
 	uint64_t newblk;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	int hepb = 1<<(bs-4);
 	/* hepb = half the number of entries in a block */
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	ASSERT(tbl->zt_blk != 0);
 	ASSERT(tbl->zt_numblks > 0);
 
 	if (tbl->zt_nextblk != 0) {
 		/* a grow is already in progress; continue with it */
 		newblk = tbl->zt_nextblk;
 	} else {
 		/* start a new grow and prefetch the whole old table */
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
 		ASSERT0(tbl->zt_blks_copied);
 		dmu_prefetch_by_dnode(zap->zap_dnode, 0,
 		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
 	 * Copy the ptrtbl from the old to new location.
 	 */
 
 	uint64_t b = tbl->zt_blks_copied;
 	dmu_buf_t *db_old;
 	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
 
 	/* first half of entries in old[b] go to new[2*b+0] */
 	dmu_buf_t *db_new;
 	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func(db_old->db_data, db_new->db_data, hepb);
 	dmu_buf_rele(db_new, FTAG);
 
 	/* second half of entries in old[b] go to new[2*b+1] */
 	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func((uint64_t *)db_old->db_data + hepb,
 	    db_new->db_data, hepb);
 	dmu_buf_rele(db_new, FTAG);
 
 	dmu_buf_rele(db_old, FTAG);
 
 	tbl->zt_blks_copied++;
 
 	dprintf("copied block %llu of %llu\n",
 	    (u_longlong_t)tbl->zt_blks_copied,
 	    (u_longlong_t)tbl->zt_numblks);
 
 	if (tbl->zt_blks_copied == tbl->zt_numblks) {
 		/* all blocks copied: free the old table and switch over */
 		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
 		    tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
 
 		tbl->zt_blk = newblk;
 		tbl->zt_numblks *= 2;
 		tbl->zt_shift++;
 		tbl->zt_nextblk = 0;
 		tbl->zt_blks_copied = 0;
 
 		dprintf("finished; numblocks now %llu (%uk entries)\n",
 		    (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10));
 	}
 
 	return (0);
 }
 
 /*
  * Store "val" at entry "idx" of the table "tbl".  While a grow is in
  * progress (zt_nextblk != 0) the value is also written to the two
  * corresponding entries (2*idx and 2*idx+1) of the new table.
  *
  * The caller must hold zap_rwlock.  Returns 0 on success or the error
  * from holding one of the table blocks; if the new table's block can
  * not be held, nothing is stored.
  */
 static int
 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
     dmu_tx_t *tx)
 {
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	ASSERT(tbl->zt_blk != 0);
 
 	dprintf("storing %llx at index %llx\n", (u_longlong_t)val,
 	    (u_longlong_t)idx);
 
 	/* a block holds 1<<(bs-3) 8-byte entries */
 	uint64_t blk = idx >> (bs-3);
 	uint64_t off = idx & ((1<<(bs-3))-1);
 
 	dmu_buf_t *db;
 	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
 	dmu_buf_will_dirty(db, tx);
 
 	if (tbl->zt_nextblk != 0) {
 		/* entry idx of the old table maps to 2*idx in the new one */
 		uint64_t idx2 = idx * 2;
 		uint64_t blk2 = idx2 >> (bs-3);
 		uint64_t off2 = idx2 & ((1<<(bs-3))-1);
 		dmu_buf_t *db2;
 
 		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
 		    DMU_READ_NO_PREFETCH);
 		if (err != 0) {
 			dmu_buf_rele(db, FTAG);
 			return (err);
 		}
 		dmu_buf_will_dirty(db2, tx);
 		((uint64_t *)db2->db_data)[off2] = val;
 		((uint64_t *)db2->db_data)[off2+1] = val;
 		dmu_buf_rele(db2, FTAG);
 	}
 
 	((uint64_t *)db->db_data)[off] = val;
 	dmu_buf_rele(db, FTAG);
 
 	return (0);
 }
 
 /*
  * Read entry "idx" of the table "tbl" into "*valp".
  *
  * The caller must hold zap_rwlock.  Returns 0 on success or the error
  * from holding a table block.  While a grow is in progress the
  * matching block of the new table is held as well, so "*valp" may
  * already have been written when that second hold fails.
  */
 static int
 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
 {
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	/* a block holds 1<<(bs-3) 8-byte entries */
 	uint64_t blk = idx >> (bs-3);
 	uint64_t off = idx & ((1<<(bs-3))-1);
 
 	dmu_buf_t *db;
 	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
 	*valp = ((uint64_t *)db->db_data)[off];
 	dmu_buf_rele(db, FTAG);
 
 	if (tbl->zt_nextblk != 0) {
 		/*
 		 * read the nextblk for the sake of i/o error checking,
 		 * so that zap_table_load() will catch errors for
 		 * zap_table_store.
 		 */
 		blk = (idx*2) >> (bs-3);
 
 		err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (err == 0)
 			dmu_buf_rele(db, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Routines for growing the ptrtbl.
  */
 
 /*
  * Expand "n" pointer-table entries from "src" into 2*n entries at
  * "dst": each source entry is written to two adjacent destination
  * slots.  Nothing is written when n <= 0.
  */
 static void
 zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
 {
 	int remaining = n;
 
 	while (remaining-- > 0) {
 		const uint64_t leafblk = *src++;
 
 		*dst++ = leafblk;
 		*dst++ = leafblk;
 	}
 }
 
 /*
  * Double the size of the ZAP's pointer table (or take one step towards
  * doing so).  An embedded table (zt_numblks == 0) is moved into a newly
  * allocated block in one go; an external table is grown one block per
  * call via zap_table_grow().
  *
  * Returns 0 on success, ENOSPC if the table is within 2 bits of using
  * all available hash bits, or the error from holding a block.
  */
 static int
 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
 {
 	/*
 	 * The pointer table should never use more hash bits than we
 	 * have (otherwise we'd be using useless zero bits to index it).
 	 * If we are within 2 bits of running out, stop growing, since
 	 * this is already an aberrant condition.
 	 */
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
 		return (SET_ERROR(ENOSPC));
 
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/*
 		 * We are outgrowing the "embedded" ptrtbl (the one
 		 * stored in the header block).  Give it its own entire
 		 * block, which will double the size of the ptrtbl.
 		 */
 		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 
 		uint64_t newblk = zap_allocate_blocks(zap, 1);
 		dmu_buf_t *db_new;
 		int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
 		    DMU_READ_NO_PREFETCH);
 		if (err != 0)
 			return (err);
 		dmu_buf_will_dirty(db_new, tx);
 		/* duplicate every embedded entry into the new block */
 		zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 		dmu_buf_rele(db_new, FTAG);
 
 		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
 		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
 		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 
 		/* the table must now fill its block(s) exactly */
 		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 		    (FZAP_BLOCK_SHIFT(zap)-3));
 
 		return (0);
 	} else {
 		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    zap_ptrtbl_transfer, tx));
 	}
 }
 
 /*
  * Add delta (which may be negative) to the entry count kept in the header
  * returned by zap_f_phys().  The header dbuf (zap->zap_dbuf) is dirtied in
  * tx first, and the count is modified while holding zap_num_entries_mtx.
  */
 static void
 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
 {
 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 	/* a decrement must not take the count below zero */
 	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
 	zap_f_phys(zap)->zap_num_entries += delta;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 }
 
 /*
  * Reserve nblocks consecutive block ids by advancing the header's
  * zap_freeblk, and return the first reserved id.  The caller must hold
  * zap_rwlock as writer.
  */
 static uint64_t
 zap_allocate_blocks(zap_t *zap, int nblocks)
 {
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	const uint64_t first = zap_f_phys(zap)->zap_freeblk;
 
 	zap_f_phys(zap)->zap_freeblk = first + nblocks;
 	return (first);
 }
 
 /*
  * Eviction callback registered through dmu_buf_init_user(): tears down the
  * in-memory zap_leaf_t that dbu points at by destroying its rwlock and
  * freeing the structure.
  */
 static void
 zap_leaf_evict_sync(void *dbu)
 {
 	zap_leaf_t *leaf = dbu;
 
 	rw_destroy(&leaf->l_rwlock);
 	kmem_free(leaf, sizeof (*leaf));
 }
 
 /*
  * Allocate a new block, attach a zap_leaf_t to its dbuf and initialize it
  * as an empty leaf.
  *
  * The leaf is returned write-locked with its dbuf held (NULL tag) and
  * dirtied in tx; release it with zap_put_leaf().  The header's
  * zap_num_leafs is incremented.  The caller must hold zap_rwlock as writer.
  */
 static zap_leaf_t *
 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 {
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	dmu_buf_t *db = NULL;
 	uint64_t blkid = zap_allocate_blocks(zap, 1);
 
 	VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    blkid << FZAP_BLOCK_SHIFT(zap), NULL, &db,
 	    DMU_READ_NO_PREFETCH));
 
 	/*
 	 * If the zap was recently shrunk or truncated, the dbuf may have been
 	 * sitting in the cache waiting to be evicted and still carry the old
 	 * leaf structure.  Reuse that one; otherwise build a fresh structure
 	 * and stash it on the dbuf.
 	 */
 	zap_leaf_t *l = dmu_buf_get_user(db);
 	if (l != NULL) {
 		ASSERT3U(l->l_blkid, ==, blkid);
 		ASSERT3P(l->l_dbuf, ==, db);
 	} else {
 		l = kmem_zalloc(sizeof (*l), KM_SLEEP);
 		l->l_dbuf = db;
 		l->l_blkid = blkid;
 		rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
 		dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL,
 		    &l->l_dbuf);
 		dmu_buf_set_user(db, &l->l_dbu);
 	}
 
 	/* Lock the leaf before dirtying and initializing its contents. */
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	dmu_buf_will_dirty(l->l_dbuf, tx);
 
 	zap_leaf_init(l, zap->zap_normflags != 0);
 
 	zap_f_phys(zap)->zap_num_leafs++;
 
 	return (l);
 }
 
 /*
  * Report the number of entries recorded in the header through *count.
  * Always returns 0.  Must not be called on a micro zap.
  */
 int
 fzap_count(zap_t *zap, uint64_t *count)
 {
 	uint64_t nentries;
 
 	ASSERT(!zap->zap_ismicro);
 
 	/* The original author marked taking this mutex as unnecessary. */
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
 	nentries = zap_f_phys(zap)->zap_num_entries;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 
 	*count = nentries;
 	return (0);
 }
 
 /*
  * Routines for obtaining zap_leaf_t's
  */
 
 /*
  * Release a leaf: drop its l_rwlock, then release the hold on its dbuf.
  * The NULL tag matches the tag used when the leaf dbufs are held in
  * zap_create_leaf() and zap_get_leaf_byblk().
  */
 void
 zap_put_leaf(zap_leaf_t *l)
 {
 	rw_exit(&l->l_rwlock);
 	dmu_buf_rele(l->l_dbuf, NULL);
 }
 
 /*
  * Build the in-memory zap_leaf_t for block blkid and attach it to db as the
  * dbuf user.  If another thread attached a leaf first, the structure built
  * here is destroyed and the already-attached one is returned instead.
  *
  * The returned leaf is NOT locked; the caller takes l_rwlock itself.
  */
 static zap_leaf_t *
 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 {
 	ASSERT(blkid != 0);
 
 	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
 	/* hold the lock as writer while the structure is filled in/published */
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = blkid;
 	/* block shift derived from the dbuf size */
 	l->l_bs = highbit64(db->db_size) - 1;
 	l->l_dbuf = db;
 
 	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
 	/* a non-NULL result is treated as the leaf that was attached first */
 	zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
 
 	rw_exit(&l->l_rwlock);
 	if (winner != NULL) {
 		/* someone else set it first */
 		/*
 		 * NOTE(review): zap_leaf_evict_sync() treats its argument as
 		 * the zap_leaf_t itself, so this presumably relies on l_dbu
 		 * being the first member of zap_leaf_t - confirm.
 		 */
 		zap_leaf_evict_sync(&l->l_dbu);
 		l = winner;
 	}
 
 	/*
 	 * lhr_pad was previously used for the next leaf in the leaf
 	 * chain.  There should be no chained leafs (as we have removed
 	 * support for them).  The field checked here is lh_pad1.
 	 */
 	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 
 	/*
 	 * There should be more hash entries than there can be
 	 * chunks to put in the hash table
 	 */
 	ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
 
 	/* The chunks should begin at the end of the hash table */
 	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
 	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 
 	/* The chunks should end at the end of the block */
 	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
 	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 
 	return (l);
 }
 
 /*
  * Hold and lock the leaf stored in block blkid.
  *
  * On success returns 0 and stores the leaf in *lp, locked with mode lt and
  * with its dbuf held (NULL tag); release it with zap_put_leaf().  When
  * lt is RW_WRITER the dbuf is also dirtied in tx.  Returns ENOENT for
  * blkid 0, or the error from dmu_buf_hold_by_dnode(); *lp is left untouched
  * on failure.
  */
 static int
 zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
     zap_leaf_t **lp)
 {
 	dmu_buf_t *db;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	/*
 	 * If system crashed just after dmu_free_long_range in zfs_rmnode, we
 	 * would be left with an empty xattr dir in delete queue. blkid=0
 	 * would be passed in when doing zfs_purgedir. If that's the case we
 	 * should just return immediately. The underlying objects should
 	 * already be freed, so this should be perfectly fine.
 	 */
 	if (blkid == 0)
 		return (SET_ERROR(ENOENT));
 
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
 	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
 	if (err != 0)
 		return (err);
 
 	ASSERT3U(db->db_object, ==, zap->zap_object);
 	ASSERT3U(db->db_offset, ==, blkid << bs);
 	ASSERT3U(db->db_size, ==, 1 << bs);
 	ASSERT(blkid != 0);
 
 	/* reuse the leaf structure already attached to the dbuf, if any */
 	zap_leaf_t *l = dmu_buf_get_user(db);
 
 	if (l == NULL)
 		l = zap_open_leaf(blkid, db);
 
 	rw_enter(&l->l_rwlock, lt);
 	/*
 	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
 	 * causing ASSERT below to fail.
 	 */
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
 	ASSERT3U(l->l_blkid, ==, blkid);
 	ASSERT3P(l->l_dbuf, ==, db);
 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	*lp = l;
 	return (0);
 }
 
 /*
  * Look up pointer-table entry idx and store the block id it holds in *valp.
  * The entry is read from the embedded table when zt_numblks is 0 (always
  * succeeds), and through zap_table_load() otherwise.
  */
 static int
 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
 {
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	/* The table has blocks of its own: go through the dmu. */
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks != 0) {
 		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, valp));
 	}
 
 	/* Embedded table: the entry lives in the header block. */
 	ASSERT3U(idx, <,
 	    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 	*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 	return (0);
 }
 
 /*
  * Make pointer-table entry idx refer to block blk.  The entry is written
  * into the embedded table when zt_blk is 0 (always succeeds), and through
  * zap_table_store() otherwise.  The caller must hold zap_rwlock as writer.
  */
 static int
 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
 {
 	ASSERT(tx != NULL);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	/* The table has a block of its own: go through the dmu. */
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk != 0) {
 		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, blk, tx));
 	}
 
 	/* Embedded table: the entry lives in the header block. */
 	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 	return (0);
 }
 
 /*
  * Point the nptrs pointer-table entries starting at idx at block blk.
  *
  * The range is first read back to detect i/o errors, so that the stores
  * which follow are not expected to fail.  Returns 0 on success or the first
  * error encountered.  The caller must hold zap_rwlock as writer.
  */
 static int
 zap_set_idx_range_to_blk(zap_t *zap, uint64_t idx, uint64_t nptrs, uint64_t blk,
     dmu_tx_t *tx)
 {
 	int bs = FZAP_BLOCK_SHIFT(zap);
 	/*
 	 * Stride of the pre-check loop below.
 	 *
 	 * NOTE(review): this was labelled "entries per block", but bs >> 3 is
 	 * the block shift divided by 8, whereas a block holds 1 << (bs - 3)
 	 * entries (see zap_table_load()).  The value is kept as is: a smaller
 	 * stride only makes the pre-check probe more entries than strictly
 	 * needed, while a larger one would also have to account for the
 	 * zt_nextblk block that zap_table_load() probes.
 	 */
 	uint64_t stride = bs >> 3;
 	int err = 0;
 
 	ASSERT(tx != NULL);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	/*
 	 * Check for i/o errors.  The counters are 64-bit like nptrs so the
 	 * comparison is not a signed/unsigned mix and cannot overflow.
 	 */
 	for (uint64_t i = 0; i < nptrs; i += stride) {
 		uint64_t cur;	/* value read is irrelevant, only err matters */
 
 		err = zap_idx_to_blk(zap, idx + i, &cur);
 		if (err != 0) {
 			return (err);
 		}
 	}
 
 	for (uint64_t i = 0; i < nptrs; i++) {
 		err = zap_set_idx_to_blk(zap, idx + i, blk, tx);
 		ASSERT0(err); /* we checked for i/o errors above */
 		if (err != 0)
 			break;
 	}
 
 	return (err);
 }
 
 /*
  * Build a 64-bit hash whose top pref_len bits are pref and whose remaining
  * low bits are zero.  pref_len must be non-zero: with pref_len == 0 the
  * shift count would be 64.
  */
 #define	ZAP_PREFIX_HASH(pref, pref_len)	((pref) << (64 - (pref_len)))
 
 /*
  * Each leaf has a single range of entries (block pointers) in the ZAP ptrtbl.
  * If two leaves are siblings, their ranges are adjacent and contain the same
  * number of entries. In order to find out if a leaf has a sibling, we need to
  * check the range corresponding to the sibling leaf. There is no need to check
  * all entries in the range, we only need to check the first and the last one.
  *
  * Returns the block id shared by the first and the last entry of the range
  * that corresponds to (prefix, prefix_len), or 0 when the two entries differ
  * or either of them could not be read.
  */
 static uint64_t
 check_sibling_ptrtbl_range(zap_t *zap, uint64_t prefix, uint64_t prefix_len)
 {
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	uint64_t h = ZAP_PREFIX_HASH(prefix, prefix_len);
 	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	uint64_t pref_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - prefix_len;
 	/*
 	 * Shift a 64-bit one: "1 << pref_diff" is an int shift, which is
 	 * undefined once pref_diff reaches the width of int.
 	 */
 	uint64_t nptrs = (1ULL << pref_diff);
 	uint64_t first;
 	uint64_t last;
 
 	/* 1ULL rather than 1UL: unsigned long is only 32 bits on ILP32. */
 	ASSERT3U(idx + nptrs, <=,
 	    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 
 	if (zap_idx_to_blk(zap, idx, &first) != 0)
 		return (0);
 
 	if (zap_idx_to_blk(zap, idx + nptrs - 1, &last) != 0)
 		return (0);
 
 	if (first != last)
 		return (0);
 	return (first);
 }
 
 /*
  * Find, hold and lock (mode lt) the leaf responsible for hash h.
  *
  * Returns EIO when the header's magic or block type is not a recognized
  * value, otherwise the result of the pointer-table lookup and
  * zap_get_leaf_byblk().  On success *lp is the leaf.
  */
 static int
 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
 {
 	uint64_t blk;
 
 	ASSERT(zap->zap_dbuf == NULL ||
 	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
 
 	/* Sanity checks against a corrupt zap object (leaf or header). */
 	if (zap_f_phys(zap)->zap_magic != ZAP_MAGIC)
 		return (SET_ERROR(EIO));
 	if (zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
 	    zap_f_phys(zap)->zap_block_type != ZBT_HEADER)
 		return (SET_ERROR(EIO));
 
 	/* Pointer-table slot for this hash, then the block it names. */
 	uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	int err = zap_idx_to_blk(zap, idx, &blk);
 	if (err != 0)
 		return (err);
 
 	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 
 	/* The leaf found must really cover h. */
 	ASSERT(err ||
 	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
 	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 	return (err);
 }
 
 /*
  * Split the leaf l, which covers zn->zn_hash, into two leaves.
  *
  * The directory lock must be held as writer for the split.  If it cannot be
  * upgraded in place, or the pointer table must grow first, l is released and
  * the lock is dropped and retaken through zap_lockdir(); zn->zn_zap is
  * updated and callers must reload their zap pointer from it.
  *
  * On success returns 0 and *lp is the write-locked leaf that now covers the
  * hash: either half of the split, or the leaf found after relocking if it
  * had already been split meanwhile.  On failure *lp is either NULL (the leaf
  * has already been released) or a leaf the caller must still release.
  */
 static int
 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
     const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
 	int err;
 	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
 	if (zap_tryupgradedir(zap, tx) == 0 ||
 	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 		/* We failed to upgrade, or need to grow the pointer table */
 		objset_t *os = zap->zap_objset;
 		uint64_t object = zap->zap_object;
 
 		/* from here on the caller no longer owns a leaf */
 		zap_put_leaf(l);
 		*lp = l = NULL;
 		zap_unlockdir(zap, tag);
 		err = zap_lockdir(os, object, tx, RW_WRITER,
 		    FALSE, FALSE, tag, &zn->zn_zap);
 		zap = zn->zn_zap;
 		if (err != 0)
 			return (err);
 		ASSERT(!zap->zap_ismicro);
 
 		while (old_prefix_len ==
 		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 			err = zap_grow_ptrtbl(zap, tx);
 			if (err != 0)
 				return (err);
 		}
 
 		err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
 		if (err != 0)
 			return (err);
 
 		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 			/* it split while our locks were down */
 			*lp = l;
 			return (0);
 		}
 	}
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
 	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
 	int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 	    (old_prefix_len + 1);
 	uint64_t sibling =
 	    (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
 	/* number of pointer-table entries the new sibling will own */
 	uint64_t nptrs = 1ULL << prefix_diff;
 
 	/*
 	 * check for i/o errors before doing zap_leaf_split
 	 * (64-bit counter: it is compared against a 64-bit bound)
 	 */
 	for (uint64_t i = 0; i < nptrs; i++) {
 		uint64_t blk;
 		err = zap_idx_to_blk(zap, sibling + i, &blk);
 		if (err != 0) {
 			/*
 			 * l may have been re-acquired above after *lp was
 			 * set to NULL.  Hand it back so that the caller
 			 * releases it instead of leaking a held, write-locked
 			 * leaf.  When l was never released this is a no-op.
 			 */
 			*lp = l;
 			return (err);
 		}
 		ASSERT3U(blk, ==, l->l_blkid);
 	}
 
 	zap_leaf_t *nl = zap_create_leaf(zap, tx);
 	zap_leaf_split(l, nl, zap->zap_normflags != 0);
 
 	/* set sibling pointers */
 	for (uint64_t i = 0; i < nptrs; i++) {
 		err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
 		ASSERT0(err); /* we checked for i/o errors above */
 	}
 
 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0);
 
 	/* keep whichever half covers the hash, release the other one */
 	if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 		/* we want the sibling */
 		zap_put_leaf(l);
 		*lp = nl;
 	} else {
 		zap_put_leaf(nl);
 		*lp = l;
 	}
 
 	return (0);
 }
 
 /*
  * Release leaf l and, if the pointer table is in the middle of growing or
  * this leaf is about to force it to grow, try to grow it now.
  *
  * Growing needs the directory lock as writer; if it cannot be upgraded in
  * place it is dropped and retaken through zap_lockdir(), which updates
  * zn->zn_zap.  Failures are not reported to the caller.
  */
 static void
 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
     const void *tag, dmu_tx_t *tx)
 {
 	zap_t *zap = zn->zn_zap;
 	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 	/* The leaf cannot be split further and is running low on space. */
 	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
 	    zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 
 	zap_put_leaf(l);
 
 	/* Neither mid-grow nor about to need one: nothing else to do. */
 	if (!leaffull && zap_f_phys(zap)->zap_ptrtbl.zt_nextblk == 0)
 		return;
 
 	if (zap_tryupgradedir(zap, tx) == 0) {
 		/* In-place upgrade failed: drop and retake as writer. */
 		objset_t *os = zap->zap_objset;
 		uint64_t zapobj = zap->zap_object;
 
 		zap_unlockdir(zap, tag);
 		int err = zap_lockdir(os, zapobj, tx,
 		    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
 		zap = zn->zn_zap;
 		if (err != 0)
 			return;
 	}
 
 	/* The grow may have completed while our locks were down. */
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
 		(void) zap_grow_ptrtbl(zap, tx);
 }
 
 /*
  * Validate the length of the key in zn.  Returns ENAMETOOLONG when the key
  * (zn_key_orig_numints * zn_key_intlen bytes) exceeds zn_normbuf_len, or
  * exceeds ZAP_MAXNAMELEN on an object that is not of type
  * DMU_OT_DIRECTORY_CONTENTS; returns 0 otherwise.
  */
 static int
 fzap_checkname(zap_name_t *zn)
 {
 	uint64_t len = (uint64_t)zn->zn_key_orig_numints * zn->zn_key_intlen;
 	uint32_t maxnamelen = zn->zn_normbuf_len;
 
 	if (len > maxnamelen)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	/* Only a directory zap is allowed to carry a long name. */
 	if (len > ZAP_MAXNAMELEN &&
 	    zn->zn_zap->zap_dnode->dn_type != DMU_OT_DIRECTORY_CONTENTS)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	return (0);
 }
 
 /*
  * Validate the shape of a value: integer_size must be 1, 2, 4 or 8 and the
  * total size integer_size * num_integers must not exceed ZAP_MAXVALUELEN.
  * Returns EINVAL for an unsupported integer size, E2BIG for an oversized
  * value, and 0 otherwise.
  */
 static int
 fzap_checksize(uint64_t integer_size, uint64_t num_integers)
 {
 	/* Only integer sizes supported by C */
 	switch (integer_size) {
 	case 1:
 	case 2:
 	case 4:
 	case 8:
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Compare by division instead of multiplying: the product can wrap
 	 * around in 64 bits for a huge num_integers and would then slip past
 	 * the limit.  integer_size is known to be non-zero here.
 	 */
 	if (num_integers > ZAP_MAXVALUELEN / integer_size)
 		return (SET_ERROR(E2BIG));
 
 	return (0);
 }
 
 /*
  * Run fzap_checkname() and then fzap_checksize(); returns the first
  * non-zero result, or 0 when both pass.
  */
 static int
 fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
 {
 	int err = fzap_checkname(zn);
 
 	if (err == 0)
 		err = fzap_checksize(integer_size, num_integers);
 	return (err);
 }
 
 /*
  * Routines for manipulating attributes.
  */
 /*
  * Look up the entry named by zn.
  *
  * On a hit the value is read into buf as num_integers integers of
  * integer_size bytes, the stored name is copied into realname (rn_len
  * bytes; its result is ignored), and, when ncp is not NULL, *ncp reports
  * whether the entry has a normalization conflict.  Returns the error from
  * the name check, the leaf lookup, the size check or zap_entry_read().
  */
 int
 fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *ncp)
 {
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
 	int err = fzap_checkname(zn);
 	if (err != 0)
 		return (err);
 
 	err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0)
 		return (err);
 
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err != 0)
 		goto out;
 
 	/* The value shape is only validated once the entry is found. */
 	err = fzap_checksize(integer_size, num_integers);
 	if (err != 0)
 		goto out;
 
 	err = zap_entry_read(&zeh, integer_size, num_integers, buf);
 	(void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
 	if (ncp) {
 		*ncp = zap_entry_normalization_conflict(&zeh,
 		    zn, NULL, zn->zn_zap);
 	}
 
 out:
 	zap_put_leaf(l);
 	return (err);
 }
 
 /*
  * Add a new entry named by zn with collision differentiator cd.
  *
  * Returns EEXIST if the name is already present.  When the leaf has no room
  * (zap_entry_create() returns EAGAIN) the leaf is expanded and the insert is
  * retried.  The caller is expected to have validated the name and value
  * shape already (see the fzap_check() assertion).
  */
 int
 fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
 	zap_entry_handle_t zeh;
 	zap_t *zap = zn->zn_zap;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	ASSERT(!zap->zap_ismicro);
 	ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
 
 	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
 		return (err);
 retry:
 	/* the name must not exist yet: only ENOENT lets us continue */
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
 		err = SET_ERROR(EEXIST);
 		goto out;
 	}
 	if (err != ENOENT)
 		goto out;
 
 	err = zap_entry_create(l, zn, cd,
 	    integer_size, num_integers, val, &zeh);
 
 	if (err == 0) {
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
 		/* leaf is full: expand it and try again in the new leaf */
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
 
 out:
 	/* l is NULL when zap_expand_leaf() already released the leaf */
 	if (l != NULL) {
 		/* on ENOSPC just release; do not attempt another grow */
 		if (err == ENOSPC)
 			zap_put_leaf(l);
 		else
 			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 	}
 	return (err);
 }
 
 /*
  * Validate the name and value shape, then add the entry through
  * fzap_add_cd() with ZAP_NEED_CD as the collision differentiator.
  */
 int
 fzap_add(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
     const void *val, const void *tag, dmu_tx_t *tx)
 {
 	int err = fzap_check(zn, integer_size, num_integers);
 
 	if (err == 0) {
 		err = fzap_add_cd(zn, integer_size, num_integers,
 		    val, ZAP_NEED_CD, tag, tx);
 	}
 	return (err);
 }
 
 /*
  * Set the value of the entry named by zn, creating the entry if it does not
  * exist yet.
  *
  * When the leaf has no room (EAGAIN from the create or update) the leaf is
  * expanded and the operation is retried.  Returns 0 on success or the error
  * from validation, leaf lookup, or leaf expansion.
  */
 int
 fzap_update(zap_name_t *zn,
     int integer_size, uint64_t num_integers, const void *val,
     const void *tag, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
 	boolean_t create;
 	zap_entry_handle_t zeh;
 	zap_t *zap = zn->zn_zap;
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	err = fzap_check(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
 	err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
 		return (err);
 retry:
 	/* ENOENT means the entry has to be created rather than updated */
 	err = zap_leaf_lookup(l, zn, &zeh);
 	create = (err == ENOENT);
 	ASSERT(err == 0 || err == ENOENT);
 
 	if (create) {
 		err = zap_entry_create(l, zn, ZAP_NEED_CD,
 		    integer_size, num_integers, val, &zeh);
 		if (err == 0)
 			zap_increment_num_entries(zap, 1, tx);
 	} else {
 		err = zap_entry_update(&zeh, integer_size, num_integers, val);
 	}
 
 	if (err == EAGAIN) {
 		/* leaf is full: expand it and try again in the new leaf */
 		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
 
 	/* l is NULL when zap_expand_leaf() already released the leaf */
 	if (l != NULL) {
 		/* on ENOSPC just release; do not attempt another grow */
 		if (err == ENOSPC)
 			zap_put_leaf(l);
 		else
 			zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 	}
 	return (err);
 }
 
 /*
  * Report the shape of the value stored under zn: its integer size through
  * *integer_size and its integer count through *num_integers.  Either output
  * pointer may be NULL.  Returns the error from the leaf lookup, in which
  * case the outputs are left untouched.
  */
 int
 fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
 	int err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0)
 		return (err);
 
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
 		if (integer_size != NULL)
 			*integer_size = zeh.zeh_integer_size;
 		if (num_integers != NULL)
 			*num_integers = zeh.zeh_num_integers;
 	}
 
 	zap_put_leaf(l);
 	return (err);
 }
 
 /*
  * Remove the entry named by zn.  Returns the error from the leaf lookup if
  * the entry is not found.  When the removal leaves the leaf empty and
  * zap_shrink_enabled is set, the leaf is handed to zap_shrink() and its
  * result is returned.
  */
 int
 fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
 {
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
 	int err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
 	if (err != 0)
 		return (err);
 
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err != 0) {
 		zap_put_leaf(l);
 		return (err);
 	}
 
 	zap_entry_remove(&zeh);
 	zap_increment_num_entries(zn->zn_zap, -1, tx);
 
 	/* zap_shrink() takes over the leaf on this path. */
 	if (zap_leaf_phys(l)->l_hdr.lh_nentries == 0 &&
 	    zap_shrink_enabled)
 		return (zap_shrink(zn, l, tx));
 
 	zap_put_leaf(l);
 	return (0);
 }
 
 /*
  * Issue a prefetch for the leaf block that covers the hash of zn.  Does
  * nothing if the pointer-table lookup fails.
  */
 void
 fzap_prefetch(zap_name_t *zn)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t leaf_blk;
 
 	uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
 	    zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 
 	if (zap_idx_to_blk(zap, idx, &leaf_blk) == 0) {
 		int bs = FZAP_BLOCK_SHIFT(zap);
 
 		/* one whole block, starting at the leaf's byte offset */
 		dmu_prefetch_by_dnode(zap->zap_dnode, 0, leaf_blk << bs,
 		    1 << bs, ZIO_PRIORITY_SYNC_READ);
 	}
 }
 
 /*
  * Helper functions for consumers.
  */
 
 /*
  * Convenience wrapper around zap_create_link_dnsize() that passes 0 as the
  * dnodesize argument.  Returns the value returned by that function.
  */
 uint64_t
 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
     const char *name, dmu_tx_t *tx)
 {
 	return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
 }
 
 /*
  * Create a new zap object with zap_create_dnsize() and record its object
  * number under name in the zap parent_obj.  Both steps are VERIFY'd, so a
  * failure of either one is fatal.  Returns the new object number.
  */
 uint64_t
 zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
     const char *name, int dnodesize, dmu_tx_t *tx)
 {
 	uint64_t new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0,
 	    dnodesize, tx);
 
 	VERIFY(new_obj != 0);
 	/* link it into the parent as a single 8-byte integer */
 	VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
 	    tx));
 
 	return (new_obj);
 }
 
 /*
  * Scan zapobj for the first entry whose first integer equals value under
  * mask and copy that entry's name into name (namelen bytes).  A mask of 0
  * means "compare all bits".
  *
  * Returns 0 on a match, ENAMETOOLONG if the name did not fit, or the error
  * that ended the cursor walk when nothing matched.
  */
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name, uint64_t namelen)
 {
 	zap_cursor_t zc;
 	int err;
 
 	if (mask == 0)
 		mask = -1ULL;
 
 	zap_attribute_t *za = zap_attribute_long_alloc();
 
 	zap_cursor_init(&zc, os, zapobj);
 	while ((err = zap_cursor_retrieve(&zc, za)) == 0) {
 		if ((za->za_first_integer & mask) == (value & mask)) {
 			if (strlcpy(name, za->za_name, namelen) >= namelen)
 				err = SET_ERROR(ENAMETOOLONG);
 			break;
 		}
 		zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 
 	zap_attribute_free(za);
 	return (err);
 }
 
 /*
  * Add every entry of fromobj to intoobj, keeping each entry's name and
  * value.  Every source entry must be a single 8-byte integer, otherwise
  * EINVAL is returned.  Stops at and returns the first zap_add() error; the
  * walk ends silently when the cursor stops yielding entries.
  */
 int
 zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	int err = 0;
 
 	zap_attribute_t *za = zap_attribute_long_alloc();
 
 	zap_cursor_init(&zc, os, fromobj);
 	while (zap_cursor_retrieve(&zc, za) == 0) {
 		if (za->za_integer_length != 8 || za->za_num_integers != 1) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 
 		err = zap_add(os, intoobj, za->za_name,
 		    8, 1, &za->za_first_integer, tx);
 		if (err != 0)
 			break;
 
 		(void) zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 
 	zap_attribute_free(za);
 	return (err);
 }
 
 /*
  * Add the name of every entry of fromobj to intoobj, storing the fixed
  * value for each of them instead of the source value.  Every source entry
  * must be a single 8-byte integer, otherwise EINVAL is returned.  Stops at
  * and returns the first zap_add() error.
  */
 int
 zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
     uint64_t value, dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	int err = 0;
 
 	zap_attribute_t *za = zap_attribute_long_alloc();
 
 	zap_cursor_init(&zc, os, fromobj);
 	while (zap_cursor_retrieve(&zc, za) == 0) {
 		if (za->za_integer_length != 8 || za->za_num_integers != 1) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 
 		err = zap_add(os, intoobj, za->za_name,
 		    8, 1, &value, tx);
 		if (err != 0)
 			break;
 
 		(void) zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 
 	zap_attribute_free(za);
 	return (err);
 }
 
 /*
  * For every entry of fromobj, add its value to the entry of the same name
  * in intoobj; a name missing from intoobj counts as 0.  Every source entry
  * must be a single 8-byte integer, otherwise EINVAL is returned.  Stops at
  * and returns the first lookup (other than ENOENT) or update error.
  */
 int
 zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
     dmu_tx_t *tx)
 {
 	zap_cursor_t zc;
 	int err = 0;
 
 	zap_attribute_t *za = zap_attribute_long_alloc();
 
 	zap_cursor_init(&zc, os, fromobj);
 	while (zap_cursor_retrieve(&zc, za) == 0) {
 		uint64_t sum = 0;
 
 		if (za->za_integer_length != 8 || za->za_num_integers != 1) {
 			err = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* current value in the destination, 0 if absent */
 		err = zap_lookup(os, intoobj, za->za_name, 8, 1, &sum);
 		if (err != 0 && err != ENOENT)
 			break;
 
 		sum += za->za_first_integer;
 		err = zap_update(os, intoobj, za->za_name, 8, 1, &sum, tx);
 		if (err != 0)
 			break;
 
 		(void) zap_cursor_advance(&zc);
 	}
 	zap_cursor_fini(&zc);
 
 	zap_attribute_free(za);
 	return (err);
 }
 
 /*
  * Add an entry to obj whose name is value printed with "%llx" and whose
  * content is that same value as one 8-byte integer.  Returns the result of
  * zap_add().
  */
 int
 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)value);
 
 	return (zap_add(os, obj, hexname, 8, 1, &value, tx));
 }
 
 /*
  * Remove from obj the entry whose name is value printed with "%llx".
  * Returns the result of zap_remove().
  */
 int
 zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)value);
 
 	return (zap_remove(os, obj, hexname, tx));
 }
 
 /*
  * Look up in obj the entry whose name is value printed with "%llx".  The
  * stored integer is read into the local copy of value and discarded, so
  * only the result of zap_lookup() is reported.
  */
 int
 zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)value);
 
 	return (zap_lookup(os, obj, hexname, 8, 1, &value));
 }
 
 /*
  * Add an entry to obj whose name is key printed with "%llx" and whose
  * content is value as one 8-byte integer.  Returns the result of zap_add().
  */
 int
 zap_add_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)key);
 
 	return (zap_add(os, obj, hexname, 8, 1, &value, tx));
 }
 
 /*
  * Set the entry of obj whose name is key printed with "%llx" to value (one
  * 8-byte integer).  Returns the result of zap_update().
  */
 int
 zap_update_int_key(objset_t *os, uint64_t obj,
     uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)key);
 
 	return (zap_update(os, obj, hexname, 8, 1, &value, tx));
 }
 
 /*
  * Look up in obj the entry whose name is key printed with "%llx" and read
  * its single 8-byte integer into *valuep.  Returns the result of
  * zap_lookup().
  */
 int
 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)key);
 
 	return (zap_lookup(os, obj, hexname, 8, 1, valuep));
 }
 
 /*
  * Add delta to the 8-byte integer stored under name in obj.  A missing
  * entry counts as 0, and an entry whose new value is 0 is removed instead
  * of being stored.  A delta of 0 is a no-op that returns 0.
  */
 int
 zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
     dmu_tx_t *tx)
 {
 	uint64_t value = 0;
 
 	if (delta == 0)
 		return (0);
 
 	/* ENOENT is fine: value simply stays 0 */
 	int err = zap_lookup(os, obj, name, 8, 1, &value);
 	if (err != 0 && err != ENOENT)
 		return (err);
 
 	value += delta;
 	if (value == 0)
 		return (zap_remove(os, obj, name, tx));
 
 	return (zap_update(os, obj, name, 8, 1, &value, tx));
 }
 
 /*
  * zap_increment() for the entry of obj whose name is key printed with
  * "%llx".
  */
 int
 zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
     dmu_tx_t *tx)
 {
 	char hexname[20];
 
 	(void) snprintf(hexname, sizeof (hexname), "%llx", (longlong_t)key);
 
 	return (zap_increment(os, obj, hexname, delta, tx));
 }
 
 /*
  * Routines for iterating over the attributes.
  */
 
 /*
  * Fetch the entry at or after the cursor position (zc_hash, zc_cd) into za.
  *
  * On success returns 0, moves the cursor onto the entry found and fills in
  * za's integer length/count, first integer, name and normalization-conflict
  * flag.  Returns ENOENT when there is no further entry, or the error from
  * zap_deref_leaf().  The leaf is kept in zc->zc_leaf between calls with
  * its dbuf held; only its l_rwlock is dropped before returning.
  */
 int
 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 {
 	int err = ENOENT;
 	zap_entry_handle_t zeh;
 	zap_leaf_t *l;
 
 	/* retrieve the next entry at or after zc_hash/zc_cd */
 	/* if no entry, return ENOENT */
 
 	/*
 	 * If we are reading from the beginning, we're almost certain to
 	 * iterate over the entire ZAP object.  If there are multiple leaf
 	 * blocks (freeblk > 2), prefetch the whole object (up to
 	 * dmu_prefetch_max bytes), so that we read the leaf blocks
 	 * concurrently. (Unless noprefetch was requested via
 	 * zap_cursor_init_noprefetch()).
 	 */
 	if (zc->zc_hash == 0 && zap_iterate_prefetch &&
 	    zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
 		dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
 		    zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
 		    ZIO_PRIORITY_ASYNC_READ);
 	}
 
 	if (zc->zc_leaf) {
 		/* re-lock the leaf cached by a previous call */
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 
 		/*
 		 * The leaf was either shrunk or split.
 		 */
 		if ((zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_block_type == 0) ||
 		    (ZAP_HASH_IDX(zc->zc_hash,
 		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
 		    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
 			/* it no longer covers zc_hash: drop it and look up again */
 			zap_put_leaf(zc->zc_leaf);
 			zc->zc_leaf = NULL;
 		}
 	}
 
 again:
 	if (zc->zc_leaf == NULL) {
 		err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
 		    &zc->zc_leaf);
 		if (err != 0)
 			return (err);
 	}
 	l = zc->zc_leaf;
 
 	err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
 
 	if (err == ENOENT) {
 		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
 			/* a leaf with prefix length 0 covers every hash: done */
 			zc->zc_hash = -1ULL;
 			zc->zc_cd = 0;
 		} else {
 			/* mask of the hash bits this leaf does not constrain */
 			uint64_t nocare = (1ULL <<
 			    (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
 
 			/* advance to the first hash past this leaf's range */
 			zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
 			zc->zc_cd = 0;
 
 			if (zc->zc_hash == 0) {
 				/* wrapped around: this was the last leaf */
 				zc->zc_hash = -1ULL;
 			} else {
 				zap_put_leaf(zc->zc_leaf);
 				zc->zc_leaf = NULL;
 				goto again;
 			}
 		}
 	}
 
 	if (err == 0) {
 		zc->zc_hash = zeh.zeh_hash;
 		zc->zc_cd = zeh.zeh_cd;
 		za->za_integer_length = zeh.zeh_integer_size;
 		za->za_num_integers = zeh.zeh_num_integers;
 		if (zeh.zeh_num_integers == 0) {
 			za->za_first_integer = 0;
 		} else {
 			/* the result is only asserted; err is overwritten below */
 			err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
 			ASSERT(err == 0 || err == EOVERFLOW);
 		}
 		err = zap_entry_read_name(zap, &zeh,
 		    za->za_name_len, za->za_name);
 		ASSERT(err == 0);
 
 		za->za_normalization_conflict =
 		    zap_entry_normalization_conflict(&zeh,
 		    NULL, za->za_name, zap);
 	}
 	/* drop only the lock; the leaf stays cached in the cursor */
 	rw_exit(&zc->zc_leaf->l_rwlock);
 	return (err);
 }
 
 /*
  * Accumulate leaf statistics into zs for the len pointer-table entries in
  * tbl.  Consecutive entries naming the same block are visited once, and
  * entries whose leaf cannot be obtained are skipped.
  */
 static void
 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
 {
 	uint64_t lastblk = 0;
 
 	/*
 	 * NB: lastblk is local to one call, so a leaf with more pointers than
 	 * an entire ptrtbl block can hold gets accounted for more than once.
 	 */
 	for (int i = 0; i < len; i++) {
 		const uint64_t leaf_blk = tbl[i];
 		zap_leaf_t *l;
 
 		if (leaf_blk == lastblk)
 			continue;
 		lastblk = leaf_blk;
 
 		if (zap_get_leaf_byblk(zap, leaf_blk, NULL, RW_READER,
 		    &l) != 0)
 			continue;
 
 		zap_leaf_stats(zap, l, zs);
 		zap_put_leaf(l);
 	}
 }
 
 /*
  * Fill zs with statistics about the zap: header fields, pointer-table
  * fields and, through zap_stats_ptrtbl(), per-leaf statistics.
  * Pointer-table blocks that cannot be held are skipped.
  */
 void
 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 {
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	zs->zs_blocksize = 1ULL << bs;
 
 	/* Header (zap_phys_t) fields. */
 	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
 	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
 	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
 	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
 	zs->zs_magic = zap_f_phys(zap)->zap_magic;
 	zs->zs_salt = zap_f_phys(zap)->zap_salt;
 
 	/* Pointer-table fields. */
 	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
 	zs->zs_ptrtbl_blks_copied =
 	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
 	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
 	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/* The whole table sits in the header block. */
 		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
 		return;
 	}
 
 	/* Prefetch the table's blocks, then walk them one at a time. */
 	dmu_prefetch_by_dnode(zap->zap_dnode, 0,
 	    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
 	    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
 	    ZIO_PRIORITY_SYNC_READ);
 
 	for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 	    b++) {
 		dmu_buf_t *db;
 
 		if (dmu_buf_hold_by_dnode(zap->zap_dnode,
 		    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
 		    FTAG, &db, DMU_READ_NO_PREFETCH) != 0)
 			continue;
 
 		zap_stats_ptrtbl(zap, db->db_data,
 		    1<<(bs-3), zs);
 		dmu_buf_rele(db, FTAG);
 	}
 }
 
 /*
  * Find last allocated block and update freeblk.
  */
 /*
  * Recompute zap_freeblk as one past the highest block still in use: the
  * last block of an external pointer table (if any) or any block referenced
  * by a pointer-table entry.  Leaves zap_freeblk untouched if an entry cannot
  * be read.  The caller must hold zap_rwlock as writer.
  */
 static void
 zap_trunc(zap_t *zap)
 {
 	uint64_t nentries;
 	uint64_t lastblk;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk > 0) {
 		/*
 		 * External ptrtbl.  Shift a 64-bit one: "1 << zt_shift" is
 		 * an int shift, which is undefined once zt_shift reaches
 		 * the width of int.
 		 */
 		nentries = (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 		lastblk = zap_f_phys(zap)->zap_ptrtbl.zt_blk +
 		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks - 1;
 	} else {
 		/* Embedded ptrtbl */
 		nentries = (1ULL << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 		lastblk = 0;
 	}
 
 	/* find the highest block id any entry points at */
 	for (uint64_t idx = 0; idx < nentries; idx++) {
 		uint64_t blk;
 		if (zap_idx_to_blk(zap, idx, &blk) != 0)
 			return;
 		if (blk > lastblk)
 			lastblk = blk;
 	}
 
 	ASSERT3U(lastblk, <, zap_f_phys(zap)->zap_freeblk);
 
 	zap_f_phys(zap)->zap_freeblk = lastblk + 1;
 }
 
 /*
  * ZAP shrinking algorithm.
  *
  * We shrink the ZAP by recursively removing empty leaves. We can remove an
  * empty leaf only if it has a sibling. Sibling leaves have the same prefix
  * length and their prefixes differ only by the least significant (sibling)
  * bit. We require both siblings to be empty. This eliminates the need to
  * rehash the non-empty remaining leaf. When we have removed one of two empty
  * siblings, we set the ptrtbl entries of the removed leaf to point to the
  * remaining leaf. The prefix length of the remaining leaf is decremented. As
  * a result, it has a new prefix and it might have a new sibling. So, we
  * repeat the process.
  *
  * Steps:
  * 1. Check if a sibling leaf (sl) exists and it is empty.
  * 2. Release the leaf (l) if it has the sibling bit (slbit) equal to 1.
  * 3. Release the sibling (sl) to dereference it again with WRITER lock.
  * 4. Upgrade zapdir lock to WRITER (once).
  * 5. Dereference released leaves again.
  * 6. If it is needed, recheck whether both leaves are still siblings and empty.
  * 7. Set ptrtbl pointers of the removed leaf (slbit 1) to point out to blkid of
  * the remaining leaf (slbit 0).
  * 8. Free disk block of the removed leaf (dmu_free_range).
  * 9. Decrement prefix_len of the remaining leaf.
  * 10. Repeat the steps.
  */
 static int
 zap_shrink(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
 {
 	zap_t *zap = zn->zn_zap;
 	int64_t zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 	uint64_t hash = zn->zn_hash;
 	uint64_t prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
 	uint64_t prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 	boolean_t trunc = B_FALSE;
 	int err = 0;
 
 	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
 	ASSERT3U(prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 	ASSERT3U(ZAP_HASH_IDX(hash, prefix_len), ==, prefix);
 
 	boolean_t writer = B_FALSE;
 
 	/*
 	 * To avoid deadlock always deref leaves in the same order -
 	 * sibling 0 first, then sibling 1.
 	 */
 	while (prefix_len) {
 		zap_leaf_t *sl;
 		int64_t prefix_diff = zt_shift - prefix_len;
 		uint64_t sl_prefix = prefix ^ 1;
 		uint64_t sl_hash = ZAP_PREFIX_HASH(sl_prefix, prefix_len);
 		int slbit = prefix & 1;
 
 		ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nentries, ==, 0);
 
 		/*
 		 * Check if there is a sibling by reading ptrtbl ptrs.
 		 */
 		if (check_sibling_ptrtbl_range(zap, sl_prefix, prefix_len) == 0)
 			break;
 
 		/*
 		 * sibling 1, unlock it - we haven't yet dereferenced sibling 0.
 		 */
 		if (slbit == 1) {
 			zap_put_leaf(l);
 			l = NULL;
 		}
 
 		/*
 		 * Dereference sibling leaf and check if it is empty.
 		 */
 		if ((err = zap_deref_leaf(zap, sl_hash, tx, RW_READER,
 		    &sl)) != 0)
 			break;
 
 		ASSERT3U(ZAP_HASH_IDX(sl_hash, prefix_len), ==, sl_prefix);
 
 		/*
 		 * Check if we have a sibling and it is empty.
 		 */
 		if (zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len ||
 		    zap_leaf_phys(sl)->l_hdr.lh_nentries != 0) {
 			zap_put_leaf(sl);
 			break;
 		}
 
 		zap_put_leaf(sl);
 
 		/*
 		 * If there two empty sibling, we have work to do, so
 		 * we need to lock ZAP ptrtbl as WRITER.
 		 */
 		if (!writer && (writer = zap_tryupgradedir(zap, tx)) == 0) {
 			/* We failed to upgrade */
 			if (l != NULL) {
 				zap_put_leaf(l);
 				l = NULL;
 			}
 
 			/*
 			 * Usually, the right way to upgrade from a READER lock
 			 * to a WRITER lock is to call zap_unlockdir() and
 			 * zap_lockdir(), but we do not have a tag. Instead,
 			 * we do it in more sophisticated way.
 			 */
 			rw_exit(&zap->zap_rwlock);
 			rw_enter(&zap->zap_rwlock, RW_WRITER);
 			dmu_buf_will_dirty(zap->zap_dbuf, tx);
 
 			zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 			writer = B_TRUE;
 		}
 
 		/*
 		 * Here we have WRITER lock for ptrtbl.
 		 * Now, we need a WRITER lock for both siblings leaves.
 		 * Also, we have to recheck if the leaves are still siblings
 		 * and still empty.
 		 */
 		if (l == NULL) {
 			/* sibling 0 */
 			if ((err = zap_deref_leaf(zap, (slbit ? sl_hash : hash),
 			    tx, RW_WRITER, &l)) != 0)
 				break;
 
 			/*
 			 * The leaf isn't empty anymore or
 			 * it was shrunk/split while our locks were down.
 			 */
 			if (zap_leaf_phys(l)->l_hdr.lh_nentries != 0 ||
 			    zap_leaf_phys(l)->l_hdr.lh_prefix_len != prefix_len)
 				break;
 		}
 
 		/* sibling 1 */
 		if ((err = zap_deref_leaf(zap, (slbit ? hash : sl_hash), tx,
 		    RW_WRITER, &sl)) != 0)
 			break;
 
 		/*
 		 * The leaf isn't empty anymore or
 		 * it was shrunk/split while our locks were down.
 		 */
 		if (zap_leaf_phys(sl)->l_hdr.lh_nentries != 0 ||
 		    zap_leaf_phys(sl)->l_hdr.lh_prefix_len != prefix_len) {
 			zap_put_leaf(sl);
 			break;
 		}
 
 		/* If we have gotten here, we have a leaf to collapse */
 		uint64_t idx = (slbit ? prefix : sl_prefix) << prefix_diff;
 		uint64_t nptrs = (1ULL << prefix_diff);
 		uint64_t sl_blkid = sl->l_blkid;
 
 		/*
 		 * Set ptrtbl entries to point out to the slibling 0 blkid
 		 */
 		if ((err = zap_set_idx_range_to_blk(zap, idx, nptrs, l->l_blkid,
 		    tx)) != 0) {
 			zap_put_leaf(sl);
 			break;
 		}
 
 		/*
 		 * Free sibling 1 disk block.
 		 */
 		int bs = FZAP_BLOCK_SHIFT(zap);
 		if (sl_blkid == zap_f_phys(zap)->zap_freeblk - 1)
 			trunc = B_TRUE;
 
 		(void) dmu_free_range(zap->zap_objset, zap->zap_object,
 		    sl_blkid << bs, 1 << bs, tx);
 		zap_put_leaf(sl);
 
 		zap_f_phys(zap)->zap_num_leafs--;
 
 		/*
 		 * Update prefix and prefix_len.
 		 */
 		zap_leaf_phys(l)->l_hdr.lh_prefix >>= 1;
 		zap_leaf_phys(l)->l_hdr.lh_prefix_len--;
 
 		prefix = zap_leaf_phys(l)->l_hdr.lh_prefix;
 		prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 	}
 
 	if (trunc)
 		zap_trunc(zap);
 
 	if (l != NULL)
 		zap_put_leaf(l);
 
 	return (err);
 }
 
-/* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
 	"When iterating ZAP object, prefetch it");
 
-/* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , zap_shrink_enabled, INT, ZMOD_RW,
 	"Enable ZAP shrinking");
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index dfe309aa551f..55b60006e58c 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -1,2036 +1,2035 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2024, Klara, Inc.
  */
 
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
 #include <sys/zfs_context.h>
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/btree.h>
 #include <sys/arc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_impl.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
 /*
  * The maximum size (in bytes) of a microzap before it is converted to a
  * fatzap. It will be rounded up to next multiple of 512 (SPA_MINBLOCKSIZE).
  *
  * By definition, a microzap must fit into a single block, so this has
  * traditionally been SPA_OLD_MAXBLOCKSIZE, and is set to that by default.
  * Setting this higher requires both the large_blocks feature (to even create
  * blocks that large) and the large_microzap feature (to enable the stream
  * machinery to understand not to try to split a microzap block).
  *
  * If large_microzap is enabled, this value will be clamped to
  * spa_maxblocksize(). If not, it will be clamped to SPA_OLD_MAXBLOCKSIZE.
  */
 static int zap_micro_max_size = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * Return the effective microzap size limit for this pool: the tunable rounded
  * up to a multiple of SPA_MINBLOCKSIZE, clamped as described above.
  */
 uint64_t
 zap_get_micro_max_size(spa_t *spa)
 {
 	uint64_t limit = P2ROUNDUP(zap_micro_max_size, SPA_MINBLOCKSIZE);
 
 	if (limit > SPA_OLD_MAXBLOCKSIZE) {
 		/* Going past the old limit requires large_microzap. */
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_MICROZAP))
 			return (SPA_OLD_MAXBLOCKSIZE);
 		return (MIN(limit, spa_maxblocksize(spa)));
 	}
 
 	return (limit);
 }
 
 static int mzap_upgrade(zap_t **zapp,
     const void *tag, dmu_tx_t *tx, zap_flags_t flags);
 
 /*
  * Return the ZAP flags: zero for a microzap, otherwise the flags word stored
  * in the fatzap header.
  */
 uint64_t
 zap_getflags(zap_t *zap)
 {
 	if (!zap->zap_ismicro)
 		return (zap_f_phys(zap)->zap_flags);
 
 	return (0);
 }
 
 /*
  * Number of significant hash bits: 48 when ZAP_FLAG_HASH64 is set, else 28.
  */
 int
 zap_hashbits(zap_t *zap)
 {
 	return ((zap_getflags(zap) & ZAP_FLAG_HASH64) ? 48 : 28);
 }
 
 /*
  * Largest collision differentiator value: 16 bits wide when ZAP_FLAG_HASH64
  * is set, otherwise the full 32-bit range.
  */
 uint32_t
 zap_maxcd(zap_t *zap)
 {
 	if (!(zap_getflags(zap) & ZAP_FLAG_HASH64))
 		return (-1U);
 
 	return ((1<<16)-1);
 }
 
 /*
  * Compute the hash of the name in zn.
  *
  * With ZAP_FLAG_PRE_HASHED_KEY the first 64-bit word of the original key is
  * used directly.  Otherwise the normalized key is folded, one byte at a time,
  * through zfs_crc64_table starting from the ZAP's salt.  In both cases only
  * the top zap_hashbits() bits of the result are kept.
  */
 static uint64_t
 zap_hash(zap_name_t *zn)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t h = 0;
 
 	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
 		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
 		h = *(uint64_t *)zn->zn_key_orig;
 	} else {
 		h = zap->zap_salt;
 		ASSERT(h != 0);
 		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 
 		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
 			const uint64_t *wp = zn->zn_key_norm;
 
 			ASSERT(zn->zn_key_intlen == 8);
 			for (int i = 0; i < zn->zn_key_norm_numints;
 			    wp++, i++) {
 				uint64_t word = *wp;
 
 				/* Feed the word low byte first. */
 				for (int j = 0; j < 8; j++) {
 					h = (h >> 8) ^
 					    zfs_crc64_table[(h ^ word) & 0xFF];
 					word >>= NBBY;
 				}
 			}
 		} else {
 			const uint8_t *cp = zn->zn_key_norm;
 
 			/*
 			 * We previously stored the terminating null on
 			 * disk, but didn't hash it, so we need to
 			 * continue to not hash it.  (The
 			 * zn_key_*_numints includes the terminating
 			 * null for non-binary keys.)
 			 */
 			int len = zn->zn_key_norm_numints - 1;
 
 			ASSERT(zn->zn_key_intlen == 1);
 			for (int i = 0; i < len; cp++, i++) {
 				h = (h >> 8) ^
 				    zfs_crc64_table[(h ^ *cp) & 0xFF];
 			}
 		}
 	}
 	/*
 	 * Don't use all 64 bits, since we need some in the cookie for
 	 * the collision differentiator.  We MUST use the high bits,
 	 * since those are the ones that we first pay attention to when
 	 * choosing the bucket.
 	 */
 	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
 
 	return (h);
 }
 
 /*
  * Normalize name into namenorm (capacity outlen bytes) according to
  * normflags.  Returns the error code reported by u8_textprep_str() through
  * its error argument; 0 means success.  Only valid for string-keyed ZAPs.
  */
 static int
 zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags,
     size_t outlen)
 {
 	int error = 0;
 	size_t srclen;
 
 	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
 
 	/* The input length counts the terminating NUL. */
 	srclen = strlen(name) + 1;
 
 	/* Only the error out-parameter is consulted, not the return value. */
 	(void) u8_textprep_str((char *)name, &srclen, namenorm, &outlen,
 	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
 	    U8_UNICODE_LATEST, &error);
 
 	return (error);
 }
 
 /*
  * Decide whether matchname matches the key held in zn.  With MT_NORMALIZE
  * the candidate is normalized with zn's normflags and compared against the
  * normalized key; otherwise the original key is compared byte for byte.
  * A normalization failure counts as "no match".
  */
 boolean_t
 zap_match(zap_name_t *zn, const char *matchname)
 {
 	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
 
 	if (!(zn->zn_matchtype & MT_NORMALIZE))
 		return (strcmp(zn->zn_key_orig, matchname) == 0);
 
 	boolean_t matched = B_FALSE;
 	size_t buflen = zn->zn_normbuf_len;
 	char stackbuf[ZAP_MAXNAMELEN];
 	char *candidate = stackbuf;
 
 	/*
 	 * A long-name buffer cannot live on the stack as it would exceed
 	 * the stack-limit of 1024, so allocate it instead.
 	 */
 	if (buflen > ZAP_MAXNAMELEN)
 		candidate = kmem_alloc(buflen, KM_SLEEP);
 
 	if (zap_normalize(zn->zn_zap, matchname, candidate,
 	    zn->zn_normflags, buflen) == 0)
 		matched = (strcmp(zn->zn_key_norm, candidate) == 0);
 
 	if (candidate != stackbuf)
 		kmem_free(candidate, buflen);
 
 	return (matched);
 }
 
 static kmem_cache_t *zap_name_cache;
 static kmem_cache_t *zap_attr_cache;
 static kmem_cache_t *zap_name_long_cache;
 static kmem_cache_t *zap_attr_long_cache;
 
 void
 zap_init(void)
 {
 	zap_name_cache = kmem_cache_create("zap_name",
 	    sizeof (zap_name_t) + ZAP_MAXNAMELEN, 0, NULL, NULL,
 	    NULL, NULL, NULL, 0);
 
 	zap_attr_cache = kmem_cache_create("zap_attr_cache",
 	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN,  0, NULL,
 	    NULL, NULL, NULL, NULL, 0);
 
 	zap_name_long_cache = kmem_cache_create("zap_name_long",
 	    sizeof (zap_name_t) + ZAP_MAXNAMELEN_NEW, 0, NULL, NULL,
 	    NULL, NULL, NULL, 0);
 
 	zap_attr_long_cache = kmem_cache_create("zap_attr_long_cache",
 	    sizeof (zap_attribute_t) + ZAP_MAXNAMELEN_NEW,  0, NULL,
 	    NULL, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the four kmem caches created by zap_init().
  */
 void
 zap_fini(void)
 {
 	kmem_cache_destroy(zap_name_cache);
 	kmem_cache_destroy(zap_attr_cache);
 	kmem_cache_destroy(zap_name_long_cache);
 	kmem_cache_destroy(zap_attr_long_cache);
 }
 
 /*
  * Allocate a zap_name_t for zap from the regular or the long-name cache and
  * record the matching normalization buffer length.  Only zn_zap and
  * zn_normbuf_len are initialized here.
  */
 static zap_name_t *
 zap_name_alloc(zap_t *zap, boolean_t longname)
 {
 	zap_name_t *zn;
 
 	if (longname) {
 		zn = kmem_cache_alloc(zap_name_long_cache, KM_SLEEP);
 		zn->zn_zap = zap;
 		zn->zn_normbuf_len = ZAP_MAXNAMELEN_NEW;
 	} else {
 		zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
 		zn->zn_zap = zap;
 		zn->zn_normbuf_len = ZAP_MAXNAMELEN;
 	}
 
 	return (zn);
 }
 
 /*
  * Return zn to the cache it came from; zn_normbuf_len tells which one.
  */
 void
 zap_name_free(zap_name_t *zn)
 {
 	if (zn->zn_normbuf_len != ZAP_MAXNAMELEN) {
 		ASSERT3U(zn->zn_normbuf_len, ==, ZAP_MAXNAMELEN_NEW);
 		kmem_cache_free(zap_name_long_cache, zn);
 		return;
 	}
 
 	kmem_cache_free(zap_name_cache, zn);
 }
 
 /*
  * Fill in zn for the string key "key" and match type mt: record the original
  * key, compute the normalized key and the hash, and, when the match flags
  * differ from the ZAP's own, re-normalize for matching.
  *
  * zn keeps a pointer to key (zn_key_orig); the string is not copied.
  *
  * Returns 0 on success, or ENOTSUP if normalization fails or if a match type
  * is requested on a ZAP without normalization flags.
  */
 static int
 zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
 {
 	zap_t *zap = zn->zn_zap;
 	size_t key_len = strlen(key) + 1;
 
 	/* Make sure zn is allocated for longname if key is long */
 	IMPLY(key_len > ZAP_MAXNAMELEN,
 	    zn->zn_normbuf_len == ZAP_MAXNAMELEN_NEW);
 
 	zn->zn_key_intlen = sizeof (*key);
 	zn->zn_key_orig = key;
 	zn->zn_key_orig_numints = key_len;
 	zn->zn_matchtype = mt;
 	zn->zn_normflags = zap->zap_normflags;
 
 	/*
 	 * If we're dealing with a case sensitive lookup on a mixed or
 	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
 	 * will fold case to all caps overriding the lookup request.
 	 */
 	if (mt & MT_MATCH_CASE)
 		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
 
 	if (zap->zap_normflags) {
 		/*
 		 * We *must* use zap_normflags because this normalization is
 		 * what the hash is computed from.
 		 */
 		if (zap_normalize(zap, key, zn->zn_normbuf,
 		    zap->zap_normflags, zn->zn_normbuf_len) != 0)
 			return (SET_ERROR(ENOTSUP));
 		zn->zn_key_norm = zn->zn_normbuf;
 		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
 	} else {
 		/* No normalization: the normalized key is the original. */
 		if (mt != 0)
 			return (SET_ERROR(ENOTSUP));
 		zn->zn_key_norm = zn->zn_key_orig;
 		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
 	}
 
 	/* The hash must be taken before the buffer is re-normalized below. */
 	zn->zn_hash = zap_hash(zn);
 
 	if (zap->zap_normflags != zn->zn_normflags) {
 		/*
 		 * We *must* use zn_normflags because this normalization is
 		 * what the matching is based on.  (Not the hash!)
 		 */
 		if (zap_normalize(zap, key, zn->zn_normbuf,
 		    zn->zn_normflags, zn->zn_normbuf_len) != 0)
 			return (SET_ERROR(ENOTSUP));
 		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
 	}
 
 	return (0);
 }
 
 /*
  * Allocate and initialize a zap_name_t for a string key.  The long-name
  * cache is used when the key (including its NUL) exceeds ZAP_MAXNAMELEN.
  * Returns NULL if zap_name_init_str() fails.
  */
 zap_name_t *
 zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
 {
 	boolean_t longname = (strlen(key) + 1 > ZAP_MAXNAMELEN);
 	zap_name_t *zn = zap_name_alloc(zap, longname);
 
 	if (zap_name_init_str(zn, key, mt) == 0)
 		return (zn);
 
 	zap_name_free(zn);
 	return (NULL);
 }
 
 /*
  * Allocate and initialize a zap_name_t for a key made of numints 64-bit
  * integers.  The key is referenced, not copied, and serves as both the
  * original and the normalized key.  The object always comes from the regular
  * (short-name) cache.
  */
 static zap_name_t *
 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
 {
 	zap_name_t *zn = kmem_cache_alloc(zap_name_cache, KM_SLEEP);
 
 	ASSERT(zap->zap_normflags == 0);
 	zn->zn_zap = zap;
 	zn->zn_key_intlen = sizeof (*key);
 	zn->zn_key_orig = zn->zn_key_norm = key;
 	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
 	zn->zn_matchtype = 0;
 	/* Lets zap_name_free() return the object to zap_name_cache. */
 	zn->zn_normbuf_len = ZAP_MAXNAMELEN;
 
 	zn->zn_hash = zap_hash(zn);
 	return (zn);
 }
 
 /*
  * Byteswap a microzap block of "size" bytes in place: the three header
  * words, then the value and cd of every entry chunk.
  */
 static void
 mzap_byteswap(mzap_phys_t *buf, size_t size)
 {
 	/* One MZAP_ENT_LEN-sized slot is not an entry chunk. */
 	int nents = (size / MZAP_ENT_LEN) - 1;
 
 	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
 	buf->mz_salt = BSWAP_64(buf->mz_salt);
 	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
 
 	for (int i = 0; i < nents; i++) {
 		mzap_ent_phys_t *ent = &buf->mz_chunk[i];
 
 		ent->mze_value = BSWAP_64(ent->mze_value);
 		ent->mze_cd = BSWAP_32(ent->mze_cd);
 	}
 }
 
 /*
  * Byteswap a ZAP block in place.  The first 64-bit word, in either byte
  * order, tells a microzap block apart; everything else is handed to
  * fzap_byteswap().
  */
 void
 zap_byteswap(void *buf, size_t size)
 {
 	uint64_t block_type = *(uint64_t *)buf;
 
 	if (block_type != ZBT_MICRO && block_type != BSWAP_64(ZBT_MICRO)) {
 		fzap_byteswap(buf, size);
 		return;
 	}
 
 	mzap_byteswap(buf, size);
 }
 
 __attribute__((always_inline)) inline
 static int
 mze_compare(const void *arg1, const void *arg2)
 {
 	const mzap_ent_t *mze1 = arg1;
 	const mzap_ent_t *mze2 = arg2;
 
 	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
 	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
 }
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
     mze_compare)
 
 /*
  * Add an in-core B-tree entry for the microzap chunk chunkid, whose name
  * hashes to hash.  Only the upper 32 bits of the hash are stored; the cd is
  * copied from the on-disk chunk.  The caller must hold the zap lock as
  * writer.
  */
 static void
 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
 {
 	mzap_ent_t mze;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	/* mze_chunkid is set first; it is what MZE_PHYS() is given below. */
 	mze.mze_chunkid = chunkid;
 	ASSERT0(hash & 0xffffffff);
 	mze.mze_hash = hash >> 32;
 	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
 	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
 	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
 	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
 }
 
 /*
  * Look up the microzap entry matching zn.  Walks all B-tree entries that
  * share zn's hash and returns the first whose on-disk name satisfies
  * zap_match(), or NULL if there is none.  idx is updated as the walk
  * progresses.
  */
 static mzap_ent_t *
 mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
 {
 	mzap_ent_t mze_tofind;
 	mzap_ent_t *mze;
 	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
 
 	ASSERT(zn->zn_zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
 	ASSERT0(zn->zn_hash & 0xffffffff);
 	mze_tofind.mze_hash = zn->zn_hash >> 32;
 	mze_tofind.mze_cd = 0;
 
 	/*
 	 * Search for (hash, cd 0).  If there is no exact match, continue
 	 * from the entry that follows the returned index.
 	 */
 	mze = zfs_btree_find(tree, &mze_tofind, idx);
 	if (mze == NULL)
 		mze = zfs_btree_next(tree, idx, idx);
 	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
 	    mze = zfs_btree_next(tree, idx, idx)) {
 		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
 		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
 			return (mze);
 	}
 
 	return (NULL);
 }
 
 /*
  * Return the smallest collision differentiator not yet used by an entry
  * with this hash: walk the entries for the hash starting at cd 0 and stop
  * at the first gap.
  */
 static uint32_t
 mze_find_unused_cd(zap_t *zap, uint64_t hash)
 {
 	zfs_btree_t *tree = &zap->zap_m.zap_tree;
 	zfs_btree_index_t where;
 	mzap_ent_t key;
 	mzap_ent_t *mze;
 	uint32_t cd = 0;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	ASSERT0(hash & 0xffffffff);
 	hash >>= 32;
 	key.mze_hash = hash;
 	key.mze_cd = 0;
 
 	mze = zfs_btree_find(tree, &key, &where);
 	while (mze && mze->mze_hash == hash && mze->mze_cd == cd) {
 		cd++;
 		mze = zfs_btree_next(tree, &where, &where);
 	}
 
 	return (cd);
 }
 
 /*
  * Each mzap entry requires at max : 4 chunks
  * 3 chunks for names + 1 chunk for value.
  */
 #define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
 	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
 
 /*
  * Check whether the entries colliding on this hash, plus the one about to
  * be added, still need fewer chunks than a default-sized fatzap leaf has.
  */
 static boolean_t
 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
 {
 	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
 	zfs_btree_index_t where;
 	mzap_ent_t key;
 	mzap_ent_t *mze;
 	/* Start at one to account for the new entry being added. */
 	uint32_t nents = 1;
 
 	ASSERT0(hash & 0xffffffff);
 	hash >>= 32;
 	key.mze_hash = hash;
 	key.mze_cd = 0;
 
 	mze = zfs_btree_find(tree, &key, &where);
 	while (mze && mze->mze_hash == hash) {
 		nents++;
 		mze = zfs_btree_next(tree, &where, &where);
 	}
 
 	return (ZAP_LEAF_NUMCHUNKS_DEF > (nents * MZAP_ENT_CHUNKS));
 }
 
 /*
  * Empty and destroy the microzap's in-core B-tree.
  */
 static void
 mze_destroy(zap_t *zap)
 {
 	zfs_btree_clear(&zap->zap_m.zap_tree);
 	zfs_btree_destroy(&zap->zap_m.zap_tree);
 }
 
 /*
  * Build the in-core zap_t for the ZAP whose first block is db and attach it
  * to the dbuf as its user.
  *
  * Returns the new zap_t (unlocked) on success.  If another zap_t was already
  * attached to the dbuf, the one built here is freed and the existing one is
  * returned.  Returns NULL if the block is neither a microzap nor a fatzap
  * header with the expected magic.
  */
 static zap_t *
 mzap_open(dmu_buf_t *db)
 {
 	zap_t *winner;
 	uint64_t *zap_hdr = (uint64_t *)db->db_data;
 	uint64_t zap_block_type = zap_hdr[0];
 	uint64_t zap_magic = zap_hdr[1];
 
 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
 
 	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
 	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
 	/* Held as writer while the zap_t is being filled in. */
 	rw_enter(&zap->zap_rwlock, RW_WRITER);
 	zap->zap_objset = dmu_buf_get_objset(db);
 	zap->zap_object = db->db_object;
 	zap->zap_dbuf = db;
 
 	if (zap_block_type != ZBT_MICRO) {
 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
 		    0);
 		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
 		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
 			winner = NULL;	/* No actual winner here... */
 			goto handle_winner;
 		}
 	} else {
 		zap->zap_ismicro = TRUE;
 	}
 
 	/*
 	 * Make sure that zap_ismicro is set before we let others see
 	 * it, because zap_lockdir() checks zap_ismicro without the lock
 	 * held.
 	 */
 	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
 	winner = dmu_buf_set_user(db, &zap->zap_dbu);
 
 	if (winner != NULL)
 		goto handle_winner;
 
 	if (zap->zap_ismicro) {
 		zap->zap_salt = zap_m_phys(zap)->mz_salt;
 		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 
 		/*
 		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
 		 * overhead on massive inserts below.  It still allows to store
 		 * 62 entries before we have to add 2KB B-tree core node.
 		 */
 		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
 		    mze_find_in_buf, sizeof (mzap_ent_t), 512);
 
 		/*
 		 * Index every in-use chunk (non-empty name) by its hash.
 		 * NOTE(review): the return value of zap_name_init_str() is
 		 * ignored here; on failure zn_hash would not be set for this
 		 * name - confirm that it cannot fail for stored names.
 		 */
 		zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
 		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 			mzap_ent_phys_t *mze =
 			    &zap_m_phys(zap)->mz_chunk[i];
 			if (mze->mze_name[0]) {
 				zap->zap_m.zap_num_entries++;
 				zap_name_init_str(zn, mze->mze_name, 0);
 				mze_insert(zap, i, zn->zn_hash);
 			}
 		}
 		zap_name_free(zn);
 	} else {
 		zap->zap_salt = zap_f_phys(zap)->zap_salt;
 		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 
 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
 		    2*ZAP_LEAF_CHUNKSIZE);
 
 		/*
 		 * The embedded pointer table should not overlap the
 		 * other members.
 		 */
 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
 		    &zap_f_phys(zap)->zap_salt);
 
 		/*
 		 * The embedded pointer table should end at the end of
 		 * the block
 		 */
 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
 		    (uintptr_t)zap_f_phys(zap), ==,
 		    zap->zap_dbuf->db_size);
 	}
 	rw_exit(&zap->zap_rwlock);
 	return (zap);
 
 handle_winner:
 	/* Discard our zap_t; nothing but the lock (and mutex) was set up. */
 	rw_exit(&zap->zap_rwlock);
 	rw_destroy(&zap->zap_rwlock);
 	if (!zap->zap_ismicro)
 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 	kmem_free(zap, sizeof (zap_t));
 	return (winner);
 }
 
 /*
  * This routine "consumes" the caller's hold on the dbuf, which must
  * have the specified tag.
  *
  * Find (or create via mzap_open()) the zap_t attached to db, lock it and
  * return it through *zapp.
  *
  *	lti		lock type requested by the caller
  *	fatreader	if set and the ZAP is a fatzap, take the lock as
  *			READER regardless of lti
  *	adding		if set (with a tx) and the microzap is full, grow its
  *			block or upgrade it to a fatzap before returning
  *
  * Returns 0 with the lock held; EINVAL if the object is not of a ZAP type;
  * EIO if mzap_open() rejects the on-disk contents; or the error from
  * mzap_upgrade(), in which case the lock has been dropped.  *zapp is set to
  * NULL on entry.
  */
 static int
 zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	ASSERT0(db->db_offset);
 	objset_t *os = dmu_buf_get_objset(db);
 	uint64_t obj = db->db_object;
 	dmu_object_info_t doi;
 
 	*zapp = NULL;
 
 	dmu_object_info_from_dnode(dn, &doi);
 	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
 		return (SET_ERROR(EINVAL));
 
 	zap_t *zap = dmu_buf_get_user(db);
 	if (zap == NULL) {
 		zap = mzap_open(db);
 		if (zap == NULL) {
 			/*
 			 * mzap_open() didn't like what it saw on-disk.
 			 * Check for corruption!
 			 */
 			return (SET_ERROR(EIO));
 		}
 	}
 
 	/*
 	 * We're checking zap_ismicro without the lock held, in order to
 	 * tell what type of lock we want.  Once we have some sort of
 	 * lock, see if it really is the right type.  In practice this
 	 * can only be different if it was upgraded from micro to fat,
 	 * and micro wanted WRITER but fat only needs READER.
 	 */
 	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
 	rw_enter(&zap->zap_rwlock, lt);
 	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
 		/* it was upgraded, now we only need reader */
 		ASSERT(lt == RW_WRITER);
 		ASSERT(RW_READER ==
 		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
 		rw_downgrade(&zap->zap_rwlock);
 		lt = RW_READER;
 	}
 
 	zap->zap_objset = os;
 	zap->zap_dnode = dn;
 
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
 
 	ASSERT3P(zap->zap_dbuf, ==, db);
 
 	ASSERT(!zap->zap_ismicro ||
 	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
 	/* A full microzap that is about to gain an entry needs more room. */
 	if (zap->zap_ismicro && tx && adding &&
 	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
 		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
 		if (newsz > zap_get_micro_max_size(dmu_objset_spa(os))) {
 			/* Growing would exceed the limit: go fat instead. */
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
 			int err = mzap_upgrade(zapp, tag, tx, 0);
 			if (err != 0)
 				rw_exit(&zap->zap_rwlock);
 			return (err);
 		}
 		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
 		zap->zap_m.zap_num_chunks =
 		    db->db_size / MZAP_ENT_LEN - 1;
 
 		if (newsz > SPA_OLD_MAXBLOCKSIZE) {
 			dsl_dataset_t *ds = dmu_objset_ds(os);
 			if (!dsl_dataset_feature_is_active(ds,
 			    SPA_FEATURE_LARGE_MICROZAP)) {
 				/*
 				 * A microzap just grew beyond the old limit
 				 * for the first time, so we have to ensure the
 				 * feature flag is activated.
 				 * zap_get_micro_max_size() won't let us get
 				 * here if the feature is not enabled, so we
 				 * don't need any other checks beforehand.
 				 *
 				 * Since we're in open context, we can't
 				 * activate the feature directly, so we instead
 				 * flag it on the dataset for next sync.
 				 */
 				dsl_dataset_dirty(ds, tx);
 				mutex_enter(&ds->ds_lock);
 				ds->ds_feature_activation
 				    [SPA_FEATURE_LARGE_MICROZAP] =
 				    (void *)B_TRUE;
 				mutex_exit(&ds->ds_lock);
 			}
 		}
 	}
 
 	*zapp = zap;
 	return (0);
 }
 
 /*
  * Lock the ZAP of an already-held dnode.  Takes a hold on block 0 and hands
  * it to zap_lockdir_impl(); on success an extra reference is added to dn
  * under tag, on failure the dbuf hold is released.
  */
 static int
 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp)
 {
 	dmu_buf_t *db;
 	int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
 
 	if (err != 0)
 		return (err);
 
 	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
 	if (err == 0) {
 		VERIFY(dnode_add_ref(dn, tag));
 		return (0);
 	}
 
 	dmu_buf_rele(db, tag);
 	return (err);
 }
 
 /*
  * Lock the ZAP object obj in objset os.  Holds the dnode and block 0 under
  * tag and passes them to zap_lockdir_impl().  On failure every hold taken
  * here is released; on success they are kept until zap_unlockdir().
  */
 int
 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
     zap_t **zapp)
 {
 	dnode_t *dn;
 	dmu_buf_t *db;
 	int err = dnode_hold(os, obj, tag, &dn);
 
 	if (err != 0)
 		return (err);
 
 	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
 	if (err == 0) {
 		err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader,
 		    adding, zapp);
 		if (err == 0)
 			return (0);
 		dmu_buf_rele(db, tag);
 	}
 
 	dnode_rele(dn, tag);
 	return (err);
 }
 
 /*
  * Drop the ZAP lock and release the dnode and dbuf holds that were taken
  * under tag.
  */
 void
 zap_unlockdir(zap_t *zap, const void *tag)
 {
 	rw_exit(&zap->zap_rwlock);
 	dnode_rele(zap->zap_dnode, tag);
 	dmu_buf_rele(zap->zap_dbuf, tag);
 }
 
 /*
  * Convert a microzap into a fatzap.  The microzap block is copied aside,
  * the in-core B-tree is destroyed, fzap_upgrade() is run, and every in-use
  * entry from the copy is re-added with its original collision
  * differentiator.
  *
  * When flags is 0 the object's block size is first set to the default
  * fatzap block size; a failure there is returned before anything is
  * changed.  *zapp is updated because fzap_add_cd() may change the zap.
  * The caller must hold the zap lock as writer.
  */
 static int
 mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
 {
 	int err = 0;
 	zap_t *zap = *zapp;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	/* Snapshot the microzap contents before the block is rewritten. */
 	int sz = zap->zap_dbuf->db_size;
 	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
 	memcpy(mzp, zap->zap_dbuf->db_data, sz);
 	int nchunks = zap->zap_m.zap_num_chunks;
 
 	if (!flags) {
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err != 0) {
 			vmem_free(mzp, sz);
 			return (err);
 		}
 	}
 
 	dprintf("upgrading obj=%llu with %u chunks\n",
 	    (u_longlong_t)zap->zap_object, nchunks);
 	/* XXX destroy the tree later, so we can use the stored hash value */
 	mze_destroy(zap);
 
 	fzap_upgrade(zap, tx, flags);
 
 	zap_name_t *zn = zap_name_alloc(zap, B_FALSE);
 	for (int i = 0; i < nchunks; i++) {
 		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
 		/* An empty name marks an unused chunk. */
 		if (mze->mze_name[0] == 0)
 			continue;
 		dprintf("adding %s=%llu\n",
 		    mze->mze_name, (u_longlong_t)mze->mze_value);
 		/* NOTE(review): the return value is not checked here. */
 		zap_name_init_str(zn, mze->mze_name, 0);
 		/* If we fail here, we would end up losing entries */
 		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
 		    tag, tx));
 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
 	}
 	zap_name_free(zn);
 	vmem_free(mzp, sz);
 	*zapp = zap;
 	return (0);
 }
 
 /*
  * The "normflags" determine the behavior of the matchtype_t which is
  * passed to zap_lookup_norm().  Names which have the same normalized
  * version will be stored with the same hash value, and therefore we can
  * perform normalization-insensitive lookups.  We can be Unicode form-
  * insensitive and/or case-insensitive.  The following flags are valid for
  * "normflags":
  *
  * U8_TEXTPREP_NFC
  * U8_TEXTPREP_NFD
  * U8_TEXTPREP_NFKC
  * U8_TEXTPREP_NFKD
  * U8_TEXTPREP_TOUPPER
  *
  * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
  * of them may be supplied.
  */
 /*
  * Initialize block 0 of dn as an empty microzap with the given normflags
  * and a non-zero salt.  If flags is non-zero the new microzap is upgraded
  * to a fatzap right away.
  */
 void
 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 
 	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
 	dmu_buf_will_dirty(db, tx);
 	mzap_phys_t *zp = db->db_data;
 	zp->mz_block_type = ZBT_MICRO;
 	/* OR-ing in 1 guarantees that the salt is never zero. */
 	zp->mz_salt =
 	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
 	zp->mz_normflags = normflags;
 
 	if (flags != 0) {
 		zap_t *zap;
 		/* Only fat zap supports flags; upgrade immediately. */
 		/*
 		 * zap_unlockdir() releases both a dnode and a dbuf hold
 		 * under FTAG, so take the dnode reference it will drop.
 		 */
 		VERIFY(dnode_add_ref(dn, FTAG));
 		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
 		    B_FALSE, B_FALSE, &zap));
 		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
 		zap_unlockdir(zap, FTAG);
 	} else {
 		dmu_buf_rele(db, FTAG);
 	}
 }
 
 /*
  * Allocate a new object of ZAP type ot and initialize it as a ZAP; returns
  * the new object number.  If allocated_dnode is non-NULL the dnode is
  * returned through it, still held under tag; otherwise the hold taken here
  * is released before returning.
  */
 static uint64_t
 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
 {
 	uint64_t obj;
 	dnode_t *dn;
 
 	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
 
 	if (allocated_dnode != NULL) {
 		/* The caller keeps the dnode hold under its own tag. */
 		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
 		    indirect_blockshift, bonustype, bonuslen, dnodesize,
 		    allocated_dnode, tag, tx);
 		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
 		return (obj);
 	}
 
 	obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
 	    indirect_blockshift, bonustype, bonuslen, dnodesize,
 	    &dn, FTAG, tx);
 	mzap_create_impl(dn, normflags, flags, tx);
 	dnode_rele(dn, FTAG);
 
 	return (obj);
 }
 
 /*
  * Create a ZAP at the caller-chosen object number obj, passing a dnodesize
  * of 0 to zap_create_claim_dnsize().
  */
 int
 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
 	    0, tx));
 }
 
 /*
  * Same as zap_create_claim_norm_dnsize() with normflags of 0.
  */
 int
 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
 	return (zap_create_claim_norm_dnsize(os, obj,
 	    0, ot, bonustype, bonuslen, dnodesize, tx));
 }
 
 /*
  * Same as zap_create_claim_norm_dnsize() with a dnodesize of 0.
  */
 int
 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
     dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
 	    bonuslen, 0, tx));
 }
 
 /*
  * Claim object number obj as an object of ZAP type ot and initialize it as
  * an empty microzap with the given normflags.  Returns the error from
  * dmu_object_claim_dnsize() or dnode_hold(), or 0 on success.
  */
 int
 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
     dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
     int dnodesize, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
 	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
 	    dnodesize, tx);
 	if (error != 0)
 		return (error);
 
 	error = dnode_hold(os, obj, FTAG, &dn);
 	if (error != 0)
 		return (error);
 
 	/* No ZAP flags here, so the object stays a microzap. */
 	mzap_create_impl(dn, normflags, 0, tx);
 
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Create a new ZAP object with normflags of 0; returns its object number.
  */
 uint64_t
 zap_create(objset_t *os, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
 }
 
 /*
  * Same as zap_create_norm_dnsize() with normflags of 0.
  */
 uint64_t
 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
 	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
 	    dnodesize, tx));
 }
 
 /*
  * Same as zap_create_norm_dnsize() with a dnodesize of 0.
  */
 uint64_t
 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
 	    0, tx));
 }
 
 /*
  * Create a new ZAP object with the given normflags and dnodesize.  ZAP
  * flags and both block shifts are passed to zap_create_impl() as 0, and no
  * dnode is returned to the caller.
  */
 uint64_t
 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
 	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
 	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
 }
 
 /*
  * Same as zap_create_flags_dnsize() with a dnodesize of 0.
  */
 uint64_t
 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	return (zap_create_flags_dnsize(os, normflags, flags, ot,
 	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
 }
 
 /*
  * Create a new ZAP object with explicit ZAP flags, block shifts and
  * dnodesize; returns its object number.  No dnode is returned to the
  * caller.
  */
 uint64_t
 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
 {
 	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
 	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
 	    tx));
 }
 
 /*
  * Create a zap object and return a pointer to the newly allocated dnode via
  * the allocated_dnode argument.  The returned dnode will be held and the
  * caller is responsible for releasing the hold by calling dnode_rele().
  * The hold is taken under the caller-supplied tag.  Returns the new object
  * number.
  */
 uint64_t
 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, int dnodesize,
     dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
 {
 	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
 	    indirect_blockshift, bonustype, bonuslen, dnodesize,
 	    allocated_dnode, tag, tx));
 }
 
 /*
  * Destroy the ZAP object zapobj in objset os as part of transaction tx.
  * Returns the result of dmu_object_free().
  */
 int
 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
 {
 	int error;
 
 	/*
 	 * dmu_object_free will free the object number and free the
 	 * data.  Freeing the data will cause our pageout function to be
 	 * called, which will destroy our data (zap_leaf_t's and zap_t).
 	 */
 	error = dmu_object_free(os, zapobj, tx);
 	return (error);
 }
 
 void
 zap_evict_sync(void *dbu)
 {
 	zap_t *zap = dbu;
 
 	rw_destroy(&zap->zap_rwlock);
 
 	if (zap->zap_ismicro)
 		mze_destroy(zap);
 	else
 		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
 
 	kmem_free(zap, sizeof (zap_t));
 }
 
 /*
  * Store the number of entries of ZAP object zapobj in *count.
  *
  * For a microzap the count is read from zap_m.zap_num_entries; otherwise
  * fzap_count() supplies it.  Returns 0 on success, or the error from
  * zap_lockdir() / fzap_count().
  */
 int
 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	if (zap->zap_ismicro)
 		*count = zap->zap_m.zap_num_entries;
 	else
 		error = fzap_count(zap, count);
 
 	zap_unlockdir(zap, FTAG);
 	return (error);
 }
 
 /*
  * Report whether some other microzap entry with the same hash as mze also
  * matches mze's name under normalization.
  *
  * zn may be NULL; if not specified, it will be computed if needed (and
  * freed again before returning).  idx is the btree position of mze.
  * See also the comment above zap_entry_normalization_conflict().
  *
  * Returns B_FALSE immediately when the zap has no normalization flags.
  */
 static boolean_t
 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
     zfs_btree_index_t *idx)
 {
 	boolean_t conflict = B_FALSE;
 	boolean_t own_zn = B_FALSE;
 	zfs_btree_index_t cur;
 	mzap_ent_t *nbr;
 
 	if (zap->zap_normflags == 0)
 		return (B_FALSE);
 
 	/* Scan the same-hash neighbours that precede mze in the tree. */
 	nbr = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &cur);
 	while (nbr != NULL && nbr->mze_hash == mze->mze_hash) {
 		if (zn == NULL) {
 			zn = zap_name_alloc_str(zap,
 			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
 			own_zn = B_TRUE;
 		}
 		if (zap_match(zn, MZE_PHYS(zap, nbr)->mze_name)) {
 			conflict = B_TRUE;
 			goto out;
 		}
 		nbr = zfs_btree_prev(&zap->zap_m.zap_tree, &cur, &cur);
 	}
 
 	/* Then the same-hash neighbours that follow it. */
 	nbr = zfs_btree_next(&zap->zap_m.zap_tree, idx, &cur);
 	while (nbr != NULL && nbr->mze_hash == mze->mze_hash) {
 		if (zn == NULL) {
 			zn = zap_name_alloc_str(zap,
 			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
 			own_zn = B_TRUE;
 		}
 		if (zap_match(zn, MZE_PHYS(zap, nbr)->mze_name)) {
 			conflict = B_TRUE;
 			goto out;
 		}
 		nbr = zfs_btree_next(&zap->zap_m.zap_tree, &cur, &cur);
 	}
 
 out:
 	/* Only release the name if it was allocated in this function. */
 	if (own_zn)
 		zap_name_free(zn);
 	return (conflict);
 }
 
 /*
  * Routines for manipulating attributes.
  */
 
 /*
  * Look up name in ZAP object zapobj and copy its value into buf.
  * Forwards to zap_lookup_norm() with matchtype 0 and no realname or
  * normalization-conflict output.
  */
 int
 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	int error;
 
 	error = zap_lookup_norm(os, zapobj, name, integer_size,
 	    num_integers, buf, 0, NULL, 0, NULL);
 	return (error);
 }
 
 /*
  * Look up name in an already locked zap.  The lock is left held; the
  * caller is responsible for zap_unlockdir().
  *
  * For a microzap the value is a single uint64_t: EOVERFLOW is returned if
  * num_integers < 1 and EINVAL if integer_size != 8.  On success the value
  * is stored in buf, the stored name is copied to realname (if non-NULL,
  * bounded by rn_len) and *ncp (if non-NULL) receives the result of
  * mzap_normalization_conflict().  Otherwise fzap_lookup() does the work.
  *
  * Returns ENOTSUP if zap_name_alloc_str() fails and ENOENT if a microzap
  * has no such entry.
  */
 static int
 zap_lookup_impl(zap_t *zap, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
 	zap_name_t *zn;
 	int error = 0;
 
 	zn = zap_name_alloc_str(zap, name, mt);
 	if (zn == NULL)
 		return (SET_ERROR(ENOTSUP));
 
 	if (zap->zap_ismicro) {
 		zfs_btree_index_t where;
 		mzap_ent_t *mze = mze_find(zn, &where);
 
 		if (mze == NULL) {
 			error = SET_ERROR(ENOENT);
 		} else if (num_integers < 1) {
 			error = SET_ERROR(EOVERFLOW);
 		} else if (integer_size != 8) {
 			error = SET_ERROR(EINVAL);
 		} else {
 			mzap_ent_phys_t *mzep = MZE_PHYS(zap, mze);
 
 			*(uint64_t *)buf = mzep->mze_value;
 			if (realname != NULL) {
 				(void) strlcpy(realname, mzep->mze_name,
 				    rn_len);
 			}
 			if (ncp != NULL) {
 				*ncp = mzap_normalization_conflict(zap,
 				    zn, mze, &where);
 			}
 		}
 	} else {
 		error = fzap_lookup(zn, integer_size, num_integers, buf,
 		    realname, rn_len, ncp);
 	}
 
 	zap_name_free(zn);
 	return (error);
 }
 
 /*
  * Look up name in ZAP object zapobj using matchtype mt.  Takes the
  * directory lock as reader, runs zap_lookup_impl() and drops the lock.
  * Returns the zap_lockdir() error if locking fails, otherwise the result
  * of zap_lookup_impl().
  */
 int
 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error == 0) {
 		error = zap_lookup_impl(zap, name, integer_size,
 		    num_integers, buf, mt, realname, rn_len, ncp);
 		zap_unlockdir(zap, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Issue fzap_prefetch() for name in ZAP object zapobj.
  *
  * Returns the zap_lockdir() error if the object cannot be locked, ENOTSUP
  * if zap_name_alloc_str() fails, and 0 otherwise.
  */
 int
 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
 {
 	zap_t *zap;
 
 	int error =
 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (error != 0)
 		return (error);
 
 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
 		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
 	zap_unlockdir(zap, FTAG);
 
 	/* Locking succeeded, so there is no error left to report. */
 	return (0);
 }
 
 /*
  * Prefetch ZAP object zapobj from offset 0 up to its doi_max_offset.
  *
  * Returns the dmu_object_info() error if that call fails, EINVAL if the
  * object's type does not byteswap as DMU_BSWAP_ZAP, and 0 otherwise.
  */
 int
 zap_prefetch_object(objset_t *os, uint64_t zapobj)
 {
 	dmu_object_info_t doi;
 	int error;
 
 	error = dmu_object_info(os, zapobj, &doi);
 	if (error != 0)
 		return (error);
 
 	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
 		return (SET_ERROR(EINVAL));
 
 	dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset);
 	return (0);
 }
 
 /*
  * Dnode-based counterpart of zap_lookup().  Forwards to
  * zap_lookup_norm_by_dnode() with matchtype 0 and no realname or
  * normalization-conflict output.
  */
 int
 zap_lookup_by_dnode(dnode_t *dn, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	int error;
 
 	error = zap_lookup_norm_by_dnode(dn, name, integer_size,
 	    num_integers, buf, 0, NULL, 0, NULL);
 	return (error);
 }
 
 /*
  * Dnode-based counterpart of zap_lookup_norm().  Takes the directory lock
  * as reader through zap_lockdir_by_dnode(), runs zap_lookup_impl() and
  * drops the lock.  Returns the locking error if locking fails, otherwise
  * the result of zap_lookup_impl().
  */
 int
 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
 	    FTAG, &zap);
 	if (error == 0) {
 		error = zap_lookup_impl(zap, name, integer_size,
 		    num_integers, buf, mt, realname, rn_len, ncp);
 		zap_unlockdir(zap, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Shared body of zap_prefetch_uint64() and zap_prefetch_uint64_by_dnode().
  *
  * Builds a zap_name_t for the uint64 key, issues fzap_prefetch() on it and
  * then drops the directory lock taken by the caller.  The lock is released
  * here on every path, so callers must not call zap_unlockdir() themselves.
  *
  * tag must be the tag the caller passed to zap_lockdir() or
  * zap_lockdir_by_dnode().  It is forwarded to zap_unlockdir() so that the
  * lock and unlock calls use the same tag, matching what the add, update
  * and remove uint64 helpers in this file do.
  *
  * Returns 0 on success, or ENOTSUP if zap_name_alloc_uint64() fails.
  */
 static int
 zap_prefetch_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
     const void *tag)
 {
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	fzap_prefetch(zn);
 	zap_name_free(zn);
 	zap_unlockdir(zap, tag);
 	return (0);
 }
 
 /*
  * Prefetch the entry with the given uint64 key in ZAP object zapobj.
  * Returns the zap_lockdir() error if locking fails, otherwise the result
  * of zap_prefetch_uint64_impl().
  */
 int
 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints)
 {
 	zap_t *zap;
 
 	int err =
 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
 /*
  * Dnode-based counterpart of zap_prefetch_uint64().
  */
 int
 zap_prefetch_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints)
 {
 	zap_t *zap;
 
 	int err =
 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_prefetch_uint64_impl(zap, key, key_numints, FTAG);
 	/* zap_prefetch_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
 /*
  * Shared body of zap_lookup_uint64() and zap_lookup_uint64_by_dnode().
  *
  * Builds a zap_name_t for the uint64 key, looks it up with fzap_lookup()
  * and then drops the directory lock taken by the caller.  The lock is
  * released here on every path, so callers must not call zap_unlockdir()
  * themselves.
  *
  * tag must be the tag the caller passed to zap_lockdir() or
  * zap_lockdir_by_dnode().  It is forwarded to zap_unlockdir() so that the
  * lock and unlock calls use the same tag, matching what the add, update
  * and remove uint64 helpers in this file do.
  *
  * Returns ENOTSUP if zap_name_alloc_uint64() fails, otherwise the result
  * of fzap_lookup().
  */
 static int
 zap_lookup_uint64_impl(zap_t *zap, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf,
     const void *tag)
 {
 	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	int err = fzap_lookup(zn, integer_size, num_integers, buf,
 	    NULL, 0, NULL);
 	zap_name_free(zn);
 	zap_unlockdir(zap, tag);
 	return (err);
 }
 
 /*
  * Look up the entry with the given uint64 key in ZAP object zapobj and
  * copy its value into buf.  Returns the zap_lockdir() error if locking
  * fails, otherwise the result of zap_lookup_uint64_impl().
  */
 int
 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	zap_t *zap;
 
 	int err =
 	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
 	    num_integers, buf, FTAG);
 	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
 /*
  * Dnode-based counterpart of zap_lookup_uint64().
  */
 int
 zap_lookup_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
 {
 	zap_t *zap;
 
 	int err =
 	    zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	err = zap_lookup_uint64_impl(zap, key, key_numints, integer_size,
 	    num_integers, buf, FTAG);
 	/* zap_lookup_uint64_impl() calls zap_unlockdir() */
 	return (err);
 }
 
 /*
  * Test whether name exists in ZAP object zapobj without reading its value.
  *
  * The lookup is issued with a zero-sized, NULL buffer, so an existing
  * entry is reported as EOVERFLOW or EINVAL; both are mapped to 0.  Any
  * other result from zap_lookup_norm() is returned unchanged.
  */
 int
 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 {
 	int rc;
 
 	rc = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, 0, NULL, 0, NULL);
 	switch (rc) {
 	case EOVERFLOW:
 	case EINVAL:
 		/* found, but skipped reading the value */
 		return (0);
 	default:
 		return (rc);
 	}
 }
 
 /*
  * Report the integer size and count of the value stored under name.
  *
  * For a microzap an existing entry always reports size 8 and count 1;
  * either output pointer may be NULL.  Otherwise fzap_length() fills them
  * in.  Returns the zap_lockdir() error if locking fails, ENOTSUP if
  * zap_name_alloc_str() fails, and ENOENT if a microzap has no such entry.
  */
 int
 zap_length(objset_t *os, uint64_t zapobj, const char *name,
     uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
 		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (zap->zap_ismicro) {
 		zfs_btree_index_t where;
 
 		if (mze_find(zn, &where) == NULL) {
 			error = SET_ERROR(ENOENT);
 		} else {
 			if (integer_size != NULL)
 				*integer_size = 8;
 			if (num_integers != NULL)
 				*num_integers = 1;
 		}
 	} else {
 		error = fzap_length(zn, integer_size, num_integers);
 	}
 
 	zap_name_free(zn);
 	zap_unlockdir(zap, FTAG);
 	return (error);
 }
 
 /*
  * Report the integer size and count of the value stored under a uint64
  * key, using fzap_length().
  *
  * Returns the zap_lockdir() error if locking fails, ENOTSUP if
  * zap_name_alloc_uint64() fails, otherwise the result of fzap_length().
  */
 int
 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t *integer_size, uint64_t *num_integers)
 {
 	zap_t *zap;
 	zap_name_t *zn;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = fzap_length(zn, integer_size, num_integers);
 
 	zap_name_free(zn);
 	zap_unlockdir(zap, FTAG);
 	return (error);
 }
 
 /*
  * Store a new entry (zn's original key -> value) in the first free
  * microzap chunk.
  *
  * The search starts at zap_alloc_next and, if that was not chunk 0, wraps
  * around to scan the whole chunk array once more from the beginning.  A
  * chunk counts as free when the first byte of its name is 0.  On success
  * zap_alloc_next is advanced past the used chunk (wrapping to 0 at the
  * end) and the entry is added to the in-memory tree via mze_insert().
  * The zap's rwlock must be held as writer.  If no chunk is free the
  * function calls cmn_err(CE_PANIC).
  */
 static void
 mzap_addent(zap_name_t *zn, uint64_t value)
 {
 	zap_t *zap = zn->zn_zap;
 	uint16_t first = zap->zap_m.zap_alloc_next;
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 #ifdef ZFS_DEBUG
 	/* The key must not already be present in any chunk. */
 	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 	}
 #endif
 
 	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
 	/* given the limited size of the microzap, this can't happen */
 	ASSERT(cd < zap_maxcd(zap));
 
 	for (;;) {
 		for (uint16_t slot = first; slot < zap->zap_m.zap_num_chunks;
 		    slot++) {
 			mzap_ent_phys_t *ent =
 			    &zap_m_phys(zap)->mz_chunk[slot];
 
 			if (ent->mze_name[0] != 0)
 				continue;
 
 			ent->mze_value = value;
 			ent->mze_cd = cd;
 			(void) strlcpy(ent->mze_name, zn->zn_key_orig,
 			    sizeof (ent->mze_name));
 			zap->zap_m.zap_num_entries++;
 			zap->zap_m.zap_alloc_next = slot + 1;
 			if (zap->zap_m.zap_alloc_next ==
 			    zap->zap_m.zap_num_chunks)
 				zap->zap_m.zap_alloc_next = 0;
 			mze_insert(zap, slot, zn->zn_hash);
 			return;
 		}
 
 		/* Already scanned from chunk 0: nothing is free. */
 		if (first == 0)
 			break;
 		first = 0;
 	}
 	cmn_err(CE_PANIC, "out of entries!");
 }
 
 /*
  * Shared body of zap_add() and zap_add_by_dnode(): add the string key
  * "key" with the given value to a zap the caller has locked as writer.
  *
  * The directory lock is dropped here (using "tag") on every path where a
  * zap is still attached, so callers must not call zap_unlockdir()
  * afterwards.
  *
  * Returns 0 on success, ENOTSUP if zap_name_alloc_str() fails, EEXIST if
  * the key is already present in a microzap, or the error returned by
  * mzap_upgrade() / fzap_add().
  */
 static int
 zap_add_impl(zap_t *zap, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx, const void *tag)
 {
 	const uint64_t *intval = val;
 	int err = 0;
 
 	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
 	if (zn == NULL) {
 		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		/* Not a microzap: hand the add to fzap_add(). */
 		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN ||
 	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
 		/*
 		 * The entry does not qualify for the microzap: the value is
 		 * not a single 8-byte integer, the key is too long, or
 		 * mze_canfit_fzap_leaf() rejected it.  Upgrade the zap
 		 * first and then add through fzap_add().
 		 */
 		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
 		if (err == 0) {
 			err = fzap_add(zn, integer_size, num_integers, val,
 			    tag, tx);
 		}
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else {
 		/* Microzap: refuse duplicates, otherwise store in place. */
 		zfs_btree_index_t idx;
 		if (mze_find(zn, &idx) != NULL) {
 			err = SET_ERROR(EEXIST);
 		} else {
 			mzap_addent(zn, *intval);
 		}
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
 		zap_unlockdir(zap, tag);
 	return (err);
 }
 
 /*
  * Add key with the given value to ZAP object zapobj.  Locks the zap as
  * writer and hands off to zap_add_impl(), which also releases the lock.
  * Returns the zap_lockdir() error if locking fails, otherwise the result
  * of zap_add_impl().
  */
 int
 zap_add(objset_t *os, uint64_t zapobj, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 
 	int error =
 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_add_impl() calls zap_unlockdir() */
 	return (zap_add_impl(zap, key, integer_size, num_integers, val, tx,
 	    FTAG));
 }
 
 /*
  * Dnode-based counterpart of zap_add().  Locks the zap as writer through
  * zap_lockdir_by_dnode() and hands off to zap_add_impl(), which also
  * releases the lock.
  */
 int
 zap_add_by_dnode(dnode_t *dn, const char *key,
     int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 
 	int error =
 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_add_impl() calls zap_unlockdir() */
 	return (zap_add_impl(zap, key, integer_size, num_integers, val, tx,
 	    FTAG));
 }
 
 /*
  * Shared body of zap_add_uint64() and zap_add_uint64_by_dnode(): add a
  * uint64-keyed entry through fzap_add().
  *
  * The directory lock is dropped here using "tag", so callers must not
  * call zap_unlockdir() afterwards.  Returns ENOTSUP if
  * zap_name_alloc_uint64() fails, otherwise the result of fzap_add().
  */
 static int
 zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx, const void *tag)
 {
 	zap_name_t *zn;
 	int error;
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = fzap_add(zn, integer_size, num_integers, val, tag, tx);
 
 	/* fzap_add() may change zap, so re-read it from the name. */
 	zap = zn->zn_zap;
 	zap_name_free(zn);
 
 	/* zap may be NULL if fzap_add() failed; nothing to unlock then. */
 	if (zap != NULL)
 		zap_unlockdir(zap, tag);
 	return (error);
 }
 
 /*
  * Add a uint64-keyed entry to ZAP object zapobj.  Locks the zap as writer
  * and hands off to zap_add_uint64_impl(), which also releases the lock.
  */
 int
 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_add_uint64_impl() calls zap_unlockdir() */
 	return (zap_add_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG));
 }
 
 /*
  * Dnode-based counterpart of zap_add_uint64().  Locks the zap as writer
  * through zap_lockdir_by_dnode() and hands off to zap_add_uint64_impl(),
  * which also releases the lock.
  */
 int
 zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers,
     const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_add_uint64_impl() calls zap_unlockdir() */
 	return (zap_add_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG));
 }
 
 /*
  * Set the value stored under name in ZAP object zapobj, creating the
  * entry if a microzap does not have it yet.
  *
  * Returns the zap_lockdir() error if locking fails, ENOTSUP if
  * zap_name_alloc_str() fails, otherwise 0 or the error returned by
  * mzap_upgrade() / fzap_update().
  */
 int
 zap_update(objset_t *os, uint64_t zapobj, const char *name,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	const uint64_t *intval = val;
 
 	int err =
 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err != 0)
 		return (err);
 	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
 	if (zn == NULL) {
 		zap_unlockdir(zap, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		/* Not a microzap: hand the update to fzap_update(). */
 		err = fzap_update(zn, integer_size, num_integers, val,
 		    FTAG, tx);
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		/*
 		 * The value is not a single 8-byte integer or the name is
 		 * too long for a microzap entry: upgrade the zap first and
 		 * then update through fzap_update().
 		 */
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
 		    (u_longlong_t)zapobj, integer_size,
 		    (u_longlong_t)num_integers, name);
 		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
 		if (err == 0) {
 			err = fzap_update(zn, integer_size, num_integers,
 			    val, FTAG, tx);
 		}
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else {
 		/* Microzap: overwrite an existing entry or add a new one. */
 		zfs_btree_index_t idx;
 		mzap_ent_t *mze = mze_find(zn, &idx);
 		if (mze != NULL) {
 			MZE_PHYS(zap, mze)->mze_value = *intval;
 		} else {
 			mzap_addent(zn, *intval);
 		}
 	}
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
 		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
 /*
  * Shared body of zap_update_uint64() and zap_update_uint64_by_dnode():
  * update a uint64-keyed entry through fzap_update().
  *
  * The directory lock is dropped here using "tag", so callers must not
  * call zap_unlockdir() afterwards.  Returns ENOTSUP if
  * zap_name_alloc_uint64() fails, otherwise the result of fzap_update().
  */
 static int
 zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
     const void *tag)
 {
 	zap_name_t *zn;
 	int error;
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = fzap_update(zn, integer_size, num_integers, val, tag, tx);
 
 	/* fzap_update() may change zap, so re-read it from the name. */
 	zap = zn->zn_zap;
 	zap_name_free(zn);
 
 	/* zap may be NULL if fzap_upgrade() failed; nothing to unlock. */
 	if (zap != NULL)
 		zap_unlockdir(zap, tag);
 	return (error);
 }
 
 /*
  * Update a uint64-keyed entry in ZAP object zapobj.  Locks the zap as
  * writer and hands off to zap_update_uint64_impl(), which also releases
  * the lock.
  */
 int
 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, int integer_size, uint64_t num_integers, const void *val,
     dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_update_uint64_impl() calls zap_unlockdir() */
 	return (zap_update_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG));
 }
 
 /*
  * Dnode-based counterpart of zap_update_uint64().  Locks the zap as
  * writer through zap_lockdir_by_dnode() and hands off to
  * zap_update_uint64_impl(), which also releases the lock.
  */
 int
 zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
     int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_update_uint64_impl() calls zap_unlockdir() */
 	return (zap_update_uint64_impl(zap, key, key_numints,
 	    integer_size, num_integers, val, tx, FTAG));
 }
 
 /*
  * Remove name from ZAP object zapobj.  Forwards to zap_remove_norm()
  * with matchtype 0.
  */
 int
 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
 {
 	int error;
 
 	error = zap_remove_norm(os, zapobj, name, 0, tx);
 	return (error);
 }
 
 /*
  * Remove name from an already locked zap.  The lock is left held; the
  * caller is responsible for zap_unlockdir().
  *
  * For a microzap the entry count is decremented, the on-disk chunk is
  * zeroed and the entry is removed from the in-memory tree.  Otherwise
  * fzap_remove() does the work.  Returns ENOTSUP if zap_name_alloc_str()
  * fails and ENOENT if a microzap has no such entry.
  */
 static int
 zap_remove_impl(zap_t *zap, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
 {
 	zap_name_t *zn;
 	int error = 0;
 
 	zn = zap_name_alloc_str(zap, name, mt);
 	if (zn == NULL)
 		return (SET_ERROR(ENOTSUP));
 
 	if (zap->zap_ismicro) {
 		zfs_btree_index_t where;
 		mzap_ent_t *mze = mze_find(zn, &where);
 
 		if (mze != NULL) {
 			zap->zap_m.zap_num_entries--;
 			memset(MZE_PHYS(zap, mze), 0,
 			    sizeof (mzap_ent_phys_t));
 			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &where);
 		} else {
 			error = SET_ERROR(ENOENT);
 		}
 	} else {
 		error = fzap_remove(zn, tx);
 	}
 
 	zap_name_free(zn);
 	return (error);
 }
 
 /*
  * Remove name from ZAP object zapobj using matchtype mt.  Locks the zap
  * as writer, runs zap_remove_impl() and drops the lock.  Returns the
  * zap_lockdir() error if locking fails, otherwise the result of
  * zap_remove_impl().
  */
 int
 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
     matchtype_t mt, dmu_tx_t *tx)
 {
 	zap_t *zap;
 
 	int error =
 	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (error == 0) {
 		error = zap_remove_impl(zap, name, mt, tx);
 		zap_unlockdir(zap, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Dnode-based counterpart of zap_remove().  Locks the zap as writer
  * through zap_lockdir_by_dnode(), runs zap_remove_impl() with matchtype 0
  * and drops the lock.
  */
 int
 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
 {
 	zap_t *zap;
 
 	int error =
 	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (error == 0) {
 		error = zap_remove_impl(zap, name, 0, tx);
 		zap_unlockdir(zap, FTAG);
 	}
 	return (error);
 }
 
 /*
  * Shared body of zap_remove_uint64() and zap_remove_uint64_by_dnode():
  * remove a uint64-keyed entry through fzap_remove().
  *
  * The directory lock is dropped here using "tag", so callers must not
  * call zap_unlockdir() afterwards.  Returns ENOTSUP if
  * zap_name_alloc_uint64() fails, otherwise the result of fzap_remove().
  */
 static int
 zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
     dmu_tx_t *tx, const void *tag)
 {
 	zap_name_t *zn;
 	int error;
 
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
 		zap_unlockdir(zap, tag);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	error = fzap_remove(zn, tx);
 
 	zap_name_free(zn);
 	zap_unlockdir(zap, tag);
 	return (error);
 }
 
 /*
  * Remove a uint64-keyed entry from ZAP object zapobj.  Locks the zap as
  * writer and hands off to zap_remove_uint64_impl(), which also releases
  * the lock.
  */
 int
 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_remove_uint64_impl() calls zap_unlockdir() */
 	return (zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG));
 }
 
 /*
  * Dnode-based counterpart of zap_remove_uint64().  Locks the zap as
  * writer through zap_lockdir_by_dnode() and hands off to
  * zap_remove_uint64_impl(), which also releases the lock.
  */
 int
 zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
     dmu_tx_t *tx)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	/* zap_remove_uint64_impl() calls zap_unlockdir() */
 	return (zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG));
 }
 
 
 /*
  * Allocate a zap_attribute_t from zap_attr_long_cache (longname) or
  * zap_attr_cache (otherwise) and record the matching name capacity in
  * za_name_len: ZAP_MAXNAMELEN_NEW or ZAP_MAXNAMELEN respectively.
  */
 static zap_attribute_t *
 zap_attribute_alloc_impl(boolean_t longname)
 {
 	zap_attribute_t *attr;
 
 	if (longname) {
 		attr = kmem_cache_alloc(zap_attr_long_cache, KM_SLEEP);
 		attr->za_name_len = ZAP_MAXNAMELEN_NEW;
 	} else {
 		attr = kmem_cache_alloc(zap_attr_cache, KM_SLEEP);
 		attr->za_name_len = ZAP_MAXNAMELEN;
 	}
 	return (attr);
 }
 
 /*
  * Allocate a zap_attribute_t whose name capacity is ZAP_MAXNAMELEN.
  * Release it with zap_attribute_free().
  */
 zap_attribute_t *
 zap_attribute_alloc(void)
 {
 	zap_attribute_t *attr = zap_attribute_alloc_impl(B_FALSE);
 
 	return (attr);
 }
 
 /*
  * Allocate a zap_attribute_t whose name capacity is ZAP_MAXNAMELEN_NEW.
  * Release it with zap_attribute_free().
  */
 zap_attribute_t *
 zap_attribute_long_alloc(void)
 {
 	zap_attribute_t *attr = zap_attribute_alloc_impl(B_TRUE);
 
 	return (attr);
 }
 
 /*
  * Return za to the cache it was allocated from.  The cache is chosen
  * from za_name_len: ZAP_MAXNAMELEN means zap_attr_cache, anything else is
  * asserted to be ZAP_MAXNAMELEN_NEW and goes to zap_attr_long_cache.
  */
 void
 zap_attribute_free(zap_attribute_t *za)
 {
 	if (za->za_name_len != ZAP_MAXNAMELEN) {
 		ASSERT3U(za->za_name_len, ==, ZAP_MAXNAMELEN_NEW);
 		kmem_cache_free(zap_attr_long_cache, za);
 		return;
 	}
 
 	kmem_cache_free(zap_attr_cache, za);
 }
 
 /*
  * Routines for iterating over the attributes.
  */
 
 /*
  * Common cursor initializer.  Records the objset, object number,
  * serialized start position and prefetch preference, and resets the
  * runtime state: no zap or leaf attached, hash and cd zeroed.
  */
 static void
 zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized, boolean_t prefetch)
 {
 	/* What to iterate over and where to start. */
 	zc->zc_objset = os;
 	zc->zc_zapobj = zapobj;
 	zc->zc_serialized = serialized;
 	zc->zc_prefetch = prefetch;
 
 	/* Nothing attached and no position decoded yet. */
 	zc->zc_zap = NULL;
 	zc->zc_leaf = NULL;
 	zc->zc_hash = 0;
 	zc->zc_cd = 0;
 }
 /*
  * Initialize a cursor at the position described by serialized, with
  * prefetch enabled.
  */
 void
 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
     uint64_t serialized)
 {
 	const boolean_t prefetch = B_TRUE;
 
 	zap_cursor_init_impl(zc, os, zapobj, serialized, prefetch);
 }
 
 /*
  * Set up a cursor positioned at the start of the ZAP object (serialized
  * position 0).  The entire ZAP object will be prefetched.
  */
 void
 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
 	const uint64_t start = 0;
 	const boolean_t prefetch = B_TRUE;
 
 	zap_cursor_init_impl(zc, os, zapobj, start, prefetch);
 }
 
 /*
  * Set up a cursor positioned at the start of the ZAP object (serialized
  * position 0), but request that the entire ZAP object not be prefetched.
  */
 void
 zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
 {
 	const uint64_t start = 0;
 	const boolean_t prefetch = B_FALSE;
 
 	zap_cursor_init_impl(zc, os, zapobj, start, prefetch);
 }
 
 /*
  * Release everything a cursor still references.
  *
  * An attached zap is re-locked as reader and released with
  * zap_unlockdir() using a NULL tag, the same tag zap_cursor_retrieve()
  * locks it with.  An attached leaf is re-locked as reader and released
  * with zap_put_leaf().  The objset pointer is cleared last.
  */
 void
 zap_cursor_fini(zap_cursor_t *zc)
 {
 	if (zc->zc_zap != NULL) {
 		zap_t *zap = zc->zc_zap;
 
 		rw_enter(&zap->zap_rwlock, RW_READER);
 		zap_unlockdir(zap, NULL);
 		zc->zc_zap = NULL;
 	}
 
 	if (zc->zc_leaf != NULL) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
 		zc->zc_leaf = NULL;
 	}
 
 	zc->zc_objset = NULL;
 }
 
 /*
  * Encode the cursor position as a single uint64_t.
  *
  * Returns -1ULL once the cursor has hit the end (zc_hash == -1ULL), and
  * the stored zc_serialized value if no zap has been attached yet.
  * Otherwise the position is packed from the hash and the collision
  * differentiator as described below.
  */
 uint64_t
 zap_cursor_serialize(zap_cursor_t *zc)
 {
 	zap_t *zap = zc->zc_zap;
 
 	if (zc->zc_hash == -1ULL)
 		return (-1ULL);
 	if (zap == NULL)
 		return (zc->zc_serialized);
 
 	ASSERT((zc->zc_hash & zap_maxcd(zap)) == 0);
 	ASSERT(zc->zc_cd < zap_maxcd(zap));
 
 	/*
 	 * We want to keep the high 32 bits of the cursor zero if we can, so
 	 * that 32-bit programs can access this.  So usually use a small
 	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
 	 * of the cursor.
 	 *
 	 * [ collision differentiator | zap_hashbits()-bit hash value ]
 	 */
 	int hb = zap_hashbits(zap);
 	uint64_t hash_part = zc->zc_hash >> (64 - hb);
 	uint64_t cd_part = (uint64_t)zc->zc_cd << hb;
 
 	return (hash_part | cd_part);
 }
 
 /*
  * Fill in za with the attribute at the cursor's current position.
  *
  * Returns 0 on success and ENOENT when the cursor is exhausted; once
  * exhausted, zc_hash is set to -1ULL so later calls return ENOENT
  * immediately.  May also return the error from zap_lockdir() or
  * fzap_cursor_retrieve().
  *
  * On the first call the zap is attached to the cursor via zap_lockdir()
  * (NULL tag); it stays attached until zap_cursor_fini().  Only the rwlock
  * is dropped before returning.
  */
 int
 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
 {
 	int err;
 
 	/* -1ULL marks a cursor that has already run off the end. */
 	if (zc->zc_hash == -1ULL)
 		return (SET_ERROR(ENOENT));
 
 	if (zc->zc_zap == NULL) {
 		int hb;
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
 		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
 		if (err != 0)
 			return (err);
 
 		/*
 		 * To support zap_cursor_init_serialized, advance, retrieve,
 		 * we must add to the existing zc_cd, which may already
 		 * be 1 due to the zap_cursor_advance.
 		 */
 		ASSERT(zc->zc_hash == 0);
 		hb = zap_hashbits(zc->zc_zap);
 		/* Inverse of the packing done in zap_cursor_serialize(). */
 		zc->zc_hash = zc->zc_serialized << (64 - hb);
 		zc->zc_cd += zc->zc_serialized >> hb;
 		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
 			zc->zc_cd = 0;
 	} else {
 		/* Zap already attached: just re-take its rwlock as reader. */
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
 	}
 	if (!zc->zc_zap->zap_ismicro) {
 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
 	} else {
 		zfs_btree_index_t idx;
 		mzap_ent_t mze_tofind;
 
 		/* Search key: upper 32 bits of the hash plus the cd. */
 		mze_tofind.mze_hash = zc->zc_hash >> 32;
 		mze_tofind.mze_cd = zc->zc_cd;
 
 		/* Exact entry if present, otherwise the next one in order. */
 		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
 		    &mze_tofind, &idx);
 		if (mze == NULL) {
 			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
 			    &idx, &idx);
 		}
 		if (mze) {
 			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
 			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
 			za->za_normalization_conflict =
 			    mzap_normalization_conflict(zc->zc_zap, NULL,
 			    mze, &idx);
 			/* Microzap values are always one 8-byte integer. */
 			za->za_integer_length = 8;
 			za->za_num_integers = 1;
 			za->za_first_integer = mzep->mze_value;
 			(void) strlcpy(za->za_name, mzep->mze_name,
 			    za->za_name_len);
 			/* Move the cursor onto the entry actually returned. */
 			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
 			zc->zc_cd = mze->mze_cd;
 			err = 0;
 		} else {
 			/* No more entries: mark the cursor as exhausted. */
 			zc->zc_hash = -1ULL;
 			err = SET_ERROR(ENOENT);
 		}
 	}
 	/* Drop only the rwlock; the zap stays attached to the cursor. */
 	rw_exit(&zc->zc_zap->zap_rwlock);
 	return (err);
 }
 
 /*
  * Step the cursor to the next position by bumping its collision
  * differentiator.  A cursor that has already reached the end
  * (zc_hash == -1ULL) is left untouched.
  */
 void
 zap_cursor_advance(zap_cursor_t *zc)
 {
 	if (zc->zc_hash != -1ULL)
 		zc->zc_cd++;
 }
 
 /*
  * Fill in *zs with statistics for ZAP object zapobj.
  *
  * zs is zeroed first.  For a microzap the block size comes from the
  * zap's dbuf, the entry count from zap_num_entries and the block count
  * is 1; otherwise fzap_get_stats() fills it in.  Returns the
  * zap_lockdir() error if locking fails, and 0 otherwise.
  */
 int
 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
 {
 	zap_t *zap;
 	int error;
 
 	error = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG,
 	    &zap);
 	if (error != 0)
 		return (error);
 
 	memset(zs, 0, sizeof (zap_stats_t));
 
 	if (!zap->zap_ismicro) {
 		fzap_get_stats(zap, zs);
 	} else {
 		zs->zs_blocksize = zap->zap_dbuf->db_size;
 		zs->zs_num_entries = zap->zap_m.zap_num_entries;
 		zs->zs_num_blocks = 1;
 	}
 
 	zap_unlockdir(zap, FTAG);
 	return (0);
 }
 
 #if defined(_KERNEL)
 /* Symbols exported from this module; kernel builds only. */
 EXPORT_SYMBOL(zap_create);
 EXPORT_SYMBOL(zap_create_dnsize);
 EXPORT_SYMBOL(zap_create_norm);
 EXPORT_SYMBOL(zap_create_norm_dnsize);
 EXPORT_SYMBOL(zap_create_flags);
 EXPORT_SYMBOL(zap_create_flags_dnsize);
 EXPORT_SYMBOL(zap_create_claim);
 EXPORT_SYMBOL(zap_create_claim_norm);
 EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
 EXPORT_SYMBOL(zap_create_hold);
 EXPORT_SYMBOL(zap_destroy);
 EXPORT_SYMBOL(zap_lookup);
 EXPORT_SYMBOL(zap_lookup_by_dnode);
 EXPORT_SYMBOL(zap_lookup_norm);
 EXPORT_SYMBOL(zap_lookup_uint64);
 EXPORT_SYMBOL(zap_contains);
 EXPORT_SYMBOL(zap_prefetch);
 EXPORT_SYMBOL(zap_prefetch_uint64);
 EXPORT_SYMBOL(zap_prefetch_object);
 EXPORT_SYMBOL(zap_add);
 EXPORT_SYMBOL(zap_add_by_dnode);
 EXPORT_SYMBOL(zap_add_uint64);
 EXPORT_SYMBOL(zap_add_uint64_by_dnode);
 EXPORT_SYMBOL(zap_update);
 EXPORT_SYMBOL(zap_update_uint64);
 EXPORT_SYMBOL(zap_update_uint64_by_dnode);
 EXPORT_SYMBOL(zap_length);
 EXPORT_SYMBOL(zap_length_uint64);
 EXPORT_SYMBOL(zap_remove);
 EXPORT_SYMBOL(zap_remove_by_dnode);
 EXPORT_SYMBOL(zap_remove_norm);
 EXPORT_SYMBOL(zap_remove_uint64);
 EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
 EXPORT_SYMBOL(zap_count);
 EXPORT_SYMBOL(zap_value_search);
 EXPORT_SYMBOL(zap_join);
 EXPORT_SYMBOL(zap_join_increment);
 EXPORT_SYMBOL(zap_add_int);
 EXPORT_SYMBOL(zap_remove_int);
 EXPORT_SYMBOL(zap_lookup_int);
 EXPORT_SYMBOL(zap_increment_int);
 EXPORT_SYMBOL(zap_add_int_key);
 EXPORT_SYMBOL(zap_lookup_int_key);
 EXPORT_SYMBOL(zap_increment);
 EXPORT_SYMBOL(zap_cursor_init);
 EXPORT_SYMBOL(zap_cursor_fini);
 EXPORT_SYMBOL(zap_cursor_retrieve);
 EXPORT_SYMBOL(zap_cursor_advance);
 EXPORT_SYMBOL(zap_cursor_serialize);
 EXPORT_SYMBOL(zap_cursor_init_serialized);
 EXPORT_SYMBOL(zap_get_stats);
 
-/* CSTYLED */
 /* Read-write module parameter for the zap_micro_max_size variable. */
 ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
 	"Maximum micro ZAP size, before converting to a fat ZAP, in bytes");
 #endif