diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h
index 3d4b1920d598..6ffa57c86418 100644
--- a/include/os/linux/zfs/sys/trace_common.h
+++ b/include/os/linux/zfs/sys/trace_common.h
@@ -1,112 +1,110 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * This file contains commonly used trace macros.  Feel free to add and use
  * them in your tracepoint headers.
  */
 
 #ifndef	_SYS_TRACE_COMMON_H
 #define	_SYS_TRACE_COMMON_H
 #include <linux/tracepoint.h>
 
 /* ZIO macros */
 #define	ZIO_TP_STRUCT_ENTRY						\
 		__field(zio_type_t,		zio_type)		\
-		__field(int,			zio_cmd)		\
 		__field(zio_priority_t,		zio_priority)		\
 		__field(uint64_t,		zio_size)		\
 		__field(uint64_t,		zio_orig_size)		\
 		__field(uint64_t,		zio_offset)		\
 		__field(hrtime_t,		zio_timestamp)		\
 		__field(hrtime_t,		zio_delta)		\
 		__field(uint64_t,		zio_delay)		\
 		__field(zio_flag_t,		zio_flags)		\
 		__field(enum zio_stage,		zio_stage)		\
 		__field(enum zio_stage,		zio_pipeline)		\
 		__field(zio_flag_t,		zio_orig_flags)		\
 		__field(enum zio_stage,		zio_orig_stage)		\
 		__field(enum zio_stage,		zio_orig_pipeline)	\
 		__field(uint8_t,		zio_reexecute)		\
 		__field(uint64_t,		zio_txg)		\
 		__field(int,			zio_error)		\
 		__field(uint64_t,		zio_ena)		\
 									\
 		__field(enum zio_checksum,	zp_checksum)		\
 		__field(enum zio_compress,	zp_compress)		\
 		__field(dmu_object_type_t,	zp_type)		\
 		__field(uint8_t,		zp_level)		\
 		__field(uint8_t,		zp_copies)		\
 		__field(boolean_t,		zp_dedup)		\
 		__field(boolean_t,		zp_dedup_verify)	\
 		__field(boolean_t,		zp_nopwrite)
 
 #define	ZIO_TP_FAST_ASSIGN						    \
 		__entry->zio_type		= zio->io_type;		    \
-		__entry->zio_cmd		= zio->io_cmd;		    \
 		__entry->zio_priority		= zio->io_priority;	    \
 		__entry->zio_size		= zio->io_size;		    \
 		__entry->zio_orig_size		= zio->io_orig_size;	    \
 		__entry->zio_offset		= zio->io_offset;	    \
 		__entry->zio_timestamp		= zio->io_timestamp;	    \
 		__entry->zio_delta		= zio->io_delta;	    \
 		__entry->zio_delay		= zio->io_delay;	    \
 		__entry->zio_flags		= zio->io_flags;	    \
 		__entry->zio_stage		= zio->io_stage;	    \
 		__entry->zio_pipeline		= zio->io_pipeline;	    \
 		__entry->zio_orig_flags		= zio->io_orig_flags;	    \
 		__entry->zio_orig_stage		= zio->io_orig_stage;	    \
 		__entry->zio_orig_pipeline	= zio->io_orig_pipeline;    \
 		__entry->zio_reexecute		= zio->io_reexecute;	    \
 		__entry->zio_txg		= zio->io_txg;		    \
 		__entry->zio_error		= zio->io_error;	    \
 		__entry->zio_ena		= zio->io_ena;		    \
 									    \
 		__entry->zp_checksum		= zio->io_prop.zp_checksum; \
 		__entry->zp_compress		= zio->io_prop.zp_compress; \
 		__entry->zp_type		= zio->io_prop.zp_type;	    \
 		__entry->zp_level		= zio->io_prop.zp_level;    \
 		__entry->zp_copies		= zio->io_prop.zp_copies;   \
 		__entry->zp_dedup		= zio->io_prop.zp_dedup;    \
 		__entry->zp_nopwrite		= zio->io_prop.zp_nopwrite; \
 		__entry->zp_dedup_verify	= zio->io_prop.zp_dedup_verify;
 
 #define	ZIO_TP_PRINTK_FMT						\
-	"zio { type %u cmd %i prio %u size %llu orig_size %llu "	\
+	"zio { type %u prio %u size %llu orig_size %llu "		\
 	"offset %llu timestamp %llu delta %llu delay %llu "		\
 	"flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx "	\
 	"orig_stage 0x%x orig_pipeline 0x%x reexecute %u "		\
 	"txg %llu error %d ena %llu prop { checksum %u compress %u "	\
 	"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
 
 #define	ZIO_TP_PRINTK_ARGS						\
-	__entry->zio_type, __entry->zio_cmd, __entry->zio_priority,	\
+	__entry->zio_type, __entry->zio_priority,			\
 	__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset,	\
 	__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay,	\
 	__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline,	\
 	__entry->zio_orig_flags, __entry->zio_orig_stage,		\
 	__entry->zio_orig_pipeline, __entry->zio_reexecute,		\
 	__entry->zio_txg, __entry->zio_error, __entry->zio_ena,		\
 	__entry->zp_checksum, __entry->zp_compress, __entry->zp_type,	\
 	__entry->zp_level, __entry->zp_copies, __entry->zp_dedup,	\
 	__entry->zp_dedup_verify, __entry->zp_nopwrite
 
 #endif /* _SYS_TRACE_COMMON_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 5dcd7fe073a0..545b9cf0c3c5 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -1,727 +1,726 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, 2023, 2024, Klara Inc.
  * Copyright (c) 2019-2020, Michael Niewöhner
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
 #include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Embedded checksum
  */
 #define	ZEC_MAGIC	0x210da7ab10c7a11ULL
 
 typedef struct zio_eck {
 	uint64_t	zec_magic;	/* for validation, endianness	*/
 	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
 } zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
  * of block pointers.
  */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
 	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
 	ZIO_CHECKSUM_INHERIT = 0,
 	ZIO_CHECKSUM_ON,
 	ZIO_CHECKSUM_OFF,
 	ZIO_CHECKSUM_LABEL,
 	ZIO_CHECKSUM_GANG_HEADER,
 	ZIO_CHECKSUM_ZILOG,
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_NOPARITY,
 	ZIO_CHECKSUM_SHA512,
 	ZIO_CHECKSUM_SKEIN,
 	ZIO_CHECKSUM_EDONR,
 	ZIO_CHECKSUM_BLAKE3,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
 /*
  * The number of "legacy" checksum functions which can be set on individual
  * objects.
  */
 #define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
 
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
 #define	ZIO_CHECKSUM_MASK	0xffULL
 #define	ZIO_CHECKSUM_VERIFY	(1U << 8)
 
 #define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
 
 /* macros defining encryption lengths */
 #define	ZIO_OBJSET_MAC_LEN		32
 #define	ZIO_DATA_IV_LEN			12
 #define	ZIO_DATA_SALT_LEN		8
 #define	ZIO_DATA_MAC_LEN		16
 
 /*
  * The number of "legacy" compression functions which can be set on individual
  * objects.
  */
 #define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
 /*
  * The meaning of "compress = on" is selected by the compression features
  * enabled on a given pool.
  */
 #define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
 #define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
 
 #define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_ON
 
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
 	(compress) == ZIO_COMPRESS_LZ4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_1 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_2 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_3 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_5 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_6 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_7 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_8 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_9 ||		\
 	(compress) == ZIO_COMPRESS_ZLE ||		\
 	(compress) == ZIO_COMPRESS_ZSTD ||		\
 	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 
 #define	ZIO_COMPRESS_ALGO(x)	(x & SPA_COMPRESSMASK)
 #define	ZIO_COMPRESS_LEVEL(x)	((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS)
 #define	ZIO_COMPRESS_RAW(type, level)	(type | ((level) << SPA_COMPRESSBITS))
 
 #define	ZIO_COMPLEVEL_ZSTD(level)	\
 	ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
 typedef enum zio_suspend_reason {
 	ZIO_SUSPEND_NONE = 0,
 	ZIO_SUSPEND_IOERR,
 	ZIO_SUSPEND_MMP,
 } zio_suspend_reason_t;
 
 /*
  * This was originally an enum type. However, those are 32-bit and there is no
  * way to make a 64-bit enum type. Since we ran out of bits for flags, we were
  * forced to upgrade it to a uint64_t.
  */
 typedef uint64_t zio_flag_t;
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
 	 * and that must be equal for two zios to aggregate
 	 */
 #define	ZIO_FLAG_DONT_AGGREGATE	(1ULL << 0)
 #define	ZIO_FLAG_IO_REPAIR	(1ULL << 1)
 #define	ZIO_FLAG_SELF_HEAL	(1ULL << 2)
 #define	ZIO_FLAG_RESILVER	(1ULL << 3)
 #define	ZIO_FLAG_SCRUB		(1ULL << 4)
 #define	ZIO_FLAG_SCAN_THREAD	(1ULL << 5)
 #define	ZIO_FLAG_PHYSICAL	(1ULL << 6)
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 */
 #define	ZIO_FLAG_CANFAIL	(1ULL << 7)	/* must be first for INHERIT */
 #define	ZIO_FLAG_SPECULATIVE	(1ULL << 8)
 #define	ZIO_FLAG_CONFIG_WRITER	(1ULL << 9)
 #define	ZIO_FLAG_DONT_RETRY	(1ULL << 10)
 #define	ZIO_FLAG_NODATA		(1ULL << 12)
 #define	ZIO_FLAG_INDUCE_DAMAGE	(1ULL << 13)
 #define	ZIO_FLAG_IO_ALLOCATING	(1ULL << 14)
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 
 	/*
 	 * Flags inherited by vdev children.
 	 */
 #define	ZIO_FLAG_IO_RETRY	(1ULL << 15)	/* must be first for INHERIT */
 #define	ZIO_FLAG_PROBE		(1ULL << 16)
 #define	ZIO_FLAG_TRYHARD	(1ULL << 17)
 #define	ZIO_FLAG_OPTIONAL	(1ULL << 18)
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
 #define	ZIO_FLAG_DONT_QUEUE	(1ULL << 19)	/* must be first for INHERIT */
 #define	ZIO_FLAG_DONT_PROPAGATE	(1ULL << 20)
 #define	ZIO_FLAG_IO_BYPASS	(1ULL << 21)
 #define	ZIO_FLAG_IO_REWRITE	(1ULL << 22)
 #define	ZIO_FLAG_RAW_COMPRESS	(1ULL << 23)
 #define	ZIO_FLAG_RAW_ENCRYPT	(1ULL << 24)
 #define	ZIO_FLAG_GANG_CHILD	(1ULL << 25)
 #define	ZIO_FLAG_DDT_CHILD	(1ULL << 26)
 #define	ZIO_FLAG_GODFATHER	(1ULL << 27)
 #define	ZIO_FLAG_NOPWRITE	(1ULL << 28)
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 29)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 30)
 
 #define	ZIO_ALLOCATOR_NONE	(-1)
 #define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
 
 #define	ZIO_DDT_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
 	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_GANG_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
 	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_VDEV_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1U << (x)))
 
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
 	ZIO_CHILD_DDT,
 	ZIO_CHILD_LOGICAL,
 	ZIO_CHILD_TYPES
 };
 
 #define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
 #define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
 #define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
 #define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
 #define	ZIO_CHILD_ALL_BITS					\
 	(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT |		\
 	ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
 
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
 	ZIO_WAIT_TYPES
 };
 
 typedef void zio_done_func_t(zio_t *zio);
 
 extern int zio_exclude_metadata;
 extern int zio_dva_throttle_enabled;
 extern const char *const zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
  * is objset 0, and the meta-dnode is object 0.  This covers all blocks
  * except root blocks and ZIL blocks, which are defined as follows:
  *
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel, and is
  * stored on disk (by virtue of being incorporated into other on-disk
  * structures, e.g. dsl_scan_phys_t).
  *
  * If the head_errlog feature is enabled a different on-disk format for error
  * logs is used. This introduces the use of an error bookmark, a four-tuple
  * <object, level, blkid, birth> that uniquely identifies any error block
  * in the pool. The birth transaction group is used to track whether the block
  * has been overwritten by newer data or added to a snapshot since its marking
  * as an error.
  */
 struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 };
 
 struct zbookmark_err_phys {
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 	uint64_t	zb_birth;
 };
 
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
 {                                                       \
 	(zb)->zb_objset = objset;                       \
 	(zb)->zb_object = object;                       \
 	(zb)->zb_level = level;                         \
 	(zb)->zb_blkid = blkid;                         \
 }
 
 #define	ZB_DESTROYED_OBJSET	(-1ULL)
 
 #define	ZB_ROOT_OBJECT		(0ULL)
 #define	ZB_ROOT_LEVEL		(-1LL)
 #define	ZB_ROOT_BLKID		(0ULL)
 
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
 #define	ZB_DNODE_LEVEL		(-3LL)
 #define	ZB_DNODE_BLKID		(0ULL)
 
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
 #define	ZB_IS_ROOT(zb)				\
 	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
 	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
 	(zb)->zb_blkid == ZB_ROOT_BLKID)
 
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	uint8_t			zp_complevel;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
 	dmu_object_type_t	zp_type;
 	boolean_t		zp_dedup;
 	boolean_t		zp_dedup_verify;
 	boolean_t		zp_nopwrite;
 	boolean_t		zp_brtwrite;
 	boolean_t		zp_encrypt;
 	boolean_t		zp_byteorder;
 	uint8_t			zp_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t			zp_mac[ZIO_DATA_MAC_LEN];
 	uint32_t		zp_zpl_smallblk;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
 
 typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
     const abd_t *good_data);
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
 struct dnode_phys;
 struct abd;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
 	nvlist_t		*zcr_ereport;
 	nvlist_t		*zcr_detector;
 	void			*zcr_cbdata;
 	size_t			zcr_cbinfo;	/* passed to zcr_free() */
 	uint64_t		zcr_sector;
 	uint64_t		zcr_align;
 	uint64_t		zcr_length;
 	zio_cksum_finish_f	*zcr_finish;
 	zio_cksum_free_f	*zcr_free;
 
 	/* internal use only */
 	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
 };
 
 typedef struct zio_vsd_ops {
 	zio_done_func_t		*vsd_free;
 } zio_vsd_ops_t;
 
 typedef struct zio_gang_node {
 	zio_gbh_phys_t		*gn_gbh;
 	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
     zio_gang_node_t *gn, struct abd *data, uint64_t offset);
 
 typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
 
 typedef struct zio_transform {
 	struct abd		*zt_orig_abd;
 	uint64_t		zt_orig_size;
 	uint64_t		zt_bufsize;
 	zio_transform_func_t	*zt_transform;
 	struct zio_transform	*zt_next;
 } zio_transform_t;
 
 typedef zio_t *zio_pipe_stage_t(zio_t *zio);
 
 /*
  * The io_reexecute flags are distinct from io_flags because the child must
  * be able to propagate them to the parent.  The normal io_flags are local
  * to the zio, not protected by any lock, and not modifiable by children;
  * the reexecute flags are protected by io_lock, modifiable by children,
  * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
  */
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
 /*
  * The io_trim_flags are used to specify the type of TRIM to perform.  They
  * only apply to ZIO_TYPE_TRIM zios and are distinct from io_flags.
  */
 enum trim_flag {
 	ZIO_TRIM_SECURE		= 1U << 0,
 };
 
 typedef struct zio_alloc_list {
 	list_t  zal_list;
 	uint64_t zal_size;
 } zio_alloc_list_t;
 
 typedef struct zio_link {
 	zio_t		*zl_parent;
 	zio_t		*zl_child;
 	list_node_t	zl_parent_node;
 	list_node_t	zl_child_node;
 } zio_link_t;
 
 enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
 };
 
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
-	int		io_cmd;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	*io_bp_override;
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_children_ready;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
 	blkptr_t	io_bp_orig;
 	/* io_lsize != io_orig_size iff this is a raw write */
 	uint64_t	io_lsize;
 
 	/* Data represented by this I/O */
 	struct abd	*io_abd;
 	struct abd	*io_orig_abd;
 	uint64_t	io_size;
 	uint64_t	io_orig_size;
 
 	/* Stuff for the vdev stack */
 	vdev_t		*io_vd;
 	void		*io_vsd;
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
 	enum zio_qstate	io_queue_state;	/* vdev queue state */
 	union {
 		list_node_t l;
 		avl_node_t a;
 	} io_queue_node ____cacheline_aligned;	/* allocator and vdev queues */
 	avl_node_t	io_offset_node;	/* vdev offset queues */
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
 	zio_alloc_list_t 	io_alloc_list;
 
 	/* Internal pipeline state */
 	zio_flag_t	io_flags;
 	enum zio_stage	io_stage;
 	enum zio_stage	io_pipeline;
 	zio_flag_t	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
 	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
 	void		*io_executor;
 	void		*io_waiter;
 	void		*io_bio;
 	kmutex_t	io_lock;
 	kcondvar_t	io_cv;
 	int		io_allocator;
 
 	/* FMA state */
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 
 	/* write issue taskq selection, based upon sync thread */
 	taskq_t		*io_wr_iss_tq;
 };
 
 enum blk_verify_flag {
 	BLK_VERIFY_ONLY,
 	BLK_VERIFY_LOG,
 	BLK_VERIFY_HALT
 };
 
 enum blk_config_flag {
 	BLK_CONFIG_HELD,   // SCL_VDEV held for writer
 	BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
 	BLK_CONFIG_SKIP,   // skip checks which require SCL_VDEV
 };
 
 extern int zio_bookmark_compare(const void *, const void *);
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern void zio_destroy(zio_t *zio);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
     boolean_t nopwrite, boolean_t brtwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
 extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, zio_flag_t flags);
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(void *zio);
 extern void zio_interrupt(void *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 extern void zio_deadman(zio_t *zio, const char *tag);
 
 extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 extern void zio_add_child_first(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
 extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
     uint64_t bufsize, zio_transform_func_t *transform);
 extern void zio_pop_transforms(zio_t *zio);
 
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, struct abd *data, uint64_t size, int type,
     zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *priv);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *priv);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
 extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
 
 extern void zio_checksum_verified(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
 extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
 extern enum zio_compress zio_compress_select(spa_t *spa,
     enum zio_compress child, enum zio_compress parent);
 extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress,
     uint8_t child, uint8_t parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
 extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
 
 /*
  * Initial setup and teardown.
  */
 extern void zio_init(void);
 extern void zio_fini(void);
 
 /*
  * Fault injection
  */
 struct zinject_record;
 extern uint32_t zio_injection_enabled;
 extern int zio_inject_fault(char *name, int flags, int *id,
     struct zinject_record *record);
 extern int zio_inject_list_next(int *id, char *name, size_t buflen,
     struct zinject_record *record);
 extern int zio_clear_fault(int id);
 extern void zio_handle_panic_injection(spa_t *spa, const char *tag,
     uint64_t type);
 extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
     uint64_t type, int error);
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
     int err2);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
 extern hrtime_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
  */
 extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, struct zio_bad_cksum *info);
 extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
     const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical);
 
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
 
 /* If we have the good data in hand, this function can be used */
 extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, const abd_t *good_data, const abd_t *bad_data,
     struct zio_bad_cksum *info);
 
 void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr);
 extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa,
     const char *name);
 
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
 boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
     uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _ZIO_H */
diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c
index a65dfec86caf..888c8e7f8863 100644
--- a/module/os/freebsd/zfs/vdev_file.c
+++ b/module/os/freebsd/zfs/vdev_file.c
@@ -1,356 +1,349 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/file.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
 #include <sys/stat.h>
 
 /*
  * Virtual device vector for files.
  */
 
 /* Task queue on which read and write zios run vdev_file_io_strategy(). */
 static taskq_t *vdev_file_taskq;
 
 /*
  * Ashift values reported by vdev_file_open().  Both are also exported as
  * module parameters (see the ZFS_MODULE_PARAM lines at the end of the file).
  */
 static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
 static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
 
 /*
  * Create the "z_vdev_file" task queue that services file vdev reads and
  * writes.  The queue gets at least 16 threads, more if max_ncpus is larger.
  */
 void
 vdev_file_init(void)
 {
 	const int nthreads = MAX(max_ncpus, 16);
 
 	vdev_file_taskq = taskq_create("z_vdev_file", nthreads, minclsyspri,
 	    max_ncpus, INT_MAX, 0);
 }
 
 /*
  * Destroy the task queue created by vdev_file_init().
  */
 void
 vdev_file_fini(void)
 {
 	taskq_destroy(vdev_file_taskq);
 }
 
 /*
  * vdev_op_hold callback.  File vdevs take no actual hold; this only
  * asserts that the vdev has a path.
  */
 static void
 vdev_file_hold(vdev_t *vd)
 {
 	ASSERT3P(vd->vdev_path, !=, NULL);
 }
 
 /*
  * vdev_op_rele callback.  Counterpart of vdev_file_hold(); nothing is
  * released, the vdev path is merely asserted to be set.
  */
 static void
 vdev_file_rele(vdev_t *vd)
 {
 	ASSERT3P(vd->vdev_path, !=, NULL);
 }
 
 /*
  * Translate the pool's SPA_MODE_READ / SPA_MODE_WRITE bits into the
  * matching O_RDONLY / O_WRONLY / O_RDWR open flag.  O_LARGEFILE is always
  * added; if neither mode bit is set only O_LARGEFILE is returned.
  */
 static mode_t
 vdev_file_open_mode(spa_mode_t spa_mode)
 {
 	const int want_read = (spa_mode & SPA_MODE_READ) != 0;
 	const int want_write = (spa_mode & SPA_MODE_WRITE) != 0;
 	mode_t mode;
 
 	if (want_read && want_write)
 		mode = O_RDWR;
 	else if (want_read)
 		mode = O_RDONLY;
 	else if (want_write)
 		mode = O_WRONLY;
 	else
 		mode = 0;
 
 	return (mode | O_LARGEFILE);
 }
 
 /*
  * vdev_op_open callback: open the file backing this vdev and report its
  * geometry.
  *
  * On success returns 0 and stores the file size in both *psize and
  * *max_psize, and the configured ashift values in *logical_ashift and
  * *physical_ashift.  On failure returns an errno value; most failure paths
  * also record a reason in vd->vdev_stat.vs_aux.
  *
  * If vd->vdev_tsd is already set (a reopen), the existing file handle is
  * kept and only the attributes are re-read.
  *
  * NOTE(review): the error returns after the kmem_zalloc() below leave
  * vd->vdev_tsd allocated (and possibly vf_file open); presumably a later
  * vdev_file_close() releases them -- confirm against the caller.
  */
 static int
 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_file_t *vf;
 	zfs_file_t *fp;
 	zfs_file_attr_t zfa;
 	int error;
 
 	/*
 	 * Rotational optimizations only make sense on block devices.
 	 */
 	vd->vdev_nonrot = B_TRUE;
 
 	/*
 	 * Allow TRIM on file based vdevs.  This may not always be supported,
 	 * since it depends on your kernel version and underlying filesystem
 	 * type but it is always safe to attempt.
 	 */
 	vd->vdev_has_trim = B_TRUE;
 
 	/*
 	 * Disable secure TRIM on file based vdevs.  There is no way to
 	 * request this behavior from the underlying filesystem.
 	 */
 	vd->vdev_has_securetrim = B_FALSE;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it's not currently open.  Otherwise,
 	 * just update the physical size of the device.
 	 */
 	if (vd->vdev_tsd != NULL) {
 		ASSERT(vd->vdev_reopening);
 		vf = vd->vdev_tsd;
 		goto skip_open;
 	}
 
 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
 
 	/*
 	 * We always open the files from the root of the global zone, even if
 	 * we're in a local zone.  If the user has gotten to this point, the
 	 * administrator has already decided that the pool should be available
 	 * to local zone users, so the underlying devices should be as well.
 	 */
 	ASSERT3P(vd->vdev_path, !=, NULL);
 	ASSERT(vd->vdev_path[0] == '/');
 
 	error = zfs_file_open(vd->vdev_path,
 	    vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	vf->vf_file = fp;
 
 #ifdef _KERNEL
 	/*
 	 * Make sure it's a regular file.
 	 */
 	if (zfs_file_getattr(fp, &zfa)) {
 		/* Unlike the other failure paths, vs_aux is not set here. */
 		return (SET_ERROR(ENODEV));
 	}
 	if (!S_ISREG(zfa.zfa_mode)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (SET_ERROR(ENODEV));
 	}
 #endif
 
 skip_open:
 
 	/* Both the first open and a reopen refresh the size from here. */
 	error =  zfs_file_getattr(vf->vf_file, &zfa);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	*max_psize = *psize = zfa.zfa_size;
 	*logical_ashift = vdev_file_logical_ashift;
 	*physical_ashift = vdev_file_physical_ashift;
 
 	return (0);
 }
 
 /*
  * vdev_op_close callback: close the backing file (if one was opened) and
  * free the per-vdev state.  Does nothing while the vdev is being reopened
  * or when it has no per-vdev state.
  */
 static void
 vdev_file_close(vdev_t *vd)
 {
 	vdev_file_t *vf = vd->vdev_tsd;
 
 	if (vf == NULL || vd->vdev_reopening)
 		return;
 
 	/* vf_file stays NULL when the open itself failed. */
 	if (vf->vf_file != NULL)
 		zfs_file_close(vf->vf_file);
 
 	kmem_free(vf, sizeof (*vf));
 	vd->vdev_tsd = NULL;
 	vd->vdev_delayed_close = B_FALSE;
 }
 
 /*
  * Implements the interrupt side for file vdev types. This routine is
  * called by vdev_file_io_strategy() once the file I/O has completed and
  * hands the zio to zio_delay_interrupt(), allowing us to transfer the I/O
  * to the interrupt taskqs. For consistency, the code structure mimics disk
  * vdev types.
  */
 static void
 vdev_file_io_intr(zio_t *zio)
 {
 	zio_delay_interrupt(zio);
 }
 
 /*
  * Taskq callback that carries out one read or write zio against the
  * backing file.  The zio's abd is borrowed as a flat buffer for the
  * transfer.  A transfer that reports no error but leaves a residual count
  * is completed with ENOSPC.
  */
 static void
 vdev_file_io_strategy(void *arg)
 {
 	zio_t *zio = arg;
 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
 	const loff_t off = zio->io_offset;
 	const ssize_t size = zio->io_size;
 	ssize_t resid = 0;
 	void *buf;
 	int err;
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 	if (zio->io_type != ZIO_TYPE_READ) {
 		/* Write: the buffer starts as a copy of the abd contents. */
 		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
 		abd_return_buf(zio->io_abd, buf, size);
 	} else {
 		/* Read: the buffer is copied back into the abd on return. */
 		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
 		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
 		abd_return_buf_copy(zio->io_abd, buf, size);
 	}
 
 	if (err == 0 && resid != 0)
 		err = ENOSPC;
 	zio->io_error = err;
 
 	vdev_file_io_intr(zio);
 }
 
 /*
  * vdev_op_io_start callback.  IOCTL and TRIM zios are completed inline
  * (TRIM currently always fails with ENOTSUP because the fallocate path is
  * compiled out); read and write zios are dispatched to vdev_file_taskq,
  * where vdev_file_io_strategy() performs the file I/O.
  */
 static void
 vdev_file_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC|O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
-		}
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC);
 
 		zio_execute(zio);
 		return;
 	} else if (zio->io_type == ZIO_TYPE_TRIM) {
 #ifdef notyet
 		int mode = 0;
 
 		ASSERT3U(zio->io_size, !=, 0);
 
 		/* XXX FreeBSD has no fallocate routine in file ops */
 		zio->io_error = zfs_file_fallocate(vf->vf_file,
 		    mode, zio->io_offset, zio->io_size);
 #endif
 		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_execute(zio);
 		return;
 	}
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
 	/* A zero return from taskq_dispatch() is treated as fatal. */
 	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
 	    TQ_SLEEP), !=, 0);
 }
 
 /*
  * vdev_op_io_done callback.  File vdevs have no completion work to do,
  * so the zio is intentionally ignored.
  */
 static void
 vdev_file_io_done(zio_t *zio)
 {
 	(void) zio;
 }
 
 /*
  * Operations vector for VDEV_TYPE_FILE leaf vdevs.  Callbacks left NULL
  * are not implemented by this vdev type.
  */
 vdev_ops_t vdev_file_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 /*
  * From userland we access disks just like files.
  */
 #ifndef _KERNEL
 
 /*
  * Userland-only operations vector for VDEV_TYPE_DISK.  It reuses every
  * file vdev callback; only vdev_op_type differs from vdev_file_ops.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 #endif
 
 /*
  * Read-write tunables for the ashift values that vdev_file_open() reports
  * for file-based vdevs.
  */
 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW,
 	"Logical ashift for file-based devices");
 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW,
 	"Physical ashift for file-based devices");
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 196d67b4b595..264dfa5c9237 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -1,1326 +1,1315 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
  */
 
 #include <sys/zfs_context.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/file.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_os.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <vm/vm_page.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #include <geom/geom_int.h>
 
 #ifndef g_topology_locked
 #define	g_topology_locked()	sx_xlocked(&topology_lock)
 #endif
 
 /*
  * Virtual device vector for GEOM.
  */
 
 /* Forward declaration: the handler is referenced by the class below. */
 static g_attrchanged_t vdev_geom_attrchanged;
 /*
  * GEOM class under which vdev_geom_attach() creates the "zfs::vdev" geom
  * and its consumers.
  */
 struct g_class zfs_vdev_class = {
 	.name = "ZFS::VDEV",
 	.version = G_VERSION,
 	.attrchanged = vdev_geom_attrchanged,
 };
 
 /*
  * One entry in the per-consumer list of vdevs.  The list head is stored
  * directly in the consumer's `private` pointer field, which is why the
  * code casts &cp->private to a struct consumer_priv_t *.
  */
 struct consumer_vdev_elem {
 	SLIST_ENTRY(consumer_vdev_elem)	elems;
 	vdev_t	*vd;
 };
 
 SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
 /* The list head must fit exactly in g_consumer.private for that cast. */
 _Static_assert(
     sizeof (((struct g_consumer *)NULL)->private) ==
     sizeof (struct consumer_priv_t *),
 	"consumer_priv_t* can't be stored in g_consumer.private");
 
 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
 
 SYSCTL_DECL(_vfs_zfs_vdev);
 /* Don't send BIO_FLUSH. */
 static int vdev_geom_bio_flush_disable;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
 	&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
 /* Don't send BIO_DELETE. */
 static int vdev_geom_bio_delete_disable;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
 	&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
 
 /* Declare local functions */
 static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
 
 /*
  * Thread local storage used to indicate when a thread is probing geoms
  * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
  * it is looking for a replacement for the vdev_t* that is its value.
  */
 uint_t zfs_geom_probe_vdev_key;
 
 /*
  * Refresh vd->vdev_physpath from the provider's "GEOM::physpath" attribute
  * and request an asynchronous config update when the value changed.  When
  * no physpath was recorded before, an update is requested only if
  * do_null_update is set.  If the attribute cannot be read, nothing changes.
  */
 static void
 vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
     boolean_t do_null_update)
 {
 	boolean_t changed;
 	char *attr_buf, *prev;
 	int attr_len = MAXPATHLEN;
 
 	attr_buf = g_malloc(attr_len, M_WAITOK|M_ZERO);
 	if (g_io_getattr("GEOM::physpath", cp, &attr_len, attr_buf) != 0) {
 		g_free(attr_buf);
 		return;
 	}
 
 	/* g_topology lock ensures that vdev has not been closed */
 	g_topology_assert();
 	prev = vd->vdev_physpath;
 	vd->vdev_physpath = spa_strdup(attr_buf);
 	g_free(attr_buf);
 
 	if (prev == NULL) {
 		changed = do_null_update;
 	} else {
 		changed = (strcmp(prev, vd->vdev_physpath) != 0);
 		spa_strfree(prev);
 	}
 
 	/*
 	 * If the physical path changed, update the config.
 	 * Only request an update for previously unset physpaths if
 	 * requested by the caller.
 	 */
 	if (changed)
 		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 static void
 vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
 {
 	struct consumer_priv_t *priv;
 	struct consumer_vdev_elem *elem;
 
 	priv = (struct consumer_priv_t *)&cp->private;
 	if (SLIST_EMPTY(priv))
 		return;
 
 	SLIST_FOREACH(elem, priv, elems) {
 		vdev_t *vd = elem->vd;
 		if (strcmp(attr, "GEOM::physpath") == 0) {
 			vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
 			return;
 		}
 	}
 }
 
 static void
 vdev_geom_resize(struct g_consumer *cp)
 {
 	struct consumer_priv_t *priv;
 	struct consumer_vdev_elem *elem;
 	spa_t *spa;
 	vdev_t *vd;
 
 	priv = (struct consumer_priv_t *)&cp->private;
 	if (SLIST_EMPTY(priv))
 		return;
 
 	SLIST_FOREACH(elem, priv, elems) {
 		vd = elem->vd;
 		if (vd->vdev_state != VDEV_STATE_HEALTHY)
 			continue;
 		spa = vd->vdev_spa;
 		if (!spa->spa_autoexpand)
 			continue;
 		vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
 	}
 }
 
 /*
  * GEOM orphan callback, invoked with the topology lock held.  Marks every
  * vdev using this consumer as wanting removal and asks the SPA to process
  * the removal asynchronously; the consumer itself is not detached here.
  */
 static void
 vdev_geom_orphan(struct g_consumer *cp)
 {
 	struct consumer_priv_t *priv;
 	// cppcheck-suppress uninitvar
 	struct consumer_vdev_elem *elem;
 
 	g_topology_assert();
 
 	priv = (struct consumer_priv_t *)&cp->private;
 	if (SLIST_EMPTY(priv))
 		/* Vdev close in progress.  Ignore the event. */
 		return;
 
 	/*
 	 * Orphan callbacks occur from the GEOM event thread.
 	 * Concurrent with this call, new I/O requests may be
 	 * working their way through GEOM about to find out
 	 * (only once executed by the g_down thread) that we've
 	 * been orphaned from our disk provider.  These I/Os
 	 * must be retired before we can detach our consumer.
 	 * This is most easily achieved by acquiring the
 	 * SPA ZIO configuration lock as a writer, but doing
 	 * so with the GEOM topology lock held would cause
 	 * a lock order reversal.  Instead, rely on the SPA's
 	 * async removal support to invoke a close on this
 	 * vdev once it is safe to do so.
 	 */
 	SLIST_FOREACH(elem, priv, elems) {
 		// cppcheck-suppress uninitvar
 		vdev_t *vd = elem->vd;
 
 		vd->vdev_remove_wanted = B_TRUE;
 		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
 	}
 }
 
 /*
  * Attach to provider pp and open it for read + exclusive access.
  *
  * The shared "zfs::vdev" geom is created on first use.  If a consumer for
  * pp already exists on that geom it is reused (its access counts are
  * bumped); otherwise a new consumer is created and attached.
  *
  * pp      provider to attach to
  * vd      if non-NULL, vd->vdev_tsd is set to the returned consumer
  * sanity  if set, reject providers whose sector size is not a power of two
  *         or exceeds VDEV_PAD_SIZE, or whose media is smaller than
  *         SPA_MINDEVSIZE
  *
  * Returns the consumer, or NULL on failure.  A consumer created here is
  * torn down again with vdev_geom_detach() if attaching or opening it fails.
  * Must be called with the GEOM topology lock held.
  */
 static struct g_consumer *
 vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp = NULL;
 	int created_geom = 0;
 	int error;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Attaching to %s.", pp->name);
 
 	if (sanity) {
 		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
 			ZFS_LOG(1, "Failing attach of %s. "
 			    "Incompatible sectorsize %d\n",
 			    pp->name, pp->sectorsize);
 			return (NULL);
 		} else if (pp->mediasize < SPA_MINDEVSIZE) {
 			ZFS_LOG(1, "Failing attach of %s. "
 			    "Incompatible mediasize %ju\n",
 			    pp->name, (uintmax_t)pp->mediasize);
 			return (NULL);
 		}
 	}
 
 	/* Do we have geom already? No? Create one. */
 	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
 		if (gp->flags & G_GEOM_WITHER)
 			continue;
 		if (strcmp(gp->name, "zfs::vdev") != 0)
 			continue;
 		break;
 	}
 	if (gp == NULL) {
 		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
 		gp->orphan = vdev_geom_orphan;
 		gp->attrchanged = vdev_geom_attrchanged;
 		gp->resize = vdev_geom_resize;
 		created_geom = 1;
 	} else {
 		/* Check if we are already connected to this provider. */
 		LIST_FOREACH(cp, &gp->consumer, consumer) {
 			if (cp->provider == pp) {
 				ZFS_LOG(1, "Found consumer for %s.", pp->name);
 				break;
 			}
 		}
 	}
 
 	if (cp == NULL) {
 		/*
 		 * No usable consumer yet (new geom, or no consumer for pp):
 		 * create one.  On failure vdev_geom_detach() destroys it and
 		 * withers the geom if that leaves it without consumers.
 		 */
 		cp = g_new_consumer(gp);
 		error = g_attach(cp, pp);
 		if (error != 0) {
 			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
 			    __LINE__, error);
 			vdev_geom_detach(cp, B_FALSE);
 			return (NULL);
 		}
 		error = g_access(cp, 1, 0, 1);
 		if (error != 0) {
 			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
 			    __LINE__, error);
 			vdev_geom_detach(cp, B_FALSE);
 			return (NULL);
 		}
 		if (created_geom) {
 			ZFS_LOG(1, "Created geom and consumer for %s.",
 			    pp->name);
 		} else {
 			ZFS_LOG(1, "Created consumer for %s.", pp->name);
 		}
 	} else {
 		/* Existing consumer: it is left in place if access fails. */
 		error = g_access(cp, 1, 0, 1);
 		if (error != 0) {
 			ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
 			    __func__, __LINE__, error);
 			return (NULL);
 		}
 		ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
 	}
 
 	if (vd != NULL)
 		vd->vdev_tsd = cp;
 
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	return (cp);
 }
 
 /*
  * Release a consumer obtained from vdev_geom_attach().
  *
  * If open_for_read is set, one read and one exclusive access count are
  * dropped first (undoing the g_access(cp, 1, 0, 1) done at attach time).
  * When no read or exclusive counts remain, any write counts are released,
  * the consumer is detached from its provider (if still attached) and
  * destroyed.  Finally the geom is withered if it has no consumers left.
  * Must be called with the GEOM topology lock held.
  */
 static void
 vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
 {
 	struct g_geom *gp;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Detaching from %s.",
 	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
 
 	/* Saved up front: cp may be destroyed below. */
 	gp = cp->geom;
 	if (open_for_read)
 		g_access(cp, -1, 0, -1);
 	/* Destroy consumer on last close. */
 	if (cp->acr == 0 && cp->ace == 0) {
 		if (cp->acw > 0)
 			g_access(cp, 0, -cp->acw, 0);
 		if (cp->provider != NULL) {
 			ZFS_LOG(1, "Destroying consumer for %s.",
 			    cp->provider->name ? cp->provider->name : "NULL");
 			g_detach(cp);
 		}
 		g_destroy_consumer(cp);
 	}
 	/* Destroy geom if there are no consumers left. */
 	if (LIST_EMPTY(&gp->consumer)) {
 		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
 		g_wither_geom(gp, ENXIO);
 	}
 }
 
 /*
  * Close vd's GEOM consumer with the topology lock already held: clear
  * vd->vdev_tsd, remove vd from the consumer's vdev list, and drop this
  * vdev's reference on the consumer.  A vdev with no consumer only has its
  * delayed-close flag cleared.
  */
 static void
 vdev_geom_close_locked(vdev_t *vd)
 {
 	struct g_consumer *cp;
 	struct consumer_priv_t *priv;
 	struct consumer_vdev_elem *elem, *elem_temp;
 
 	g_topology_assert();
 
 	cp = vd->vdev_tsd;
 	vd->vdev_delayed_close = B_FALSE;
 	if (cp == NULL)
 		return;
 
 	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
 	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
 	priv = (struct consumer_priv_t *)&cp->private;
 	vd->vdev_tsd = NULL;
 	/* The _SAFE variant is required because entries are freed in the loop. */
 	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
 		if (elem->vd == vd) {
 			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
 			g_free(elem);
 		}
 	}
 
 	vdev_geom_detach(cp, B_TRUE);
 }
 
 /*
  * Issue one or more bios to the vdev in parallel
  * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
  * operation is described by parallel entries from each array.  There may be
  * more bios actually issued than entries in the array
  */
 static void
 vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
     off_t *sizes, int *errors, int ncmds)
 {
 	struct bio **bios;
 	uint8_t *p;
 	off_t off, maxio, s, end;
 	int i, n_bios, j;
 	size_t bios_size;
 
 #if __FreeBSD_version > 1300130
 	maxio = maxphys - (maxphys % cp->provider->sectorsize);
 #else
 	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
 #endif
 	n_bios = 0;
 
 	/* How many bios are required for all commands ? */
 	for (i = 0; i < ncmds; i++)
 		n_bios += (sizes[i] + maxio - 1) / maxio;
 
 	/* Allocate memory for the bios */
 	bios_size = n_bios * sizeof (struct bio *);
 	bios = kmem_zalloc(bios_size, KM_SLEEP);
 
 	/* Prepare and issue all of the bios */
 	for (i = j = 0; i < ncmds; i++) {
 		off = offsets[i];
 		p = datas[i];
 		s = sizes[i];
 		end = off + s;
 		ASSERT0(off % cp->provider->sectorsize);
 		ASSERT0(s % cp->provider->sectorsize);
 
 		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
 			bios[j] = g_alloc_bio();
 			bios[j]->bio_cmd = cmds[i];
 			bios[j]->bio_done = NULL;
 			bios[j]->bio_offset = off;
 			bios[j]->bio_length = MIN(s, maxio);
 			bios[j]->bio_data = (caddr_t)p;
 			g_io_request(bios[j], cp);
 		}
 	}
 	ASSERT3S(j, ==, n_bios);
 
 	/* Wait for all of the bios to complete, and clean them up */
 	for (i = j = 0; i < ncmds; i++) {
 		off = offsets[i];
 		s = sizes[i];
 		end = off + s;
 
 		for (; off < end; off += maxio, s -= maxio, j++) {
 			errors[i] = biowait(bios[j], "vdev_geom_io") ||
 			    errors[i];
 			g_destroy_bio(bios[j]);
 		}
 	}
 	kmem_free(bios, bios_size);
 }
 
 /*
  * Read the vdev config from a device.  Return the number of valid labels that
  * were found.  The vdev config will be returned in config if and only if at
  * least one valid label was found.
  */
 static int
 vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
 {
 	struct g_provider *pp;
 	nvlist_t *config;
 	vdev_phys_t *vdev_lists[VDEV_LABELS];
 	char *buf;
 	size_t buflen;
 	uint64_t psize, state, txg;
 	off_t offsets[VDEV_LABELS];
 	off_t size;
 	off_t sizes[VDEV_LABELS];
 	int cmds[VDEV_LABELS];
 	int errors[VDEV_LABELS];
 	int l, nlabels;
 
 	/* The reads below sleep, so the topology lock must not be held. */
 	g_topology_assert_not();
 
 	pp = cp->provider;
 	ZFS_LOG(1, "Reading config from %s...", pp->name);
 
 	psize = pp->mediasize;
 	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
 
 	/* sizeof (vdev_phys_t) rounded up to a multiple of the sector size. */
 	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
 	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
 
 	buflen = sizeof (vdev_lists[0]->vp_nvlist);
 
 	/* Create all of the IO requests */
 	for (l = 0; l < VDEV_LABELS; l++) {
 		cmds[l] = BIO_READ;
 		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
 		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
 		sizes[l] = size;
 		errors[l] = 0;
 		ASSERT0(offsets[l] % pp->sectorsize);
 	}
 
 	/* Issue the IO requests */
 	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
 	    VDEV_LABELS);
 
 	/* Parse the labels */
 	config = *configp = NULL;
 	nlabels = 0;
 	for (l = 0; l < VDEV_LABELS; l++) {
 		if (errors[l] != 0)
 			continue;
 
 		buf = vdev_lists[l]->vp_nvlist;
 
 		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
 			continue;
 
 		/* Reject labels with a missing or out-of-range pool state. */
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0 || state > POOL_STATE_L2CACHE) {
 			nvlist_free(config);
 			continue;
 		}
 
 		/* Only spare and l2cache labels may lack a non-zero txg. */
 		if (state != POOL_STATE_SPARE &&
 		    state != POOL_STATE_L2CACHE &&
 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 		    &txg) != 0 || txg == 0)) {
 			nvlist_free(config);
 			continue;
 		}
 
 		/* Keep only the most recently parsed valid label. */
 		if (*configp != NULL)
 			nvlist_free(*configp);
 		*configp = config;
 		nlabels++;
 	}
 
 	/* Free the label storage */
 	for (l = 0; l < VDEV_LABELS; l++)
 		kmem_free(vdev_lists[l], size);
 
 	return (nlabels);
 }
 
 /*
  * Make sure the *configs array has a slot for index id.  If it is already
  * large enough nothing happens; otherwise a zeroed array of id + 1 entries
  * replaces it, the existing entries are carried over, and *count is
  * updated to the new length.
  */
 static void
 resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
 {
 	const uint64_t old_count = *count;
 	const uint64_t new_count = id + 1;
 	nvlist_t **grown;
 	uint64_t slot;
 
 	if (id < old_count)
 		return;
 
 	grown = kmem_zalloc(new_count * sizeof (nvlist_t *), KM_SLEEP);
 	for (slot = 0; slot < old_count; slot++)
 		grown[slot] = (*configs)[slot];
 	if (*configs != NULL)
 		kmem_free(*configs, old_count * sizeof (void *));
 
 	*configs = grown;
 	*count = new_count;
 }
 
 /*
  * Consider one label config for inclusion in the *configs array, which is
  * indexed by the id found in the label's vdev tree.
  *
  * configs, count   array of kept configs and its length; grown as needed
  * cfg              config to examine; ownership passes to this function,
  *                  which either stores it in the array or frees it
  * name             pool name the label must carry
  * known_pool_guid  in/out: 0 until the first accepted label sets it; after
  *                  that, labels with a different pool guid are ignored
  *
  * When a slot is already occupied, the config with the higher txg wins.
  * Labels missing any required field are ignored.
  */
 static void
 process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
     const char *name, uint64_t *known_pool_guid)
 {
 	nvlist_t *vdev_tree;
 	uint64_t pool_guid;
 	uint64_t vdev_guid;
 	uint64_t id, txg, known_txg;
 	const char *pname;
 
 	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
 	    strcmp(pname, name) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
 		goto ignore;
 
 	/*
 	 * The label comes straight from disk and vdev_geom_read_config()
 	 * accepts spare/l2cache labels without a txg, so a missing txg must
 	 * be ignored rather than asserted on.
 	 */
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) != 0)
 		goto ignore;
 
 	if (*known_pool_guid != 0) {
 		if (pool_guid != *known_pool_guid)
 			goto ignore;
 	} else
 		*known_pool_guid = pool_guid;
 
 	resize_configs(configs, count, id);
 
 	if ((*configs)[id] != NULL) {
 		/* Stored configs already passed the txg check above. */
 		known_txg = fnvlist_lookup_uint64((*configs)[id],
 		    ZPOOL_CONFIG_POOL_TXG);
 		if (txg <= known_txg)
 			goto ignore;
 		nvlist_free((*configs)[id]);
 	}
 
 	(*configs)[id] = cfg;
 	return;
 
 ignore:
 	nvlist_free(cfg);
 }
 
 /*
  * Scan the providers of every GEOM class except our own for labels that
  * belong to the pool called `name`, and collect the accepted configs in
  * *configs (see process_vdev_config()); *count receives the array length.
  *
  * Returns 0 if *count ends up non-zero, ENOENT otherwise.
  * NOTE(review): presumably the caller owns and frees *configs and its
  * entries -- confirm against the caller.
  */
 int
 vdev_geom_read_pool_label(const char *name,
     nvlist_t ***configs, uint64_t *count)
 {
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *zcp;
 	nvlist_t *vdev_cfg;
 	uint64_t pool_guid;
 	int nlabels;
 
 	DROP_GIANT();
 	g_topology_lock();
 
 	*configs = NULL;
 	*count = 0;
 	pool_guid = 0;
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp == &zfs_vdev_class)
 			continue;
 		LIST_FOREACH(gp, &mp->geom, geom) {
 			if (gp->flags & G_GEOM_WITHER)
 				continue;
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				if (pp->flags & G_PF_WITHER)
 					continue;
 				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
 				if (zcp == NULL)
 					continue;
 				/*
 				 * vdev_geom_read_config() asserts that the
 				 * topology lock is not held, so drop it
 				 * around the label read.
 				 */
 				g_topology_unlock();
 				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
 				g_topology_lock();
 				vdev_geom_detach(zcp, B_TRUE);
 				if (nlabels == 0)
 					continue;
 				ZFS_LOG(1, "successfully read vdev config");
 
 				/* vdev_cfg is stored or freed by the callee. */
 				process_vdev_config(configs, count,
 				    vdev_cfg, name, &pool_guid);
 			}
 		}
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 
 	return (*count > 0 ? 0 : ENOENT);
 }
 
 /*
  * Quality of the match between a provider's labels and a vdev, ordered so
  * that a larger value is a better match.  vdev_attach_ok() computes the
  * guid-match results as ZERO_MATCH + (number of valid labels).
  */
 enum match {
 	NO_MATCH = 0,		/* No matching labels found */
 	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
 	ZERO_MATCH = 1,		/* Base for label counts; never returned */
 	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
 	TWO_MATCH = 3,		/* 2 label matching the vdev_guid */
 	THREE_MATCH = 4,	/* 3 label matching the vdev_guid */
 	FULL_MATCH = 5		/* all labels match the vdev_guid */
 };
 
 /*
  * Taste provider pp and rate how well its labels match vd.
  *
  * A temporary consumer is attached, the labels are read with the topology
  * lock dropped, and the consumer is detached again.  Returns NO_MATCH if
  * the provider cannot be attached, has no valid label, or carries a
  * different pool or vdev guid; TOPGUID_MATCH if only the top-level guid
  * matches and vd is itself a top-level vdev; otherwise ZERO_MATCH plus the
  * number of valid labels.  Called with the topology lock held.
  */
 static enum match
 vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
 {
 	nvlist_t *config;
 	uint64_t pool_guid, top_guid, vdev_guid;
 	struct g_consumer *cp;
 	int nlabels;
 
 	cp = vdev_geom_attach(pp, NULL, B_TRUE);
 	if (cp == NULL) {
 		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
 		    pp->name);
 		return (NO_MATCH);
 	}
 	g_topology_unlock();
 	nlabels = vdev_geom_read_config(cp, &config);
 	g_topology_lock();
 	vdev_geom_detach(cp, B_TRUE);
 	if (nlabels == 0) {
 		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
 		return (NO_MATCH);
 	}
 
 	/* A guid missing from the label is treated as 0. */
 	pool_guid = 0;
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
 	top_guid = 0;
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
 	vdev_guid = 0;
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
 	nvlist_free(config);
 
 	/*
 	 * Check that the label's pool guid matches the desired guid.
 	 * Inactive spares and L2ARCs do not have any pool guid in the label.
 	 */
 	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
 		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
 		    pp->name,
 		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
 		return (NO_MATCH);
 	}
 
 	/*
 	 * Check that the label's vdev guid matches the desired guid.
 	 * The second condition handles possible race on vdev detach, when
 	 * remaining vdev receives GUID of destroyed top level mirror vdev.
 	 */
 	if (vdev_guid == vd->vdev_guid) {
 		ZFS_LOG(1, "guids match for provider %s.", pp->name);
 		return (ZERO_MATCH + nlabels);
 	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
 		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
 		return (TOPGUID_MATCH);
 	}
 	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
 	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
 	return (NO_MATCH);
 }
 
 /*
  * Taste the providers of every GEOM class except our own and attach vd to
  * the one whose labels match best.  Among equally rated providers, one
  * whose name equals vd's recorded path (without the "/dev/" prefix) is
  * preferred.  The search stops at the first FULL_MATCH.
  *
  * Returns the attached consumer, or NULL if no provider was selected or
  * the attach failed.  Called with the topology lock held.
  *
  * NOTE(review): because best_match starts at NO_MATCH, the tie-break
  * branch also fires for a provider that rated NO_MATCH but has the
  * recorded name, so such a provider can be selected -- confirm whether
  * that is intended.
  */
 static struct g_consumer *
 vdev_geom_attach_by_guids(vdev_t *vd)
 {
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct g_provider *pp, *best_pp;
 	struct g_consumer *cp;
 	const char *vdpath;
 	enum match match, best_match;
 
 	g_topology_assert();
 
 	/* Provider names do not carry the "/dev/" prefix. */
 	vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
 	cp = NULL;
 	best_pp = NULL;
 	best_match = NO_MATCH;
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp == &zfs_vdev_class)
 			continue;
 		LIST_FOREACH(gp, &mp->geom, geom) {
 			if (gp->flags & G_GEOM_WITHER)
 				continue;
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				match = vdev_attach_ok(vd, pp);
 				if (match > best_match) {
 					best_match = match;
 					best_pp = pp;
 				} else if (match == best_match) {
 					if (strcmp(pp->name, vdpath) == 0) {
 						best_pp = pp;
 					}
 				}
 				if (match == FULL_MATCH)
 					goto out;
 			}
 		}
 	}
 
 out:
 	if (best_pp) {
 		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
 		if (cp == NULL) {
 			printf("ZFS WARNING: Unable to attach to %s.\n",
 			    best_pp->name);
 		}
 	}
 	return (cp);
 }
 
 /*
  * Locate and attach vd's provider by searching for matching guids.  On
  * success vd->vdev_path is replaced with "/dev/<provider name>" and the
  * consumer is returned; on failure NULL is returned and the path is left
  * alone.  Called with the topology lock held.
  */
 static struct g_consumer *
 vdev_geom_open_by_guids(vdev_t *vd)
 {
 	struct g_consumer *cp;
 	char *path;
 	size_t pathlen;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
 	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
 
 	cp = vdev_geom_attach_by_guids(vd);
 	if (cp == NULL) {
 		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
 		    (uintmax_t)spa_guid(vd->vdev_spa),
 		    (uintmax_t)vd->vdev_guid);
 		return (cp);
 	}
 
 	/* Room for the "/dev/" prefix, the name and the terminating NUL. */
 	pathlen = strlen(cp->provider->name) + strlen("/dev/") + 1;
 	path = kmem_alloc(pathlen, KM_SLEEP);
 	snprintf(path, pathlen, "/dev/%s", cp->provider->name);
 
 	spa_strfree(vd->vdev_path);
 	vd->vdev_path = path;
 
 	ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
 	    (uintmax_t)spa_guid(vd->vdev_spa),
 	    (uintmax_t)vd->vdev_guid, cp->provider->name);
 
 	return (cp);
 }
 
 /*
  * Attach vd to the provider named by vd->vdev_path (with the "/dev/"
  * prefix stripped).  When check_guid is non-zero the provider is only
  * accepted if vdev_attach_ok() rates it FULL_MATCH.  Returns the consumer,
  * or NULL if the provider does not exist, is rejected, or cannot be
  * attached.  Called with the topology lock held.
  */
 static struct g_consumer *
 vdev_geom_open_by_path(vdev_t *vd, int check_guid)
 {
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
 	if (pp == NULL)
 		return (NULL);
 
 	ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
 	if (check_guid && vdev_attach_ok(vd, pp) != FULL_MATCH)
 		return (NULL);
 
 	return (vdev_geom_attach(pp, vd, B_FALSE));
 }
 
 /*
  * Open (or, during a reopen, re-probe) the GEOM provider behind a leaf vdev.
  *
  * On success returns 0 with *psize and *max_psize set to the provider's
  * mediasize, *logical_ashift derived from its sectorsize and
  * *physical_ashift derived from its stripesize (0 when the stripe
  * information is unusable).  It also refreshes vdev_nowritecache,
  * vdev_nonrot, vdev_has_trim and vdev_has_securetrim.
  *
  * Returns EINVAL when vdev_path is missing or not under "/dev/", or when
  * the provider's sector size is unsupported; ENOENT when no provider could
  * be attached; otherwise the g_access() error when opening for write fails.
  */
 static int
 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	int error, has_trim;
 	uint16_t rate;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (EINVAL);
 	}
 
 	/*
 	 * Reopen the device if it's not currently open. Otherwise,
 	 * just update the physical size of the device.
 	 */
 	if ((cp = vd->vdev_tsd) != NULL) {
 		ASSERT(vd->vdev_reopening);
 		goto skip_open;
 	}
 
 	/*
 	 * Set the TLS to indicate downstack that we
 	 * should not access zvols.  This is done only once we are
 	 * committed to tasting, so that the early exits above cannot
 	 * leave the key set for this thread.
 	 */
 	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd));
 
 	DROP_GIANT();
 	g_topology_lock();
 	error = 0;
 
 	if (vd->vdev_spa->spa_is_splitting ||
 	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
 	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
 	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
 		/*
 		 * We are dealing with a vdev that hasn't been previously
 		 * opened (since boot), and we are not loading an
 		 * existing pool configuration.  This looks like a
 		 * vdev add operation to a new or existing pool.
 		 * Assume the user really wants to do this, and find
 		 * GEOM provider by its name, ignoring GUID mismatches.
 		 *
 		 * XXPOLICY: It would be safer to only allow a device
 		 *           that is unlabeled or labeled but missing
 		 *           GUID information to be opened in this fashion,
 		 *           unless we are doing a split, in which case we
 		 *           should allow any guid.
 		 */
 		cp = vdev_geom_open_by_path(vd, 0);
 	} else {
 		/*
 		 * Try using the recorded path for this device, but only
 		 * accept it if its label data contains the expected GUIDs.
 		 */
 		cp = vdev_geom_open_by_path(vd, 1);
 		if (cp == NULL) {
 			/*
 			 * The device at vd->vdev_path doesn't have the
 			 * expected GUIDs. The disks might have merely
 			 * moved around so try all other GEOM providers
 			 * to find one with the right GUIDs.
 			 */
 			cp = vdev_geom_open_by_guids(vd);
 		}
 	}
 
 	/* Clear the TLS now that tasting is done */
 	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL));
 
 	if (cp == NULL) {
 		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
 		error = ENOENT;
 	} else {
 		struct consumer_priv_t *priv;
 		struct consumer_vdev_elem *elem;
 		int spamode;
 
 		/* Record this vdev on the consumer's list of users. */
 		priv = (struct consumer_priv_t *)&cp->private;
 		if (cp->private == NULL)
 			SLIST_INIT(priv);
 		elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
 		elem->vd = vd;
 		SLIST_INSERT_HEAD(priv, elem, elems);
 
 		spamode = spa_mode(vd->vdev_spa);
 		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
 		    !ISP2(cp->provider->sectorsize)) {
 			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
 			    cp->provider->name);
 
 			vdev_geom_close_locked(vd);
 			error = EINVAL;
 			cp = NULL;
 		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
 			int i;
 
 			/*
 			 * Ask for write access, retrying up to five times
 			 * with the topology lock dropped while sleeping.
 			 */
 			for (i = 0; i < 5; i++) {
 				error = g_access(cp, 0, 1, 0);
 				if (error == 0)
 					break;
 				g_topology_unlock();
 				tsleep(vd, 0, "vdev", hz / 2);
 				g_topology_lock();
 			}
 			if (error != 0) {
 				printf("ZFS WARNING: Unable to open %s for "
 				    "writing (error=%d).\n",
 				    cp->provider->name, error);
 				vdev_geom_close_locked(vd);
 				cp = NULL;
 			}
 		}
 	}
 
 	/* Fetch initial physical path information for this device. */
 	if (cp != NULL) {
 		vdev_geom_attrchanged(cp, "GEOM::physpath");
 
 		/* Set other GEOM characteristics */
 		vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
 	}
 
 	g_topology_unlock();
 	PICKUP_GIANT();
 	if (cp == NULL) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
 		    error);
 		return (error);
 	}
 skip_open:
 	pp = cp->provider;
 
 	/*
 	 * Determine the actual size of the device.
 	 */
 	*max_psize = *psize = pp->mediasize;
 
 	/*
 	 * Determine the device's minimum transfer size and preferred
 	 * transfer size.
 	 */
 	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
 	*physical_ashift = 0;
 	if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
 	    ISP2(pp->stripesize) && pp->stripeoffset == 0)
 		*physical_ashift = highbit(pp->stripesize) - 1;
 
 	/*
 	 * Clear the nowritecache settings, so that on a vdev_reopen()
 	 * we will try again.
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
 	/* Inform the ZIO pipeline that we are non-rotational. */
 	error = g_getattr("GEOM::rotation_rate", cp, &rate);
 	if (error == 0 && rate == DISK_RR_NON_ROTATING)
 		vd->vdev_nonrot = B_TRUE;
 	else
 		vd->vdev_nonrot = B_FALSE;
 
 	/* Set when device reports it supports TRIM. */
 	error = g_getattr("GEOM::candelete", cp, &has_trim);
 	vd->vdev_has_trim = (error == 0 && has_trim);
 
 	/* Set when device reports it supports secure TRIM. */
 	/* unavailable on FreeBSD */
 	vd->vdev_has_securetrim = B_FALSE;
 
 	return (0);
 }
 
 /*
  * Close the vdev's GEOM consumer via vdev_geom_close_locked().
  *
  * During a reopen the consumer is kept unless it has been orphaned or its
  * provider reports an error.  The topology lock is taken here only when the
  * caller does not already hold it, and is released again in that case.
  */
 static void
 vdev_geom_close(vdev_t *vd)
 {
 	struct g_consumer *consumer = vd->vdev_tsd;
 	boolean_t had_lock, must_close;
 
 	DROP_GIANT();
 	had_lock = g_topology_locked();
 	if (!had_lock)
 		g_topology_lock();
 
 	if (!vd->vdev_reopening) {
 		must_close = B_TRUE;
 	} else if (consumer == NULL) {
 		must_close = B_FALSE;
 	} else if ((consumer->flags & G_CF_ORPHAN) != 0) {
 		must_close = B_TRUE;
 	} else {
 		must_close = (consumer->provider != NULL &&
 		    consumer->provider->error != 0);
 	}
 	if (must_close)
 		vdev_geom_close_locked(vd);
 
 	if (!had_lock)
 		g_topology_unlock();
 	PICKUP_GIANT();
 }
 
 /*
  * GEOM completion callback installed as bio_done by vdev_geom_io_start().
  *
  * Copies the bio status into the zio (a transfer that left a residual but
  * no error is reported as EIO), records the consequences of ENOTSUP and
  * ENXIO on the vdev, and hands the zio to zio_delay_interrupt().
  */
 static void
 vdev_geom_io_intr(struct bio *bp)
 {
 	zio_t *zio = bp->bio_caller1;
 	vdev_t *vd = zio->io_vd;
 
 	zio->io_error = bp->bio_error;
 	if (zio->io_error == 0 && bp->bio_resid != 0)
 		zio->io_error = SET_ERROR(EIO);
 
 	if (zio->io_error == ENOTSUP) {
 		/*
 		 * An unsupported BIO_FLUSH will never start succeeding, so
 		 * set the persistent flag that makes later flush requests
 		 * be rejected up front.  Nothing is recorded for BIO_DELETE
 		 * or any other command.
 		 */
 		if (bp->bio_cmd == BIO_FLUSH)
 			vd->vdev_nowritecache = B_TRUE;
 	} else if (zio->io_error == ENXIO && !vd->vdev_remove_wanted) {
 		if (bp->bio_to->error != 0) {
 			/*
 			 * If provider's error is set we assume it is being
 			 * removed.
 			 */
 			vd->vdev_remove_wanted = B_TRUE;
 			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 		} else if (!vd->vdev_delayed_close) {
 			vd->vdev_delayed_close = B_TRUE;
 		}
 	}
 
 	/*
 	 * Bio teardown is split in two: the ABD code cannot be called in
 	 * this context, and vdev_op_io_done is not called for
 	 * ZIO_TYPE_IOCTL zio-s.  So anything that is not a read or write
 	 * is destroyed right here.
 	 */
 	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
 		g_destroy_bio(bp);
 		zio->io_bio = NULL;
 	}
 	zio_delay_interrupt(zio);
 }
 
 /*
  * Accumulator handed to vdev_geom_check_unmapped_cb() for each ABD segment.
  */
 struct vdev_geom_check_unmapped_cb_state {
 	/* Running total of pages spanned by the segments seen so far. */
 	int	pages;
 	/* In-page offset at which the most recent segment ended. */
 	uint_t	end;
 };
 
 /*
  * Per-segment callback that validates ABD segment alignment and counts the
  * pages it spans.  GEOM needs the buffer to look virtually contiguous:
  * only the first segment may start mid-page and only the last may end
  * mid-page; every page in between must be full.
  *
  * Returns 0 to keep iterating, or 1 when the segment breaks that rule.
  */
 static int
 vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv)
 {
 	struct vdev_geom_check_unmapped_cb_state *state = priv;
 	vm_offset_t page_off = (vm_offset_t)buf & PAGE_MASK;
 
 	/*
 	 * Reject a later segment that starts mid-page, and any segment
 	 * that follows one which ended mid-page.
 	 */
 	if ((state->pages != 0 && page_off != 0) || state->end != 0)
 		return (1);
 
 	state->end = (page_off + len) & PAGE_MASK;
 	state->pages += (page_off + len + PAGE_MASK) >> PAGE_SHIFT;
 	return (0);
 }
 
 /*
  * Decide whether this ZIO can be issued to the device as unmapped I/O,
  * which avoids copying a scattered and/or gang ABD into a linear buffer.
  *
  * Returns the number of pages the buffer spans when unmapped I/O is usable,
  * or 0 when it is not: unmapped buffers are disabled, the ABD is already
  * linear, the provider lacks G_PF_ACCEPT_UNMAPPED, or a segment fails the
  * alignment check.
  */
 static int
 vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp)
 {
 	struct vdev_geom_check_unmapped_cb_state walk = {
 		.pages = 0,
 		.end = 0
 	};
 	int rejected;
 
 	if (!unmapped_buf_allowed || abd_is_linear(zio->io_abd) ||
 	    (cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0)
 		return (0);
 
 	/* Check the buffer chunks sizes/alignments and count pages. */
 	rejected = abd_iterate_func(zio->io_abd, 0, zio->io_size,
 	    vdev_geom_check_unmapped_cb, &walk);
 
 	return (rejected ? 0 : walk.pages);
 }
 
 /*
  * Per-segment callback that appends the physical pages backing one ABD
  * segment to the bio's bio_ma array.  The first segment also records its
  * starting in-page offset in bio_ma_offset; later segments are asserted to
  * be page aligned.  Always returns 0.
  */
 static int
 vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv)
 {
 	struct bio *bio = priv;
 	vm_offset_t cur = (vm_offset_t)buf;
 	vm_offset_t limit = cur + len;
 
 	if (bio->bio_ma_n != 0) {
 		ASSERT0(P2PHASE(cur, PAGE_SIZE));
 	} else {
 		bio->bio_ma_offset = cur & PAGE_MASK;
 		cur &= ~PAGE_MASK;
 	}
 
 	/* At least one page is always recorded per segment. */
 	do {
 		bio->bio_ma[bio->bio_ma_n] =
 		    PHYS_TO_VM_PAGE(pmap_kextract(cur));
 		bio->bio_ma_n++;
 		cur += PAGE_SIZE;
 	} while (cur < limit);
 
 	return (0);
 }
 
 static void
 vdev_geom_io_start(zio_t *zio)
 {
 	vdev_t *vd;
 	struct g_consumer *cp;
 	struct bio *bp;
 
 	vd = zio->io_vd;
 
-	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
-		} else {
-			switch (zio->io_cmd) {
-			case DKIOCFLUSHWRITECACHE:
-				if (zfs_nocacheflush ||
-				    vdev_geom_bio_flush_disable)
-					break;
-				if (vd->vdev_nowritecache) {
-					zio->io_error = SET_ERROR(ENOTSUP);
-					break;
-				}
-				goto sendreq;
-			default:
-				zio->io_error = SET_ERROR(ENOTSUP);
-			}
 		}
 
-		zio_execute(zio);
-		return;
-	case ZIO_TYPE_TRIM:
-		if (!vdev_geom_bio_delete_disable) {
-			goto sendreq;
+		if (zfs_nocacheflush || vdev_geom_bio_flush_disable) {
+			zio_execute(zio);
+			return;
+		}
+
+		if (vd->vdev_nowritecache) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+			zio_execute(zio);
+			return;
+		}
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+		if (vdev_geom_bio_delete_disable) {
+			zio_execute(zio);
+			return;
 		}
-		zio_execute(zio);
-		return;
-	default:
-			;
-		/* PASSTHROUGH --- placate compiler */
 	}
-sendreq:
+
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM ||
 	    zio->io_type == ZIO_TYPE_IOCTL);
 
 	cp = vd->vdev_tsd;
 	if (cp == NULL) {
 		zio->io_error = SET_ERROR(ENXIO);
 		zio_interrupt(zio);
 		return;
 	}
 	bp = g_alloc_bio();
 	bp->bio_caller1 = zio;
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
 		zio->io_target_timestamp = zio_handle_io_delay(zio);
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		if (zio->io_type == ZIO_TYPE_READ)
 			bp->bio_cmd = BIO_READ;
 		else
 			bp->bio_cmd = BIO_WRITE;
 
 		/*
 		 * If possible, represent scattered and/or gang ABD buffer to
 		 * GEOM as an array of physical pages.  It allows to satisfy
 		 * requirement of virtually contiguous buffer without copying.
 		 */
 		int pgs = vdev_geom_check_unmapped(zio, cp);
 		if (pgs > 0) {
 			bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs,
 			    M_DEVBUF, M_WAITOK);
 			bp->bio_ma_n = 0;
 			bp->bio_ma_offset = 0;
 			abd_iterate_func(zio->io_abd, 0, zio->io_size,
 			    vdev_geom_fill_unmap_cb, bp);
 			bp->bio_data = unmapped_buf;
 			bp->bio_flags |= BIO_UNMAPPED;
 		} else {
 			if (zio->io_type == ZIO_TYPE_READ) {
 				bp->bio_data = abd_borrow_buf(zio->io_abd,
 				    zio->io_size);
 			} else {
 				bp->bio_data = abd_borrow_buf_copy(zio->io_abd,
 				    zio->io_size);
 			}
 		}
 		break;
 	case ZIO_TYPE_TRIM:
 		bp->bio_cmd = BIO_DELETE;
 		bp->bio_data = NULL;
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		break;
 	case ZIO_TYPE_IOCTL:
 		bp->bio_cmd = BIO_FLUSH;
 		bp->bio_data = NULL;
 		bp->bio_offset = cp->provider->mediasize;
 		bp->bio_length = 0;
 		break;
 	default:
 		panic("invalid zio->io_type: %d\n", zio->io_type);
 	}
 	bp->bio_done = vdev_geom_io_intr;
 	zio->io_bio = bp;
 
 	g_io_request(bp, cp);
 }
 
 /*
  * vdev_op_io_done callback: release the buffers and bio that
  * vdev_geom_io_start() set up for a read or write.
  *
  * Other zio types had their bio destroyed in vdev_geom_io_intr(), so
  * io_bio is asserted to be NULL for them.
  */
 static void
 vdev_geom_io_done(zio_t *zio)
 {
 	struct bio *bio = zio->io_bio;
 	const boolean_t is_read = (zio->io_type == ZIO_TYPE_READ);
 
 	if (!is_read && zio->io_type != ZIO_TYPE_WRITE) {
 		ASSERT3P(bio, ==, NULL);
 		return;
 	}
 
 	/* No bio was attached; asserted to happen only with ENXIO. */
 	if (bio == NULL) {
 		ASSERT3S(zio->io_error, ==, ENXIO);
 		return;
 	}
 
 	if (bio->bio_ma != NULL) {
 		/* Unmapped I/O: only the page array needs freeing. */
 		free(bio->bio_ma, M_DEVBUF);
 	} else if (is_read) {
 		abd_return_buf_copy(zio->io_abd, bio->bio_data,
 		    zio->io_size);
 	} else {
 		abd_return_buf(zio->io_abd, bio->bio_data,
 		    zio->io_size);
 	}
 
 	g_destroy_bio(bio);
 	zio->io_bio = NULL;
 }
 
 /*
  * Registered as vdev_op_hold in vdev_disk_ops; the body is empty.
  */
 static void
 vdev_geom_hold(vdev_t *vd)
 {
 }
 
 /*
  * Registered as vdev_op_rele in vdev_disk_ops; the body is empty.
  */
 static void
 vdev_geom_rele(vdev_t *vd)
 {
 }
 
 /*
  * Operations vector for VDEV_TYPE_DISK leaf vdevs.  Open, close, I/O start,
  * I/O done, hold and rele are served by the vdev_geom_* functions in this
  * file; asize, min_asize and xlate use the vdev_default_* implementations,
  * and the remaining entries are NULL.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_geom_open,
 	.vdev_op_close = vdev_geom_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_geom_io_start,
 	.vdev_op_io_done = vdev_geom_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_geom_hold,
 	.vdev_op_rele = vdev_geom_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index f3f0c0875210..554ed22b9df8 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1,1667 +1,1658 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <linux/blkpg.h>
 #include <linux/msdos_fs.h>
 #include <linux/vfs_compat.h>
 #ifdef HAVE_LINUX_BLK_CGROUP_HEADER
 #include <linux/blk-cgroup.h>
 #endif
 
 /*
  * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
  * block_device. Since it carries the block_device inside, it's convenient to
  * just use the handle as a proxy.
  *
  * Linux 6.9.x uses a file for the same purpose.
  *
  * For pre-6.8, we just emulate this with a cast, since we don't need any of
  * the other fields inside the handle.
  */
 #if defined(HAVE_BDEV_OPEN_BY_PATH)
 typedef struct bdev_handle zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((bdh)->bdev)
 #define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
 typedef struct file zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		(file_bdev(bdh))
 #define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #else
 typedef void zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((struct block_device *)bdh)
 #define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #endif
 
 /*
  * Per-vdev private state; vdev_disk_open() stores a pointer to it in the
  * vdev's vdev_tsd and vdev_disk_close() frees it.
  */
 typedef struct vdev_disk {
 	/* Handle for the open block device, or NULL when not open. */
 	zfs_bdev_handle_t		*vd_bdh;
 	/* Taken as writer by vdev_disk_open() while vd_bdh is replaced. */
 	krwlock_t			vd_lock;
 } vdev_disk_t;
 
 /*
  * Maximum number of segments to add to a bio (min 4). If this is higher than
  * the maximum allowed by the device queue or the kernel itself, it will be
  * clamped. Setting it to zero will cause the kernel's ideal size to be used.
  */
 uint_t zfs_vdev_disk_max_segs = 0;
 
 /*
  * Unique identifier for the exclusive vdev holder.
  */
 static void *zfs_vdev_holder = VDEV_HOLDER;
 
 /*
  * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
  * device is missing. The missing path may be transient since the links
  * can be briefly removed and recreated in response to udev events.
  */
 static uint_t zfs_vdev_open_timeout_ms = 1000;
 
 /*
  * Size of the "reserved" partition, in blocks.
  */
 #define	EFI_MIN_RESV_SIZE	(16 * 1024)
 
 /*
  * BIO request failfast mask.
  */
 
 static unsigned int zfs_vdev_failfast_mask = 1;
 
 /*
  * Convert SPA mode flags into bdev open mode flags.
  */
 #ifdef HAVE_BLK_MODE_T
 typedef blk_mode_t vdev_bdev_mode_t;
 #define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
 #define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
 #define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
 #define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
 #else
 typedef fmode_t vdev_bdev_mode_t;
 #define	VDEV_BDEV_MODE_READ	FMODE_READ
 #define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
 #define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
 #define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
 #endif
 
 /*
  * Translate an SPA open mode into the matching block-device open mode.
  * The result always requests exclusive access and adds read and/or write
  * according to smode.  smode must be initialised and contain nothing but
  * SPA_MODE_READ and SPA_MODE_WRITE (asserted).
  */
 static vdev_bdev_mode_t
 vdev_bdev_mode(spa_mode_t smode)
 {
 	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
 	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));
 
 	vdev_bdev_mode_t open_mode = VDEV_BDEV_MODE_EXCL;
 
 	open_mode |= (smode & SPA_MODE_READ) ? VDEV_BDEV_MODE_READ : 0;
 	open_mode |= (smode & SPA_MODE_WRITE) ? VDEV_BDEV_MODE_WRITE : 0;
 
 	/* Sanity: something is set, and nothing outside the known mask. */
 	ASSERT(open_mode & VDEV_BDEV_MODE_MASK);
 	ASSERT0(open_mode & ~VDEV_BDEV_MODE_MASK);
 
 	return (open_mode);
 }
 
 /*
  * Usable capacity of the partition or disk, in bytes, as reported by
  * i_size_read() on the block device's inode.
  */
 static uint64_t
 bdev_capacity(struct block_device *bdev)
 {
 	struct inode *bd_ino = bdev->bd_inode;
 
 	return (i_size_read(bd_ino));
 }
 
 /*
  * Fallback used when HAVE_BDEV_WHOLE is not defined: return the device
  * recorded in bdev->bd_contains.
  */
 #if !defined(HAVE_BDEV_WHOLE)
 static inline struct block_device *
 bdev_whole(struct block_device *bdev)
 {
 	return (bdev->bd_contains);
 }
 #endif
 
 /*
  * Write the block device's name into `name`.  Uses bdevname() when
  * HAVE_BDEVNAME is defined; otherwise formats it with the "%pg" printk
  * specifier.  The fallback writes at most BDEVNAME_SIZE bytes, so `name`
  * must be at least that large.
  */
 #if defined(HAVE_BDEVNAME)
 #define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
 #else
 static inline void
 vdev_bdevname(struct block_device *bdev, char *name)
 {
 	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
 }
 #endif
 
 /*
  * Returns the maximum expansion capacity of the block device (in bytes).
  *
  * It is possible to expand a vdev when it has been created as a wholedisk
  * and the containing block device has increased in capacity.  Or when the
  * partition containing the pool has been manually increased in size.
  *
  * This function is only responsible for calculating the potential expansion
  * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
  * responsible for verifying the expected partition layout in the wholedisk
  * case, and updating the partition table if appropriate.  Once the partition
  * size has been increased the additional capacity will be visible using
  * bdev_capacity().
  *
  * The returned maximum expansion capacity is always expected to be larger, or
  * at the very least equal, to its usable capacity to prevent overestimating
  * the pool expandsize.
  */
 static uint64_t
 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
 {
 	uint64_t psize;
 	int64_t available;
 
 	if (wholedisk && bdev != bdev_whole(bdev)) {
 		/*
 		 * When reporting maximum expansion capacity for a wholedisk
 		 * deduct any capacity which is expected to be lost due to
 		 * alignment restrictions.  Over reporting this value isn't
 		 * harmful and would only result in slightly less capacity
 		 * than expected post expansion.
 		 * The estimated available space may be slightly smaller than
 		 * bdev_capacity() for devices where the number of sectors is
 		 * not a multiple of the alignment size and the partition layout
 		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
 		 * "reserved" EFI partition: in such cases return the device
 		 * usable capacity.
 		 */
 		available = i_size_read(bdev_whole(bdev)->bd_inode) -
 		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
 		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
 
 		/*
 		 * Do not use MAX() here: it would compare the signed
 		 * estimate against the unsigned capacity as unsigned, so a
 		 * negative estimate (whole disk smaller than the reserved
 		 * overhead) would win as an enormous value.
 		 */
 		psize = bdev_capacity(bdev);
 		if (available > 0 && (uint64_t)available > psize)
 			psize = (uint64_t)available;
 	} else {
 		psize = bdev_capacity(bdev);
 	}
 
 	return (psize);
 }
 
 /*
  * Log a one-line warning describing a failed zio: pool name, vdev path,
  * error, type, offset, size and flags.
  */
 static void
 vdev_disk_error(zio_t *zio)
 {
 	/*
 	 * This function can be called in interrupt context, for instance while
 	 * handling IRQs coming from a misbehaving disk device; use printk()
 	 * which is safe from any context.
 	 *
 	 * Every value printed with %llu is cast to u_longlong_t so the
 	 * argument type matches the conversion regardless of how the
 	 * underlying typedef is defined.
 	 */
 	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
 	    "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa),
 	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
 	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
 	    (u_longlong_t)zio->io_flags);
 }
 
 /*
  * Signal a kobject event for the vdev's block device through
  * spl_signal_kobj_evt().  When the vdev has no private state or no open
  * handle, only a debug message is logged.
  */
 static void
 vdev_disk_kobj_evt_post(vdev_t *v)
 {
 	vdev_disk_t *disk = v->vdev_tsd;
 
 	if (disk == NULL || disk->vd_bdh == NULL) {
 		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
 		    v->vdev_path);
 		return;
 	}
 
 	spl_signal_kobj_evt(BDH_BDEV(disk->vd_bdh));
 }
 
 /*
  * Open the block device at `path` with the bdev mode derived from `smode`
  * and `holder` as the holder, using whichever open interface the build
  * detected.  The callers in this file test the result with BDH_IS_ERR()
  * rather than comparing against NULL.
  */
 static zfs_bdev_handle_t *
 vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
 {
 	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
 
 	/* Newest interface first; the last branch is the 3-argument form. */
 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
 	return (bdev_file_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BDEV_OPEN_BY_PATH)
 	return (bdev_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
 	return (blkdev_get_by_path(path, bmode, holder, NULL));
 #else
 	return (blkdev_get_by_path(path, bmode, holder));
 #endif
 }
 
 /*
  * Release a handle obtained from vdev_blkdev_get_by_path() using whichever
  * release interface the build detected.  `holder` is used only by the
  * holder-based blkdev_put() and `smode` only by the mode-based one.
  */
 static void
 vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
 {
 	/*
 	 * These are plain calls rather than "return (expr)": a return
 	 * statement with an expression is not permitted in a function
 	 * whose return type is void.
 	 */
 #if defined(HAVE_BDEV_RELEASE)
 	bdev_release(bdh);
 #elif defined(HAVE_BLKDEV_PUT_HOLDER)
 	blkdev_put(BDH_BDEV(bdh), holder);
 #elif defined(HAVE_BLKDEV_PUT)
 	blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode));
 #else
 	fput(bdh);
 #endif
 }
 
 /*
  * Open (or reopen) the block device at v->vdev_path and report its size
  * and block-size properties.
  *
  * On success returns 0 with *psize set to the device capacity, *max_psize
  * to the potential capacity after expansion, and *logical_ashift and
  * *physical_ashift derived from the device's logical and physical block
  * sizes (never below SPA_MINBLOCKSIZE).  It also refreshes
  * vdev_nowritecache, vdev_has_trim, vdev_has_securetrim and vdev_nonrot.
  *
  * Returns EINVAL when vdev_path is missing or not absolute, otherwise the
  * positive errno from the failed open.  On an open failure the vdev_disk_t
  * is left attached to v->vdev_tsd with a NULL handle.
  */
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	zfs_bdev_handle_t *bdh;
 	spa_mode_t smode = spa_mode(v->vdev_spa);
 	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
 	vdev_disk_t *vd;
 
 	/* Must have a pathname and it must be absolute. */
 	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
 		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		vdev_dbgmsg(v, "invalid vdev_path");
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it is currently open.  When expanding a
 	 * partition force re-scanning the partition table if userland
 	 * did not take care of this already. We need to do this while closed
 	 * in order to get an accurate updated block device size.  Then
 	 * since udev may need to recreate the device links increase the
 	 * open retry timeout before reporting the device as unavailable.
 	 */
 	vd = v->vdev_tsd;
 	if (vd) {
 		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
 		boolean_t reread_part = B_FALSE;
 
 		rw_enter(&vd->vd_lock, RW_WRITER);
 		bdh = vd->vd_bdh;
 		vd->vd_bdh = NULL;
 
 		if (bdh) {
 			struct block_device *bdev = BDH_BDEV(bdh);
 			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
 				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
 				/*
 				 * If userland has BLKPG_RESIZE_PARTITION,
 				 * then it should have updated the partition
 				 * table already. We can detect this by
 				 * comparing our current physical size
 				 * with that of the device. If they are
 				 * the same, then we must not have
 				 * BLKPG_RESIZE_PARTITION or it failed to
 				 * update the partition table online. We
 				 * fallback to rescanning the partition
 				 * table from the kernel below. However,
 				 * if the capacity already reflects the
 				 * updated partition, then we skip
 				 * rescanning the partition table here.
 				 */
 				if (v->vdev_psize == bdev_capacity(bdev))
 					reread_part = B_TRUE;
 			}
 
 			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 		}
 
 		if (reread_part) {
 			bdh = vdev_blkdev_get_by_path(disk_name, smode,
 			    zfs_vdev_holder);
 			if (!BDH_IS_ERR(bdh)) {
 				int error =
 				    vdev_bdev_reread_part(BDH_BDEV(bdh));
 				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 				if (error == 0) {
 					timeout = MSEC2NSEC(
 					    zfs_vdev_open_timeout_ms * 2);
 				}
 			}
 		}
 	} else {
 		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
 
 		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
 		rw_enter(&vd->vd_lock, RW_WRITER);
 	}
 
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
 	 * then drives may be re-cabled without an issue.  If the provided
 	 * path is a udev by-path path, then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
 	 * locations to maximize the systems tolerance to component failure.
 	 *
 	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
 	 * /dev/[hd]d devices which may be reordered due to probing order.
 	 * Devices in the wrong locations will be detected by the higher
 	 * level vdev validation.
 	 *
 	 * The specified paths may be briefly removed and recreated in
 	 * response to udev events.  This should be exceptionally unlikely
 	 * because the zpool command makes every effort to verify these paths
 	 * have already settled prior to reaching this point.  Therefore,
 	 * a ENOENT failure at this point is highly likely to be transient
 	 * and it is reasonable to sleep and retry before giving up.  In
 	 * practice delays have been observed to be on the order of 100ms.
 	 *
 	 * When ERESTARTSYS is returned it indicates the block device is
 	 * a zvol which could not be opened due to the deadlock detection
 	 * logic in zvol_open().  Extend the timeout and retry the open;
 	 * subsequent attempts are expected to eventually succeed.
 	 */
 	hrtime_t start = gethrtime();
 	bdh = BDH_ERR_PTR(-ENXIO);
 	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
 		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
 		    zfs_vdev_holder);
 		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
 			/*
 			 * There is no point in waiting since the device
 			 * was removed explicitly.
 			 */
 			if (v->vdev_removed)
 				break;
 
 			/*
 			 * Plain schedule_timeout() does not sleep while the
 			 * task is still TASK_RUNNING; use the variant that
 			 * sets the task state first so this really backs
 			 * off instead of spinning on the open.
 			 */
 			schedule_timeout_interruptible(MSEC_TO_TICK(10));
 		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
 			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
 			continue;
 		} else if (BDH_IS_ERR(bdh)) {
 			break;
 		}
 	}
 
 	if (BDH_IS_ERR(bdh)) {
 		int error = -BDH_PTR_ERR(bdh);
 		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
 		    (u_longlong_t)(gethrtime() - start),
 		    (u_longlong_t)timeout);
 		vd->vd_bdh = NULL;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
 		return (SET_ERROR(error));
 	} else {
 		vd->vd_bdh = bdh;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
 	}
 
 	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 
 	/*  Determine the physical block size */
 	int physical_block_size = bdev_physical_block_size(bdev);
 
 	/*  Determine the logical block size */
 	int logical_block_size = bdev_logical_block_size(bdev);
 
 	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
 	v->vdev_nowritecache = B_FALSE;
 
 	/* Set when device reports it supports TRIM. */
 	v->vdev_has_trim = bdev_discard_supported(bdev);
 
 	/* Set when device reports it supports secure TRIM. */
 	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);
 
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
 
 	/* Physical volume size in bytes for the partition */
 	*psize = bdev_capacity(bdev);
 
 	/* Physical volume size in bytes including possible expansion space */
 	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);
 
 	/* Based on the minimum sector size set the block size */
 	*physical_ashift = highbit64(MAX(physical_block_size,
 	    SPA_MINBLOCKSIZE)) - 1;
 
 	*logical_ashift = highbit64(MAX(logical_block_size,
 	    SPA_MINBLOCKSIZE)) - 1;
 
 	return (0);
 }
 
 /*
  * Release the block device and free the vdev's private state.  Nothing is
  * done while the vdev is being reopened or when it has no private state.
  */
 static void
 vdev_disk_close(vdev_t *v)
 {
 	vdev_disk_t *disk = v->vdev_tsd;
 
 	if (disk == NULL || v->vdev_reopening)
 		return;
 
 	/* The handle may be NULL if the last open attempt failed. */
 	if (disk->vd_bdh != NULL) {
 		vdev_blkdev_put(disk->vd_bdh, spa_mode(v->vdev_spa),
 		    zfs_vdev_holder);
 	}
 
 	rw_destroy(&disk->vd_lock);
 	kmem_free(disk, sizeof (*disk));
 	v->vdev_tsd = NULL;
 }
 
 /*
  * Submit a bio using whichever submit_bio() signature the build detected.
  * The return value of submit_bio() is discarded.
  */
 static inline void
 vdev_submit_bio_impl(struct bio *bio)
 {
 #ifdef HAVE_1ARG_SUBMIT_BIO
 	(void) submit_bio(bio);
 #else
 	/* Two-argument form: the data direction is passed explicitly. */
 	(void) submit_bio(bio_data_dir(bio), bio);
 #endif
 }
 
 /*
  * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
  * replace it with preempt_schedule under the following condition:
  */
 #if defined(CONFIG_ARM64) && \
     defined(CONFIG_PREEMPTION) && \
     defined(CONFIG_BLK_CGROUP)
 #define	preempt_schedule_notrace(x) preempt_schedule(x)
 #endif
 
 /*
  * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
  * as an argument removing the need to set it with bio_set_dev().  This
  * removes the need for all of the following compatibility code.
  */
 #if !defined(HAVE_BIO_ALLOC_4ARG)
 
 #ifdef HAVE_BIO_SET_DEV
 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
 /*
  * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
  * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
  * As a side effect the function was converted to GPL-only.  Define our
  * own version when needed which uses rcu_read_lock_sched().
  *
  * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
  * part, moving blkg_tryget into the private one. Define our own version.
  */
 #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
 /*
  * Try to take a reference on blkg->refcnt under rcu_read_lock_sched().
  * Returns true when a reference was taken.
  */
 static inline bool
 vdev_blkg_tryget(struct blkcg_gq *blkg)
 {
 	struct percpu_ref *ref = &blkg->refcnt;
 	unsigned long __percpu *count;
 	bool rc;
 
 	rcu_read_lock_sched();
 
 	if (__ref_is_percpu(ref, &count)) {
 		/* Per-CPU counter available: increment it and succeed. */
 		this_cpu_inc(*count);
 		rc = true;
 	} else {
 		/* Otherwise succeed only if the shared count is non-zero. */
 #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
 		rc = atomic_long_inc_not_zero(&ref->data->count);
 #else
 		rc = atomic_long_inc_not_zero(&ref->count);
 #endif
 	}
 
 	rcu_read_unlock_sched();
 
 	return (rc);
 }
 #else
 #define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
 #endif
 #ifdef HAVE_BIO_SET_DEV_MACRO
 /*
  * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
  * GPL-only bio_associate_blkg() symbol thus inadvertently converting
  * the entire macro.  Provide a minimal version which always assigns the
  * request queue's root_blkg to the bio.
  */
 /*
  * Associate the bio with its request queue's root_blkg, provided one exists
  * and a reference on it can be taken; otherwise bi_blkg is left NULL.
  * The bio must not already have a blkg (asserted).
  */
 static inline void
 vdev_bio_associate_blkg(struct bio *bio)
 {
 	/* The queue is reached through whichever bio field this build has. */
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 #else
 	struct request_queue *q = bio->bi_disk->queue;
 #endif
 
 	ASSERT3P(q, !=, NULL);
 	ASSERT3P(bio->bi_blkg, ==, NULL);
 
 	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
 		bio->bi_blkg = q->root_blkg;
 }
 
 #define	bio_associate_blkg vdev_bio_associate_blkg
 #else
 /*
  * Replacement for bio_set_dev(): point the bio at bdev, clear
  * BIO_REMAPPED (and BIO_THROTTLED when the device changes), then associate
  * the bio with the request queue's root_blkg if a reference on it can be
  * taken.  The bio must not already have a blkg (asserted).
  */
 static inline void
 vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
 {
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *q = bdev->bd_disk->queue;
 #else
 	struct request_queue *q = bio->bi_disk->queue;
 #endif
 	bio_clear_flag(bio, BIO_REMAPPED);
 	if (bio->bi_bdev != bdev)
 		bio_clear_flag(bio, BIO_THROTTLED);
 	bio->bi_bdev = bdev;
 
 	ASSERT3P(q, !=, NULL);
 	ASSERT3P(bio->bi_blkg, ==, NULL);
 
 	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
 		bio->bi_blkg = q->root_blkg;
 }
 #define	bio_set_dev		vdev_bio_set_dev
 #endif
 #endif
 #else
 /*
  * Provide a bio_set_dev() helper function for pre-Linux 4.14 kernels.
  * It only stores bdev in the bio's bi_bdev field.
  */
 static inline void
 bio_set_dev(struct bio *bio, struct block_device *bdev)
 {
 	bio->bi_bdev = bdev;
 }
 #endif /* HAVE_BIO_SET_DEV */
 #endif /* !HAVE_BIO_ALLOC_4ARG */
 
 /*
  * Submit a bio with the current task's bio_list temporarily cleared, then
  * put the saved list back.
  *
  * NOTE(review): presumably this keeps the block layer from queueing the bio
  * on the caller's pending list instead of issuing it now -- confirm
  * against the kernel's submit_bio() handling of current->bio_list.
  */
 static inline void
 vdev_submit_bio(struct bio *bio)
 {
 	struct bio_list *bio_list = current->bio_list;
 	current->bio_list = NULL;
 	vdev_submit_bio_impl(bio);
 	current->bio_list = bio_list;
 }
 
 /*
  * Allocate a bio with room for nr_vecs segments, bound to bdev.
  * Returns the new bio, or NULL if bio_alloc() returned NULL.
  */
 static inline struct bio *
 vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
     unsigned short nr_vecs)
 {
 	struct bio *bio;
 
 #ifdef HAVE_BIO_ALLOC_4ARG
 	/* The 4-argument bio_alloc() takes the block device directly. */
 	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
 #else
 	/* Otherwise the device is attached separately after allocation. */
 	bio = bio_alloc(gfp_mask, nr_vecs);
 	if (likely(bio != NULL))
 		bio_set_dev(bio, bdev);
 #endif
 
 	return (bio);
 }
 
 /*
  * Compute the maximum number of segments to place in a single bio for
  * bdev, from the device queue limit and the zfs_vdev_disk_max_segs tunable.
  */
 static inline uint_t
 vdev_bio_max_segs(struct block_device *bdev)
 {
 	/*
 	 * Smallest of the device max segs and the tuneable max segs. Minimum
 	 * 4, so there's room to finish split pages if they come up.
 	 */
 	const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
 	/* A tunable of 0 means "no override": fall back to the device limit. */
 	const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
 	    MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
 	const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
 
 	/* Finally cap the result using whichever kernel limit is available. */
 #ifdef HAVE_BIO_MAX_SEGS
 	return (bio_max_segs(max_segs));
 #else
 	return (MIN(max_segs, BIO_MAX_PAGES));
 #endif
 }
 
 /*
  * Return the queue's maximum request size for bdev, converted from
  * 512-byte sectors to bytes.
  */
 static inline uint_t
 vdev_bio_max_bytes(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
 	/* Shifting by 9 multiplies the sector count by 512. */
 	return (queue_max_sectors(q) << 9);
 }
 
 
 /*
  * Virtual block IO object (VBIO)
  *
  * Linux block IO (BIO) objects have a limit on how many data segments (pages)
  * they can hold. Depending on how they're allocated and structured, a large
  * ZIO can require more than one BIO to be submitted to the kernel, which then
  * all have to complete before we can return the completed ZIO back to ZFS.
  *
  * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
  * translate a ZIO down into the kernel block layer and back again.
  *
  * Note that these are only used for data ZIOs (read/write). Meta-operations
  * (flush/trim) don't need multiple BIOs and so can just make the call
  * directly.
  */
 typedef struct {
 	zio_t		*vbio_zio;	/* parent zio */
 
 	struct block_device *vbio_bdev;	/* blockdev to submit bios to */
 
 	/*
 	 * ABD wrapping a borrowed linear buffer; NULL when the zio's own ABD
 	 * is submitted directly.
 	 */
 	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
 
 	uint_t		vbio_max_segs;	/* max segs per bio */
 
 	uint_t		vbio_max_bytes;	/* max bytes per bio */
 	/* ~(logical block size - 1): rounds a byte count down to a block. */
 	uint_t		vbio_lbs_mask;	/* logical block size mask */
 
 	/* Device byte offset; advanced by the size of each filled bio. */
 	uint64_t	vbio_offset;	/* start offset of next bio */
 
 	struct bio	*vbio_bio;	/* pointer to the current bio */
 	int		vbio_flags;	/* bio flags */
 } vbio_t;
 
 /*
  * Allocate and initialise a vbio for the given zio and block device.
  * The per-bio limits are sampled from the device here; no bio is
  * allocated yet and no borrowed ABD is attached.
  */
 static vbio_t *
 vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
 {
 	vbio_t *v;
 
 	v = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
 
 	/* Identity: who we work for and where the bios go. */
 	v->vbio_zio = zio;
 	v->vbio_bdev = bdev;
 	v->vbio_flags = flags;
 
 	/* Nothing borrowed, nothing in flight yet. */
 	v->vbio_abd = NULL;
 	v->vbio_bio = NULL;
 
 	/* Per-bio limits taken from the block device. */
 	v->vbio_max_segs = vdev_bio_max_segs(bdev);
 	v->vbio_max_bytes = vdev_bio_max_bytes(bdev);
 	v->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
 
 	/* The first bio starts where the zio starts. */
 	v->vbio_offset = zio->io_offset;
 
 	return (v);
 }
 
 BIO_END_IO_PROTO(vbio_completion, bio, error);
 
 /*
  * Add size bytes of page, starting at offset within the page, to the vbio.
  * Data goes into the current bio while it fits; when it does not, a new bio
  * is allocated, the previous one is chained to it and submitted, and filling
  * continues. Always returns 0.
  */
 static int
 vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 {
 	struct bio *bio = vbio->vbio_bio;
 	uint_t ssize;
 
 	while (size > 0) {
 		if (bio == NULL) {
 			/* New BIO, allocate and set up */
 			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
 			    vbio->vbio_max_segs);
 			VERIFY(bio);
 
 			/* Byte offset converted to 512-byte sectors. */
 			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
 			bio_set_op_attrs(bio,
 			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
 			    WRITE : READ, vbio->vbio_flags);
 
 			/*
 			 * If there is a previous (full) bio, chain it to the
 			 * new one and submit it now; the new bio then becomes
 			 * the current one.
 			 */
 			if (vbio->vbio_bio) {
 				bio_chain(vbio->vbio_bio, bio);
 				vdev_submit_bio(vbio->vbio_bio);
 			}
 			vbio->vbio_bio = bio;
 		}
 
 		/*
 		 * Only load as much of the current page data as will fit in
 		 * the space left in the BIO, respecting lbs alignment. Older
 		 * kernels will error if we try to overfill the BIO, while
 		 * newer ones will accept it and split the BIO. This ensures
 		 * everything works on older kernels, and avoids an additional
 		 * overhead on the new.
 		 */
 		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
 		    vbio->vbio_lbs_mask);
 		if (ssize > 0 &&
 		    bio_add_page(bio, page, ssize, offset) == ssize) {
 			/* Accepted, adjust and load any remaining. */
 			size -= ssize;
 			offset += ssize;
 			continue;
 		}
 
 		/* No room, set up for a new BIO and loop */
 		vbio->vbio_offset += BIO_BI_SIZE(bio);
 
 		/* Signal new BIO allocation wanted */
 		bio = NULL;
 	}
 
 	return (0);
 }
 
 /* Iterator callback to submit ABD pages to the vbio. */
 static int
 vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 {
 	/*
 	 * priv is the vbio being filled; forward the page span to
 	 * vbio_add_page(), which takes (size, offset) in that order.
 	 */
 	return (vbio_add_page((vbio_t *)priv, page, len, off));
 }
 
 /* Create some BIOs, fill them with data and submit them */
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
 	struct blk_plug plug;
 	struct bio *last;
 
 	/*
 	 * Hold a plug across the whole build-and-submit sequence so that
 	 * BIOs submitted along the way are only released to the device once
 	 * every BIO has been created and submitted. Without the plug the
 	 * kernel may begin executing early BIOs while later ones are still
 	 * being built, and if the device disappears in the middle of that,
 	 * older kernels can get confused and trample memory.
 	 */
 	blk_start_plug(&plug);
 
 	/* Walk the ABD's pages; full BIOs are chained and submitted as we go */
 	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
 	ASSERT(vbio->vbio_bio);
 
 	/* The final BIO carries the completion callback and the vbio */
 	last = vbio->vbio_bio;
 	last->bi_end_io = vbio_completion;
 	last->bi_private = vbio;
 
 	/*
 	 * From here on the final BIO owns the vbio via bi_private. It can
 	 * complete, run vbio_completion() and free the vbio before this task
 	 * runs again, so the vbio must not be touched after this call.
 	 */
 	vdev_submit_bio(last);
 
 	blk_finish_plug(&plug);
 }
 
 /* IO completion callback */
 BIO_END_IO_PROTO(vbio_completion, bio, error)
 {
 	/*
 	 * Runs when the final bio of a vbio completes. Records the result in
 	 * the parent zio, releases the bio, returns any borrowed buffer to
 	 * the zio's ABD, frees the vbio and hands the zio back to ZFS.
 	 */
 	vbio_t *vbio = bio->bi_private;
 	zio_t *zio = vbio->vbio_zio;
 
 	ASSERT(zio);
 
 	/* Capture and log any errors */
 #ifdef HAVE_1ARG_BIO_END_IO_T
 	zio->io_error = BIO_END_IO_ERROR(bio);
 #else
 	zio->io_error = 0;
 	if (error)
 		zio->io_error = -(error);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		zio->io_error = EIO;
 #endif
 	/*
 	 * Signed comparison: an unsigned one can never fail, so it would not
 	 * catch a negative errno leaking through. This matches the other
 	 * completion handlers in this file.
 	 */
 	ASSERT3S(zio->io_error, >=, 0);
 
 	if (zio->io_error)
 		vdev_disk_error(zio);
 
 	/* Return the BIO to the kernel */
 	bio_put(bio);
 
 	/*
 	 * If we copied the ABD before issuing it, clean up and return the copy
 	 * to the ABD, with changes if appropriate.
 	 */
 	if (vbio->vbio_abd != NULL) {
 		/* Grab the raw buffer before the wrapper ABD is freed. */
 		void *buf = abd_to_buf(vbio->vbio_abd);
 		abd_free(vbio->vbio_abd);
 		vbio->vbio_abd = NULL;
 
 		/* Reads copy the data back; writes just return the buffer. */
 		if (zio->io_type == ZIO_TYPE_READ)
 			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
 		else
 			abd_return_buf(zio->io_abd, buf, zio->io_size);
 	}
 
 	/* Final cleanup */
 	kmem_free(vbio, sizeof (vbio_t));
 
 	/* All done, submit for processing */
 	zio_delay_interrupt(zio);
 }
 
 /*
  * Iterator callback to count ABD pages and check their size & alignment.
  *
  * On Linux, each BIO segment can take a page pointer, and an offset+length of
  * the data within that page. A page can be arbitrarily large ("compound"
  * pages) but we still have to ensure the data portion is correctly sized and
  * aligned to the logical block size, to ensure that if the kernel wants to
  * split the BIO, the two halves will still be properly aligned.
  *
  * NOTE: if you change this function, change the copy in
  * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
  * data there to validate the change you're making.
  *
  */
 /* State carried between calls of vdev_disk_check_pages_cb(). */
 typedef struct {
 	/* Logical block size minus one; masks the offset within a block. */
 	uint_t  bmask;
 	/* Number of page segments accepted so far. */
 	uint_t  npages;
 	/*
 	 * Offset within a block at which the previous segment ended; nonzero
 	 * means it did not finish on a block boundary.
 	 */
 	uint_t  end;
 } vdev_disk_check_pages_t;
 
 /*
  * Per-segment callback: returns 0 if the segment (off, len) keeps the ABD
  * submittable as-is, or 1 to abort the iteration. The page pointer itself
  * is not inspected; priv is a vdev_disk_check_pages_t.
  */
 static int
 vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
 {
 	(void) page;
 	vdev_disk_check_pages_t *s = priv;
 
 	/*
 	 * If we didn't finish on a block size boundary last time, then there
 	 * would be a gap if we tried to use this ABD as-is, so abort.
 	 */
 	if (s->end != 0)
 		return (1);
 
 	/*
 	 * Note if we're taking less than a full block, so we can check it
 	 * above on the next call.
 	 */
 	s->end = (off+len) & s->bmask;
 
 	/* All blocks after the first must start on a block size boundary. */
 	if (s->npages != 0 && (off & s->bmask) != 0)
 		return (1);
 
 	/* Segment accepted; count it. */
 	s->npages++;
 	return (0);
 }
 
 /*
  * Check if we can submit the pages in this ABD to the kernel as-is. Returns
  * B_TRUE if it can, or B_FALSE if it can't be submitted like this.
  */
 static boolean_t
 vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
 {
 	/* Fresh iteration state: block mask set, nothing seen yet. */
 	vdev_disk_check_pages_t state = {
 	    .bmask = bdev_logical_block_size(bdev)-1,
 	    .npages = 0,
 	    .end = 0,
 	};
 	int aborted;
 
 	/* A nonzero result means the callback rejected some segment. */
 	aborted = abd_iterate_page_func(abd, 0, size,
 	    vdev_disk_check_pages_cb, &state);
 
 	return (aborted ? B_FALSE : B_TRUE);
 }
 
 /*
  * Issue a read or write zio to the block device using the vbio path.
  *
  * Returns EIO if the request extends past the end of the device.
  * Otherwise the data is submitted and 0 is returned; the result is then
  * delivered asynchronously through vbio_completion().
  */
 static int
 vdev_disk_io_rw(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 	int flags = 0;
 
 	/*
 	 * Accessing outside the block device is never allowed. The size is
 	 * read with i_size_read(), the same accessor used for the message
 	 * below, rather than by touching i_size directly.
 	 */
 	if (zio->io_offset + zio->io_size > i_size_read(bdev->bd_inode)) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
 		    (u_longlong_t)zio->io_offset,
 		    (u_longlong_t)zio->io_size,
 		    (u_longlong_t)i_size_read(bdev->bd_inode));
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * Request failfast behaviour only on first attempts (not retries or
 	 * try-hard IO), and only when the vdev has failfast enabled.
 	 */
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 	    v->vdev_failfast == B_TRUE) {
 		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
 		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
 	}
 
 	/*
 	 * Check alignment of the incoming ABD. If any part of it would require
 	 * submitting a page that is not aligned to the logical block size,
 	 * then we take a copy into a linear buffer and submit that instead.
 	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
 	 * usually requiring abnormally-small data blocks (eg gang blocks)
 	 * mixed into the same ABD as larger ones (eg aggregated).
 	 */
 	abd_t *abd = zio->io_abd;
 	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
 		void *buf;
 		/* Writes need the data copied in; reads only need space. */
 		if (zio->io_type == ZIO_TYPE_READ)
 			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
 		else
 			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 
 		/*
 		 * Wrap the copy in an abd_t, so we can use the same iterators
 		 * to count and fill the vbio later.
 		 */
 		abd = abd_get_from_buf(buf, zio->io_size);
 
 		/*
 		 * False here would mean the borrowed copy has an invalid
 		 * alignment too, which would mean we've somehow been passed a
 		 * linear ABD with an interior page that has a non-zero offset
 		 * or a size not a multiple of PAGE_SIZE. This is not possible.
 		 * It would mean either zio_buf_alloc() or its underlying
 		 * allocators have done something extremely strange, or our
 		 * math in vdev_disk_check_pages() is wrong. In either case,
 		 * something is seriously wrong and it's not safe to continue.
 		 */
 		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
 	}
 
 	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
 	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
 	if (abd != zio->io_abd)
 		vbio->vbio_abd = abd;
 
 	/* Fill it with data pages and submit it to the kernel */
 	vbio_submit(vbio, abd, zio->io_size);
 	return (0);
 }
 
 /* ========== */
 
 /*
  * This is the classic, battle-tested BIO submission code. Until we're totally
  * sure that the new code is safe and correct in all cases, this will remain
  * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
  * load time.
  *
  * These functions have been renamed to vdev_classic_* to make it clear what
  * they belong to, but their implementations are unchanged.
  */
 
 /*
  * Virtual device vector for disks.
  */
 /*
  * Tracks one classic-path request: the parent zio plus the set of bios
  * issued for it.
  */
 typedef struct dio_request {
 	zio_t			*dr_zio;	/* Parent ZIO */
 	/* The request is finished and freed when this drops to zero. */
 	atomic_t		dr_ref;		/* References */
 	/* Only the first nonzero bio error is kept. */
 	int			dr_error;	/* Bio error */
 	/* Number of slots in dr_bio[]; unused slots are NULL. */
 	int			dr_bio_count;	/* Count of bio's */
 	struct bio		*dr_bio[];	/* Attached bio's */
 } dio_request_t;
 
 /*
  * Allocate a dio_request with bio_count empty bio slots, a reference
  * count of zero and no recorded error.
  */
 static dio_request_t *
 vdev_classic_dio_alloc(int bio_count)
 {
 	dio_request_t *req;
 	int slot;
 
 	/* Header plus one bio pointer per requested slot. */
 	req = kmem_zalloc(sizeof (dio_request_t) +
 	    sizeof (struct bio *) * bio_count, KM_SLEEP);
 
 	atomic_set(&req->dr_ref, 0);
 	req->dr_bio_count = bio_count;
 	req->dr_error = 0;
 
 	/* Explicitly mark every slot as unused. */
 	slot = 0;
 	while (slot < req->dr_bio_count) {
 		req->dr_bio[slot] = NULL;
 		slot++;
 	}
 
 	return (req);
 }
 
 /*
  * Release every bio still attached to the request, then free the
  * request itself.
  */
 static void
 vdev_classic_dio_free(dio_request_t *dr)
 {
 	/* Read the slot count up front; it also sizes the allocation. */
 	const int nslots = dr->dr_bio_count;
 
 	for (int slot = 0; slot < nslots; slot++) {
 		struct bio *b = dr->dr_bio[slot];
 
 		/* Unused slots are NULL and are simply skipped. */
 		if (b)
 			bio_put(b);
 	}
 
 	kmem_free(dr, sizeof (dio_request_t) +
 	    sizeof (struct bio *) * nslots);
 }
 
 /* Take one reference on the request; paired with vdev_classic_dio_put(). */
 static void
 vdev_classic_dio_get(dio_request_t *dr)
 {
 	atomic_inc(&dr->dr_ref);
 }
 
 /*
  * Drop one reference on the request. The caller that drops the last
  * reference frees the request and, if a zio is attached, completes it
  * with the recorded error.
  */
 static void
 vdev_classic_dio_put(dio_request_t *dr)
 {
 	/* Not the last reference: someone else will finish the request. */
 	if (atomic_dec_return(&dr->dr_ref) != 0)
 		return;
 
 	/*
 	 * Copy out what we need before the request is freed, so the zio is
 	 * completed exactly once and with the right error.
 	 */
 	zio_t *parent = dr->dr_zio;
 	int result = dr->dr_error;
 
 	vdev_classic_dio_free(dr);
 
 	if (!parent)
 		return;
 
 	parent->io_error = result;
 	ASSERT3S(parent->io_error, >=, 0);
 	if (parent->io_error)
 		vdev_disk_error(parent);
 
 	zio_delay_interrupt(parent);
 }
 
 /*
  * Per-bio completion callback for the classic path. Records the first
  * error seen on the shared dio_request and drops this bio's reference.
  */
 BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
 {
 	dio_request_t *dr = bio->bi_private;
 
 	/* Keep only the first error; later bios do not overwrite it. */
 	if (dr->dr_error == 0) {
 #ifdef HAVE_1ARG_BIO_END_IO_T
 		dr->dr_error = BIO_END_IO_ERROR(bio);
 #else
 		if (error)
 			dr->dr_error = -(error);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			dr->dr_error = EIO;
 #endif
 	}
 
 	/* Drop reference acquired by vdev_classic_physio */
 	vdev_classic_dio_put(dr);
 }
 
 /*
  * Return the number of segments to request for a bio covering bio_size
  * bytes of the zio's ABD starting at abd_offset, capped by the kernel's
  * per-bio limit.
  */
 static inline unsigned int
 vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 {
 	/* Pages spanned by the remaining part of the ABD. */
 	unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
 	    bio_size, abd_offset);
 
 #ifdef HAVE_BIO_MAX_SEGS
 	return (bio_max_segs(nr_segs));
 #else
 	return (MIN(nr_segs, BIO_MAX_PAGES));
 #endif
 }
 
 /*
  * Classic read/write submission: split the zio's ABD across as many bios
  * as needed, tracking them in a dio_request, and submit them all.
  *
  * Returns EIO if the request extends past the end of the device, ENOMEM
  * if a bio cannot be allocated, and 0 once the bios have been submitted
  * (the local error variable is never changed from 0). Completion is
  * reported through vdev_classic_physio_completion().
  */
 static int
 vdev_classic_physio(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 	size_t io_size = zio->io_size;
 	uint64_t io_offset = zio->io_offset;
 	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
 	int flags = 0;
 
 	dio_request_t *dr;
 	uint64_t abd_offset;
 	uint64_t bio_offset;
 	int bio_size;
 	int bio_count = 16;
 	int error = 0;
 	struct blk_plug plug;
 	unsigned short nr_vecs;
 
 	/*
 	 * Accessing outside the block device is never allowed.
 	 */
 	if (io_offset + io_size > bdev->bd_inode->i_size) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
 		    (u_longlong_t)io_offset,
 		    (u_longlong_t)io_size,
 		    (u_longlong_t)i_size_read(bdev->bd_inode));
 		return (SET_ERROR(EIO));
 	}
 
 	/* Re-entered with a doubled bio_count when the slots run out. */
 retry:
 	dr = vdev_classic_dio_alloc(bio_count);
 
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 	    zio->io_vd->vdev_failfast == B_TRUE) {
 		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
 		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
 	}
 
 	dr->dr_zio = zio;
 
 	/*
 	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
 	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
 	 * can cover at least 128KB and at most 1MB.  When the required number
 	 * of iovec's exceeds this, we are forced to break the IO in multiple
 	 * bio's and wait for them all to complete.  This is likely if the
 	 * recordsize property is increased beyond 1MB.  The default
 	 * bio_count=16 should typically accommodate the maximum-size zio of
 	 * 16MB.
 	 */
 
 	abd_offset = 0;
 	bio_offset = io_offset;
 	bio_size = io_size;
 	/*
 	 * The loop deliberately runs one index past the last slot so that
 	 * the "out of slots" case below can be detected.
 	 */
 	for (int i = 0; i <= dr->dr_bio_count; i++) {
 
 		/* Finished constructing bio's for given buffer */
 		if (bio_size <= 0)
 			break;
 
 		/*
 		 * If additional bio's are required, we have to retry, but
 		 * this should be rare - see the comment above.
 		 */
 		if (dr->dr_bio_count == i) {
 			vdev_classic_dio_free(dr);
 			bio_count *= 2;
 			goto retry;
 		}
 
 		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
 		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
 		if (unlikely(dr->dr_bio[i] == NULL)) {
 			/* Frees the request and any bios built so far. */
 			vdev_classic_dio_free(dr);
 			return (SET_ERROR(ENOMEM));
 		}
 
 		/* Matching put called by vdev_classic_physio_completion */
 		vdev_classic_dio_get(dr);
 
 		/* Byte offset converted to 512-byte sectors. */
 		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
 		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
 		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
 
 		/* Remaining size is returned to become the new size */
 		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
 		    bio_size, abd_offset);
 
 		/* Advance in buffer and construct another bio if needed */
 		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
 		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
 	}
 
 	/* Extra reference to protect dio_request during vdev_submit_bio */
 	vdev_classic_dio_get(dr);
 
 	/* Plugging is only used when the request has more than one slot. */
 	if (dr->dr_bio_count > 1)
 		blk_start_plug(&plug);
 
 	/* Submit all bio's associated with this dio */
 	for (int i = 0; i < dr->dr_bio_count; i++) {
 		if (dr->dr_bio[i])
 			vdev_submit_bio(dr->dr_bio[i]);
 	}
 
 	if (dr->dr_bio_count > 1)
 		blk_finish_plug(&plug);
 
 	/* Drop the protective reference taken before submission. */
 	vdev_classic_dio_put(dr);
 
 	return (error);
 }
 
 /* ========== */
 
 /*
  * Completion callback for a flush bio: stores the result in the zio,
  * releases the bio and hands the zio back via zio_interrupt().
  */
 BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
 {
 	zio_t *zio = bio->bi_private;
 #ifdef HAVE_1ARG_BIO_END_IO_T
 	zio->io_error = BIO_END_IO_ERROR(bio);
 #else
 	zio->io_error = -error;
 #endif
 
 	/*
 	 * EOPNOTSUPP marks the vdev as having no write cache;
 	 * vdev_disk_io_start() checks that flag before issuing a flush.
 	 */
 	if (zio->io_error && (zio->io_error == EOPNOTSUPP))
 		zio->io_vd->vdev_nowritecache = B_TRUE;
 
 	bio_put(bio);
 	ASSERT3S(zio->io_error, >=, 0);
 	if (zio->io_error)
 		vdev_disk_error(zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Issue a cache flush to bdev on behalf of zio.
  *
  * Returns ENXIO if the device has no request queue, ENOMEM if no bio
  * could be allocated, and 0 once the flush bio has been submitted; the
  * result then arrives through vdev_disk_io_flush_completion().
  */
 static int
 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 {
 	struct request_queue *queue = bdev_get_queue(bdev);
 
 	if (queue == NULL)
 		return (SET_ERROR(ENXIO));
 
 	/* A flush carries no data, so no segments are requested. */
 	struct bio *flush_bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
 
 	if (unlikely(flush_bio == NULL))
 		return (SET_ERROR(ENOMEM));
 
 	flush_bio->bi_end_io = vdev_disk_io_flush_completion;
 	flush_bio->bi_private = zio;
 	bio_set_flush(flush_bio);
 
 	vdev_submit_bio(flush_bio);
 	invalidate_bdev(bdev);
 
 	return (0);
 }
 
 /*
  * Completion callback for an asynchronous discard/secure-erase bio:
  * stores the result in the zio, releases the bio and hands the zio back
  * via zio_interrupt().
  */
 BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error)
 {
 	zio_t *zio = bio->bi_private;
 #ifdef HAVE_1ARG_BIO_END_IO_T
 	zio->io_error = BIO_END_IO_ERROR(bio);
 #else
 	zio->io_error = -error;
 #endif
 	bio_put(bio);
 	if (zio->io_error)
 		vdev_disk_error(zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Wrappers for the different secure erase and discard APIs. We use async
  * when available; in this case, *biop is set to the last bio in the chain.
  */
 /*
  * Issue a secure erase of nsect sectors starting at sector, using
  * whichever kernel API was detected at build time. *biop is reset to
  * NULL first; only the __blkdev_issue_discard() variant receives biop.
  * Returns the kernel call's result unchanged.
  */
 static int
 vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
     sector_t nsect, struct bio **biop)
 {
 	*biop = NULL;
 	int error;
 
 #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
 	error = blkdev_issue_secure_erase(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
 	error = __blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
 	error = blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
 #else
 #error "unsupported kernel"
 #endif
 
 	return (error);
 }
 
 /*
  * Issue a discard of nsect sectors starting at sector, using whichever
  * kernel API was detected at build time. *biop is reset to NULL first;
  * only the __blkdev_issue_discard() variants receive biop. Returns the
  * kernel call's result unchanged.
  */
 static int
 vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
     sector_t nsect, struct bio **biop)
 {
 	*biop = NULL;
 	int error;
 
 #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
 	error = __blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, 0, biop);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
 	error = __blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, biop);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
 	error = blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, 0);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
 	error = blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS);
 #else
 #error "unsupported kernel"
 #endif
 
 	return (error);
 }
 
 /*
  * Entry point for TRIM ops. This calls the right wrapper for secure erase or
  * discard, and then does the appropriate finishing work for error vs success
  * and async vs sync.
  */
 static int
 vdev_disk_io_trim(zio_t *zio)
 {
 	vdev_disk_t *vd = zio->io_vd->vdev_tsd;
 	zfs_bdev_handle_t *bdh = vd->vd_bdh;
 	/* Byte offset and length converted to 512-byte sectors. */
 	const sector_t first = zio->io_offset >> 9;
 	const sector_t count = zio->io_size >> 9;
 	struct bio *bio;
 	int rc;
 
 	/* Pick the wrapper matching the requested kind of trim. */
 	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
 		rc = vdev_bdev_issue_secure_erase(bdh, first, count, &bio);
 	else
 		rc = vdev_bdev_issue_discard(bdh, first, count, &bio);
 
 	/* The wrappers return negative errnos; flip the sign for ZFS. */
 	if (rc != 0)
 		return (SET_ERROR(-rc));
 
 	if (bio != NULL) {
 		/*
 		 * Asynchronous path: the wrapper handed back a bio, so
 		 * attach the completion callback and submit it.
 		 */
 		bio->bi_private = zio;
 		bio->bi_end_io = vdev_disk_discard_end_io;
 		vdev_submit_bio(bio);
 		return (0);
 	}
 
 	/*
 	 * Synchronous path: the op already completed successfully, so hand
 	 * the zio straight back to ZFS.
 	 */
 	zio_interrupt(zio);
 	return (0);
 }
 
 int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
 
 /*
  * Start an I/O on a disk vdev. Dispatches on zio->io_type to the flush,
  * trim or read/write handlers while holding vd_lock as reader. When an
  * operation cannot be issued, the error is stored in zio->io_error and
  * the zio is handed back with zio_interrupt() or zio_execute(); otherwise
  * completion is reported later by the handler's callback.
  */
 static void
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	int error;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
 	 * Nothing to be done here but return failure.
 	 */
 	if (vd == NULL) {
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	rw_enter(&vd->vd_lock, RW_READER);
 
 	/*
 	 * If the vdev is closed, it's likely due to a failed reopen and is
 	 * in the UNAVAIL state.  Nothing to be done here but return failure.
 	 */
 	if (vd->vd_bdh == NULL) {
 		rw_exit(&vd->vd_lock);
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_IOCTL:
 
 		if (!vdev_readable(v)) {
-			rw_exit(&vd->vd_lock);
-			zio->io_error = SET_ERROR(ENXIO);
-			zio_interrupt(zio);
-			return;
-		}
-
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			if (v->vdev_nowritecache) {
-				zio->io_error = SET_ERROR(ENOTSUP);
-				break;
-			}
-
+			/* Drive not there, can't flush */
+			error = SET_ERROR(ENXIO);
+		} else if (zfs_nocacheflush) {
+			/* Flushing disabled by operator, declare success */
+			error = 0;
+		} else if (v->vdev_nowritecache) {
+			/* This vdev not capable of flushing */
+			error = SET_ERROR(ENOTSUP);
+		} else {
+			/*
+			 * Issue the flush. If successful, the response will
+			 * be handled in the completion callback, so we're done.
+			 */
 			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
 			if (error == 0) {
 				rw_exit(&vd->vd_lock);
 				return;
 			}
-
-			zio->io_error = error;
-
-			break;
-
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
+		/* Couldn't issue the flush, so set the error and return it */
 		rw_exit(&vd->vd_lock);
+		zio->io_error = error;
 		zio_execute(zio);
 		return;
 
 	case ZIO_TYPE_TRIM:
 		/* On success the trim handler completes the zio itself. */
 		error = vdev_disk_io_trim(zio);
 		rw_exit(&vd->vd_lock);
 		if (error) {
 			zio->io_error = error;
 			zio_execute(zio);
 		}
 		return;
 
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
 		zio->io_target_timestamp = zio_handle_io_delay(zio);
 		/* Submission method is selected via vdev_disk_io_rw_fn. */
 		error = vdev_disk_io_rw_fn(zio);
 		rw_exit(&vd->vd_lock);
 		if (error) {
 			zio->io_error = error;
 			zio_interrupt(zio);
 		}
 		return;
 
 	default:
 		/*
 		 * Getting here means our parent vdev has made a very strange
 		 * request of us, and shouldn't happen. Assert here to force a
 		 * crash in dev builds, but in production return the IO
 		 * unhandled. The pool will likely suspend anyway but that's
 		 * nicer than crashing the kernel.
 		 */
 		ASSERT3S(zio->io_type, ==, -1);
 
 		rw_exit(&vd->vd_lock);
 		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_interrupt(zio);
 		return;
 	}
 
 	/* Every switch arm returns, so control never reaches this point. */
 	__builtin_unreachable();
 }
 
 /*
  * Post-completion hook. Only an EIO result is acted upon: the disk status
  * is checked and, if that check fails, the block device is invalidated
  * and an asynchronous removal of the vdev is requested.
  */
 static void
 vdev_disk_io_done(zio_t *zio)
 {
 	/* Any result other than EIO needs no follow-up here. */
 	if (zio->io_error != EIO)
 		return;
 
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 
 	/* The disk still checks out, so leave the configuration alone. */
 	if (zfs_check_disk_status(BDH_BDEV(vd->vd_bdh)))
 		return;
 
 	invalidate_bdev(BDH_BDEV(vd->vd_bdh));
 	v->vdev_remove_wanted = B_TRUE;
 	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 }
 
 /*
  * Hold hook for disk vdevs. Apart from asserting that SCL_STATE is held
  * as writer, this currently performs no work: both checks below simply
  * return and nothing follows them.
  */
 static void
 vdev_disk_hold(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 
 	/* We must have a pathname, and it must be absolute. */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
 		return;
 
 	/*
 	 * Only prefetch path and devid info if the device has
 	 * never been opened.
 	 */
 	if (vd->vdev_tsd != NULL)
 		return;
 
 }
 
 /*
  * Release hook for disk vdevs. Currently only asserts that SCL_STATE is
  * held as writer; the release itself is not implemented.
  */
 static void
 vdev_disk_rele(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 
 	/* XXX: Implement me as a vnode rele for the device */
 }
 
 /*
  * BIO submission method. See comment above about vdev_classic.
  * Set zfs_vdev_disk_classic=0 for new, =1 for classic
  */
 static uint_t zfs_vdev_disk_classic = 0;	/* default new */
 
 /* Set submission function from module parameter */
 static int
 vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
 {
 	/* Let the generic handler parse and store the new value first. */
 	int rc = param_set_uint(buf, kp);
 
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	/* Point the read/write entry at the implementation now selected. */
 	if (zfs_vdev_disk_classic)
 		vdev_disk_io_rw_fn = vdev_classic_physio;
 	else
 		vdev_disk_io_rw_fn = vdev_disk_io_rw;
 
 	printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
 	    zfs_vdev_disk_classic ? "classic" : "new");
 
 	return (0);
 }
 
 /*
  * At first vdev use, set the submission function from the default value if
  * it hasn't been set already.
  */
 static int
 vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	/* None of the arguments are needed here. */
 	(void) spa;
 	(void) nv;
 	(void) tsd;
 
 	/* A submission function has already been chosen; keep it. */
 	if (vdev_disk_io_rw_fn != NULL)
 		return (0);
 
 	/* Otherwise choose according to zfs_vdev_disk_classic. */
 	vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
 	    vdev_classic_physio : vdev_disk_io_rw;
 
 	return (0);
 }
 
 /*
  * Operations vector for disk vdevs. Entries set to NULL are operations
  * for which this vdev type supplies no handler.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = vdev_disk_init,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_disk_open,
 	.vdev_op_close = vdev_disk_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_disk_io_start,
 	.vdev_op_io_done = vdev_disk_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_disk_hold,
 	.vdev_op_rele = vdev_disk_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
 	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
 };
 
 /*
  * The zfs_vdev_scheduler module option has been deprecated. Setting this
  * value no longer has any effect.  It has not yet been entirely removed
  * to allow the module to be loaded if this option is specified in the
  * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
  */
 static int
 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
 {
 	int rc = param_set_charp(val, kp);
 
 	/* On failure, return the error without logging the notice. */
 	if (rc != 0)
 		return (rc);
 
 	/* The value was stored, but tell the operator it has no effect. */
 	printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
 	    "is not supported.\n");
 
 	return (rc);
 }
 
 static const char *zfs_vdev_scheduler = "unused";
 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
     param_get_charp, &zfs_vdev_scheduler, 0644);
 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
 
 /*
  * Module parameter setter for the minimum auto ashift. The value must
  * parse as an unsigned integer and lie between ASHIFT_MIN and the current
  * zfs_vdev_max_auto_ashift (inclusive) before it is stored.
  */
 int
 param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 {
 	uint_t ashift;
 	int rc;
 
 	/* Parse first so the range can be validated before storing. */
 	rc = kstrtouint(buf, 0, &ashift);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (!(ashift >= ASHIFT_MIN && ashift <= zfs_vdev_max_auto_ashift))
 		return (SET_ERROR(-EINVAL));
 
 	/* In range: let the generic handler store it. */
 	rc = param_set_uint(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 /*
  * Module parameter setter for the maximum auto ashift. The value must
  * parse as an unsigned integer and lie between the current
  * zfs_vdev_min_auto_ashift and ASHIFT_MAX (inclusive) before it is stored.
  */
 int
 param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 {
 	uint_t ashift;
 	int rc;
 
 	/* Parse first so the range can be validated before storing. */
 	rc = kstrtouint(buf, 0, &ashift);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (!(ashift <= ASHIFT_MAX && ashift >= zfs_vdev_min_auto_ashift))
 		return (SET_ERROR(-EINVAL));
 
 	/* In range: let the generic handler store it. */
 	rc = param_set_uint(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
 	"Timeout before determining that a device is missing");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
 
 ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
 	"Maximum number of data segments to add to an IO request (min 4)");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
     vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
 	"Use classic BIO submission method");
diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
index 5abc0426d1a7..2b483c9a9fa4 100644
--- a/module/os/linux/zfs/vdev_file.c
+++ b/module/os/linux/zfs/vdev_file.c
@@ -1,382 +1,376 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
 #include <sys/fcntl.h>
 #include <sys/vnode.h>
 #include <sys/zfs_file.h>
 #ifdef _KERNEL
 #include <linux/falloc.h>
 #endif
 /*
  * Virtual device vector for files.
  */
 
 /*
  * Taskq that runs vdev_file_io_strategy() and vdev_file_io_fsync().
  * Created in vdev_file_init() and destroyed in vdev_file_fini().
  */
 static taskq_t *vdev_file_taskq;
 
 /*
  * By default, the logical/physical ashift for file vdevs is set to
  * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9)
  * blocksizes. Users may opt to change one or both of these for testing
  * or performance reasons. Care should be taken as these values will
  * impact the vdev_ashift setting which can only be set at vdev creation
  * time.
  */
 static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
 static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
 
 /*
  * vdev_op_hold callback for file vdevs.  Does nothing beyond asserting
  * that the vdev has a path.
  */
 static void
 vdev_file_hold(vdev_t *vd)
 {
 	ASSERT(vd->vdev_path != NULL);
 }
 
 /*
  * vdev_op_rele callback for file vdevs.  Does nothing beyond asserting
  * that the vdev has a path.
  */
 static void
 vdev_file_rele(vdev_t *vd)
 {
 	ASSERT(vd->vdev_path != NULL);
 }
 
 /*
  * Translate the pool's SPA_MODE_READ / SPA_MODE_WRITE bits into the
  * matching O_RDONLY / O_WRONLY / O_RDWR open flag.  O_LARGEFILE is always
  * OR'ed into the result; with neither bit set no access flag is added.
  */
 static mode_t
 vdev_file_open_mode(spa_mode_t spa_mode)
 {
 	int want_read = (spa_mode & SPA_MODE_READ) != 0;
 	int want_write = (spa_mode & SPA_MODE_WRITE) != 0;
 	mode_t access = 0;
 
 	if (want_read)
 		access = want_write ? O_RDWR : O_RDONLY;
 	else if (want_write)
 		access = O_WRONLY;
 
 	return (access | O_LARGEFILE);
 }
 
 /*
  * vdev_op_open callback for file vdevs.
  *
  * Opens the file named by vd->vdev_path (which must be absolute) and
  * reports its size through psize/max_psize and the configured file ashift
  * values through logical_ashift/physical_ashift.  When vd->vdev_tsd is
  * already set (reopen) the file is not opened again; only its attributes
  * are re-read.
  *
  * Returns 0 on success or an errno value.  Every failure path records a
  * reason in vd->vdev_stat.vs_aux.  On failure after allocation the
  * vdev_file_t stays attached to vd->vdev_tsd; vdev_file_close() copes
  * with a NULL vf_file and frees it.
  */
 static int
 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_file_t *vf;
 	zfs_file_t *fp;
 	zfs_file_attr_t zfa;
 	int error;
 
 	/*
 	 * Rotational optimizations only make sense on block devices.
 	 */
 	vd->vdev_nonrot = B_TRUE;
 
 	/*
 	 * Allow TRIM on file based vdevs.  This may not always be supported,
 	 * since it depends on your kernel version and underlying filesystem
 	 * type but it is always safe to attempt.
 	 */
 	vd->vdev_has_trim = B_TRUE;
 
 	/*
 	 * Disable secure TRIM on file based vdevs.  There is no way to
 	 * request this behavior from the underlying filesystem.
 	 */
 	vd->vdev_has_securetrim = B_FALSE;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it's not currently open.  Otherwise,
 	 * just update the physical size of the device.
 	 */
 	if (vd->vdev_tsd != NULL) {
 		ASSERT(vd->vdev_reopening);
 		vf = vd->vdev_tsd;
 		goto skip_open;
 	}
 
 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
 
 	/*
 	 * We always open the files from the root of the global zone, even if
 	 * we're in a local zone.  If the user has gotten to this point, the
 	 * administrator has already decided that the pool should be available
 	 * to local zone users, so the underlying devices should be as well.
 	 */
 	ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
 
 	error = zfs_file_open(vd->vdev_path,
 	    vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	vf->vf_file = fp;
 
 #ifdef _KERNEL
 	/*
 	 * Make sure it's a regular file.
 	 */
 	if (zfs_file_getattr(fp, &zfa)) {
 		/* Record a reason here too, like the other failure paths. */
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (SET_ERROR(ENODEV));
 	}
 	if (!S_ISREG(zfa.zfa_mode)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (SET_ERROR(ENODEV));
 	}
 #endif
 
 skip_open:
 
 	error = zfs_file_getattr(vf->vf_file, &zfa);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	*max_psize = *psize = zfa.zfa_size;
 	*logical_ashift = vdev_file_logical_ashift;
 	*physical_ashift = vdev_file_physical_ashift;
 
 	return (0);
 }
 
 /*
  * vdev_op_close callback for file vdevs.
  *
  * A no-op while the vdev is reopening or when no vdev_file_t is attached.
  * Otherwise closes the backing file (if one was opened), clears
  * vdev_delayed_close, frees the vdev_file_t and resets vd->vdev_tsd.
  */
 static void
 vdev_file_close(vdev_t *vd)
 {
 	vdev_file_t *file_state = vd->vdev_tsd;
 
 	if (file_state == NULL)
 		return;
 
 	/* Keep the open file across a reopen. */
 	if (vd->vdev_reopening)
 		return;
 
 	if (file_state->vf_file != NULL)
 		(void) zfs_file_close(file_state->vf_file);
 
 	vd->vdev_delayed_close = B_FALSE;
 	kmem_free(file_state, sizeof (vdev_file_t));
 	vd->vdev_tsd = NULL;
 }
 
 /*
  * Taskq callback that performs a zio against the backing file.
  *
  * ZIO_TYPE_READ zios are served with zfs_file_pread(); every other type
  * that reaches this function is written with zfs_file_pwrite().  The data
  * moves through a linear buffer borrowed from zio->io_abd.  A transfer
  * that succeeds but leaves a residual is reported as ENOSPC.  The zio is
  * then handed to zio_delay_interrupt().
  */
 static void
 vdev_file_io_strategy(void *arg)
 {
 	zio_t *zio = arg;
 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
 	loff_t off = zio->io_offset;
 	ssize_t size = zio->io_size;
 	ssize_t resid = 0;
 	void *buf;
 	int err;
 
 	if (zio->io_type != ZIO_TYPE_READ) {
 		/* Write path: the buffer starts out as a copy of the abd. */
 		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
 		abd_return_buf(zio->io_abd, buf, size);
 	} else {
 		/* Read path: the data read is copied back into the abd. */
 		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
 		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
 		abd_return_buf_copy(zio->io_abd, buf, size);
 	}
 
 	if (err == 0 && resid != 0)
 		err = SET_ERROR(ENOSPC);
 	zio->io_error = err;
 
 	zio_delay_interrupt(zio);
 }
 
 static void
 vdev_file_io_fsync(void *arg)
 {
 	zio_t *zio = (zio_t *)arg;
 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
 
 	zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
 
 	zio_interrupt(zio);
 }
 
 /*
  * vdev_op_io_start callback for file vdevs.
  *
  * ZIO_TYPE_IOCTL: fails with ENXIO when the vdev is not readable.  A
  * flush is skipped when zfs_nocacheflush is set; otherwise it is served
  * by zfs_file_fsync(), either inline or, when __spl_pf_fstrans_check()
  * is true, from vdev_file_taskq via vdev_file_io_fsync().
  *
  * ZIO_TYPE_TRIM: served inline with zfs_file_fallocate().
  *
  * Everything else is dispatched to vdev_file_taskq and carried out by
  * vdev_file_io_strategy().
  */
 static void
 vdev_file_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			/*
-			 * We cannot safely call vfs_fsync() when PF_FSTRANS
-			 * is set in the current context.  Filesystems like
-			 * XFS include sanity checks to verify it is not
-			 * already set, see xfs_vm_writepage().  Therefore
-			 * the sync must be dispatched to a different context.
-			 */
-			if (__spl_pf_fstrans_check()) {
-				VERIFY3U(taskq_dispatch(vdev_file_taskq,
-				    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
-				    TASKQID_INVALID);
-				return;
-			}
-
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC | O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
+		if (zfs_nocacheflush) {
+			zio_execute(zio);
+			return;
 		}
 
+		/*
+		 * We cannot safely call vfs_fsync() when PF_FSTRANS
+		 * is set in the current context.  Filesystems like
+		 * XFS include sanity checks to verify it is not
+		 * already set, see xfs_vm_writepage().  Therefore
+		 * the sync must be dispatched to a different context.
+		 */
+		if (__spl_pf_fstrans_check()) {
+			VERIFY3U(taskq_dispatch(vdev_file_taskq,
+			    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+			    TASKQID_INVALID);
+			return;
+		}
+
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
 		zio_execute(zio);
 		return;
 	} else if (zio->io_type == ZIO_TYPE_TRIM) {
 		int mode = 0;
 
 		ASSERT3U(zio->io_size, !=, 0);
 #ifdef __linux__
 		mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
 #endif
 		zio->io_error = zfs_file_fallocate(vf->vf_file,
 		    mode, zio->io_offset, zio->io_size);
 		zio_execute(zio);
 		return;
 	}
 
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
 	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
 	    TQ_SLEEP), !=, TASKQID_INVALID);
 }
 
 /*
  * vdev_op_io_done callback for file vdevs.  Intentionally empty; the
  * cast only marks the parameter as unused.
  */
 static void
 vdev_file_io_done(zio_t *zio)
 {
 	(void) zio;
 }
 
 /*
  * Operations vector for VDEV_TYPE_FILE leaf vdevs.  Open, close, I/O and
  * hold/rele use the vdev_file_* functions in this file; asize,
  * min_asize and xlate use the vdev_default_* implementations; the
  * remaining callbacks are left NULL.
  */
 vdev_ops_t vdev_file_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 /*
  * Create the "z_vdev_file" taskq used for file vdev I/O.  The taskq is
  * created with TASKQ_DYNAMIC and MAX(boot_ncpus, 16) as its thread-count
  * argument; creation failure trips the VERIFY.
  */
 void
 vdev_file_init(void)
 {
 	vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
 	    minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
 
 	VERIFY(vdev_file_taskq);
 }
 
 /*
  * Destroy the taskq created by vdev_file_init().
  */
 void
 vdev_file_fini(void)
 {
 	taskq_destroy(vdev_file_taskq);
 }
 
 /*
  * From userland we access disks just like files.
  */
 #ifndef _KERNEL
 
 /*
  * Userland-only (!_KERNEL) operations vector for VDEV_TYPE_DISK.  It
  * uses the same callbacks as vdev_file_ops; only vdev_op_type differs.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 #endif
 
 /*
  * Expose vdev_file_logical_ashift and vdev_file_physical_ashift as
  * ZMOD_RW module parameters.  vdev_file_open() reports their current
  * values each time it runs.
  */
 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW,
 	"Logical ashift for file-based devices");
 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW,
 	"Physical ashift for file-based devices");
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index ec961255fd64..7769ed6a377a 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1,2825 +1,2821 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2018 Intel Corporation.
  * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/abd.h>
 #include <sys/zio.h>
 #include <sys/nvpair.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <zfs_fletcher.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_draid_io_verify() */
 #endif
 
 /*
  * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is
  * comprised of multiple raidz redundancy groups which are spread over the
  * dRAID children. To ensure an even distribution, and avoid hot spots, a
  * permutation mapping is applied to the order of the dRAID children.
  * This mixing effectively distributes the parity columns evenly over all
  * of the disks in the dRAID.
  *
  * This is beneficial because it means when resilvering all of the disks
  * can participate thereby increasing the available IOPs and bandwidth.
  * Furthermore, by reserving a small fraction of each child's total capacity
  * virtual distributed spare disks can be created. These spares similarly
  * benefit from the performance gains of spanning all of the children. The
  * consequence of which is that resilvering to a distributed spare can
  * substantially reduce the time required to restore full parity to a pool
  * with failed disks.
  *
  * === dRAID group layout ===
  *
  * First, let's define a "row" in the configuration to be a 16M chunk from
  * each physical drive at the same offset. This is the minimum allowable
  * size since it must be possible to store a full 16M block when there is
  * only a single data column. Next, we define a "group" to be a set of
  * sequential disks containing both the parity and data columns. We allow
  * groups to span multiple rows in order to align any group size to any
  * number of physical drives. Finally, a "slice" is comprised of the rows
  * which contain the target number of groups. The permutation mappings
  * are applied in a round robin fashion to each slice.
  *
  * Given D+P drives in a group (including parity drives) and C-S physical
  * drives (not including the spare drives), we can distribute the groups
  * across R rows without remainder by selecting the least common multiple
  * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S).
  *
  * In the example below, there are C=14 physical drives in the configuration
  * with S=2 drives worth of spare capacity. Each group has a width of 9
  * which includes D=8 data and P=1 parity drive. There are 4 groups and
  * 3 rows per slice.  Each group has a size of 144M (16M * 9) and a slice
  * size is 576M (144M * 4). When allocating from a dRAID each group is
  * filled before moving on to the next as shown in slice0 below.
  *
  *             data disks (8 data + 1 parity)          spares (2)
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  ^  | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0
  *  |  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  |  |              group 0              |  group 1..|       |
  *  |  +-----------------------------------+-----------+-------|
  *  |  | 0   1   2   3   4   5   6   7   8 | 36  37  38|       |  r
  *  |  | 9   10  11  12  13  14  15  16  17| 45  46  47|       |  o
  *  |  | 18  19  20  21  22  23  24  25  26| 54  55  56|       |  w
  *     | 27  28  29  30  31  32  33  34  35| 63  64  65|       |  0
  *  s  +-----------------------+-----------------------+-------+
  *  l  |       ..group 1       |        group 2..      |       |
  *  i  +-----------------------+-----------------------+-------+
  *  c  | 39  40  41  42  43  44| 72  73  74  75  76  77|       |  r
  *  e  | 48  49  50  51  52  53| 81  82  83  84  85  86|       |  o
  *  0  | 57  58  59  60  61  62| 90  91  92  93  94  95|       |  w
  *     | 66  67  68  69  70  71| 99 100 101 102 103 104|       |  1
  *  |  +-----------+-----------+-----------------------+-------+
  *  |  |..group 2  |            group 3                |       |
  *  |  +-----------+-----------+-----------------------+-------+
  *  |  | 78  79  80|108 109 110 111 112 113 114 115 116|       |  r
  *  |  | 87  88  89|117 118 119 120 121 122 123 124 125|       |  o
  *  |  | 96  97  98|126 127 128 129 130 131 132 133 134|       |  w
  *  v  |105 106 107|135 136 137 138 139 140 141 142 143|       |  2
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *     | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1
  *  s  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  l  |              group 4              |  group 5..|       | row 3
  *  i  +-----------------------+-----------+-----------+-------|
  *  c  |       ..group 5       |        group 6..      |       | row 4
  *  e  +-----------+-----------+-----------------------+-------+
  *  1  |..group 6  |            group 7                |       | row 5
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *     | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2
  *  s  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  l  |              group 8              |  group 9..|       | row 6
  *  i  +-----------------------------------------------+-------|
  *  c  |       ..group 9       |        group 10..     |       | row 7
  *  e  +-----------------------+-----------------------+-------+
  *  2  |..group 10 |            group 11               |       | row 8
  *     +-----------+-----------------------------------+-------+
  *
  * This layout has several advantages over requiring that each row contain
  * a whole number of groups.
  *
  * 1. The group count is not a relevant parameter when defining a dRAID
  *    layout. Only the group width is needed, and *all* groups will have
  *    the desired size.
  *
  * 2. All possible group widths (<= physical disk count) can be supported.
  *
  * 3. The logic within vdev_draid.c is simplified when the group width is
  *    the same for all groups (although some of the logic around computing
  *    permutation numbers and drive offsets is more complicated).
  *
  * N.B. The following array describes all valid dRAID permutation maps.
  * Each row is used to generate a permutation map for a different number
  * of children from a unique seed. The seeds were generated and carefully
  * evaluated by the 'draid' utility in order to provide balanced mappings.
  * In addition to the seed a checksum of the in-memory mapping is stored
  * for verification.
  *
  * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed,
  * with a given permutation map) is the ratio of the amounts of I/O that will
  * be sent to the least and most busy disks when resilvering. The average
  * imbalance ratio (of a given number of disks and permutation map) is the
  * average of the ratios of all possible single and double disk failures.
  *
  * In order to achieve a low imbalance ratio the number of permutations in
  * the mapping must be significantly larger than the number of children.
  * For dRAID the number of permutations has been limited to 512 to minimize
  * the map size. This does result in a gradually increasing imbalance ratio
  * as seen in the table below. Increasing the number of permutations for
  * larger child counts would reduce the imbalance ratio. However, in practice
  * when there are a large number of children each child is responsible for
  * fewer total IOs so it's less of a concern.
  *
  * Note these values are hard coded and must never be changed.  Existing
  * pools depend on the same mapping always being generated in order to
  * read and write from the correct locations.  Any change would make
  * existing pools completely inaccessible.
  */
 static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = {
 	{   2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d },	/* 1.000 */
 	{   3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 },	/* 1.000 */
 	{   4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 },	/* 1.000 */
 	{   5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 },	/* 1.010 */
 	{   6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 },	/* 1.031 */
 	{   7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee },	/* 1.043 */
 	{   8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 },	/* 1.059 */
 	{   9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 },	/* 1.056 */
 	{  10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 },	/* 1.072 */
 	{  11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c },	/* 1.083 */
 	{  12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e },	/* 1.097 */
 	{  13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 },	/* 1.100 */
 	{  14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 },	/* 1.121 */
 	{  15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 },	/* 1.103 */
 	{  16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 },	/* 1.111 */
 	{  17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe },	/* 1.133 */
 	{  18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 },	/* 1.131 */
 	{  19, 256, 0x892e343f2f31d690, 0x00000029eb392835 },	/* 1.130 */
 	{  20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c },	/* 1.141 */
 	{  21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 },	/* 1.139 */
 	{  22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 },	/* 1.150 */
 	{  23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f },	/* 1.174 */
 	{  24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 },	/* 1.168 */
 	{  25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 },	/* 1.180 */
 	{  26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba },	/* 1.226 */
 	{  27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 },	/* 1.228 */
 	{  28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c },	/* 1.217 */
 	{  29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c },	/* 1.239 */
 	{  30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 },	/* 1.238 */
 	{  31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f },	/* 1.273 */
 	{  32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 },	/* 1.191 */
 	{  33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 },	/* 1.199 */
 	{  34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 },	/* 1.195 */
 	{  35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 },	/* 1.201 */
 	{  36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef },	/* 1.194 */
 	{  37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 },	/* 1.237 */
 	{  38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 },	/* 1.242 */
 	{  39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd },	/* 1.231 */
 	{  40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 },	/* 1.233 */
 	{  41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 },	/* 1.271 */
 	{  42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 },	/* 1.263 */
 	{  43, 512, 0xbaa5125faa781854, 0x000001c76789e278 },	/* 1.270 */
 	{  44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb },	/* 1.281 */
 	{  45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 },	/* 1.282 */
 	{  46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b },	/* 1.286 */
 	{  47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 },	/* 1.329 */
 	{  48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b },	/* 1.286 */
 	{  49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 },	/* 1.322 */
 	{  50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 },	/* 1.335 */
 	{  51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 },	/* 1.305 */
 	{  52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf },	/* 1.330 */
 	{  53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 },	/* 1.365 */
 	{  54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 },	/* 1.334 */
 	{  55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 },	/* 1.364 */
 	{  56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e },	/* 1.374 */
 	{  57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 },	/* 1.363 */
 	{  58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 },	/* 1.401 */
 	{  59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c },	/* 1.392 */
 	{  60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 },	/* 1.360 */
 	{  61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd },	/* 1.396 */
 	{  62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c },	/* 1.453 */
 	{  63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 },	/* 1.437 */
 	{  64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 },	/* 1.402 */
 	{  65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 },	/* 1.459 */
 	{  66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 },	/* 1.423 */
 	{  67, 512, 0x910b9714f698a877, 0x00000451ea65d5db },	/* 1.447 */
 	{  68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 },	/* 1.450 */
 	{  69, 512, 0x836d4968fbaa3706, 0x000004954068a380 },	/* 1.455 */
 	{  70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d },	/* 1.463 */
 	{  71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 },	/* 1.463 */
 	{  72, 512, 0x42763a680d5bed8e, 0x000005084275c680 },	/* 1.452 */
 	{  73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab },	/* 1.498 */
 	{  74, 512, 0x9fa08548b1621a44, 0x0000054708019247 },	/* 1.526 */
 	{  75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 },	/* 1.491 */
 	{  76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 },	/* 1.470 */
 	{  77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 },	/* 1.527 */
 	{  78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 },	/* 1.509 */
 	{  79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e },	/* 1.569 */
 	{  80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c },	/* 1.555 */
 	{  81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 },	/* 1.509 */
 	{  82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 },	/* 1.596 */
 	{  83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e },	/* 1.568 */
 	{  84, 512, 0xba02545069ddc6dc, 0x000006d19861364f },	/* 1.541 */
 	{  85, 512, 0x447c73192c35073e, 0x000006fce315ce35 },	/* 1.623 */
 	{  86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b },	/* 1.620 */
 	{  87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 },	/* 1.597 */
 	{  88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b },	/* 1.575 */
 	{  89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc },	/* 1.627 */
 	{  90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb },	/* 1.596 */
 	{  91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 },	/* 1.622 */
 	{  92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e },	/* 1.695 */
 	{  93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c },	/* 1.605 */
 	{  94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc },	/* 1.625 */
 	{  95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 },	/* 1.687 */
 	{  96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a },	/* 1.621 */
 	{  97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 },	/* 1.699 */
 	{  98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b },	/* 1.688 */
 	{  99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce },	/* 1.642 */
 	{ 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc },	/* 1.683 */
 	{ 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 },	/* 1.755 */
 	{ 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 },	/* 1.692 */
 	{ 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 },	/* 1.747 */
 	{ 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 },	/* 1.751 */
 	{ 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 },	/* 1.751 */
 	{ 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f },	/* 1.726 */
 	{ 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d },	/* 1.788 */
 	{ 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 },	/* 1.740 */
 	{ 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 },	/* 1.780 */
 	{ 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 },	/* 1.836 */
 	{ 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 },	/* 1.778 */
 	{ 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 },	/* 1.831 */
 	{ 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df },	/* 1.825 */
 	{ 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 },	/* 1.826 */
 	{ 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 },	/* 1.843 */
 	{ 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d },	/* 1.826 */
 	{ 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b },	/* 1.803 */
 	{ 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 },	/* 1.857 */
 	{ 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 },	/* 1.877 */
 	{ 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 },	/* 1.849 */
 	{ 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d },	/* 1.867 */
 	{ 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 },	/* 1.978 */
 	{ 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d },	/* 1.947 */
 	{ 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea },	/* 1.865 */
 	{ 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f },	/* 1.881 */
 	{ 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b },	/* 1.882 */
 	{ 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e },	/* 1.867 */
 	{ 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e },	/* 1.972 */
 	{ 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 },	/* 1.896 */
 	{ 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d },	/* 1.965 */
 	{ 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 },	/* 1.963 */
 	{ 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 },	/* 1.925 */
 	{ 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 },	/* 1.862 */
 	{ 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 },	/* 2.042 */
 	{ 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 },	/* 1.935 */
 	{ 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 },	/* 2.005 */
 	{ 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c },	/* 2.041 */
 	{ 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 },	/* 1.997 */
 	{ 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 },	/* 1.996 */
 	{ 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d },	/* 2.053 */
 	{ 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a },	/* 1.971 */
 	{ 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 },	/* 2.018 */
 	{ 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd },	/* 1.961 */
 	{ 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 },	/* 2.046 */
 	{ 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb },	/* 1.968 */
 	{ 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 },	/* 2.143 */
 	{ 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 },	/* 2.064 */
 	{ 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 },	/* 2.023 */
 	{ 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c },	/* 2.136 */
 	{ 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 },	/* 2.063 */
 	{ 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 },	/* 1.974 */
 	{ 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 },	/* 2.210 */
 	{ 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a },	/* 2.006 */
 	{ 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 },	/* 2.193 */
 	{ 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 },	/* 2.163 */
 	{ 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc },	/* 2.046 */
 	{ 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 },	/* 2.084 */
 	{ 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 },	/* 2.264 */
 	{ 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 },	/* 2.074 */
 	{ 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 },	/* 2.282 */
 	{ 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf },	/* 2.148 */
 	{ 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 },	/* 2.355 */
 	{ 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 },	/* 2.164 */
 	{ 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a },	/* 2.393 */
 	{ 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 },	/* 2.178 */
 	{ 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc },	/* 2.334 */
 	{ 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b },	/* 2.266 */
 	{ 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 },	/* 2.304 */
 	{ 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d },	/* 2.218 */
 	{ 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff },	/* 2.377 */
 	{ 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 },	/* 2.155 */
 	{ 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 },	/* 2.404 */
 	{ 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 },	/* 2.205 */
 	{ 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d },	/* 2.359 */
 	{ 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 },	/* 2.158 */
 	{ 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b },	/* 2.614 */
 	{ 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc },	/* 2.239 */
 	{ 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc },	/* 2.493 */
 	{ 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c },	/* 2.327 */
 	{ 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 },	/* 2.231 */
 	{ 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c },	/* 2.237 */
 	{ 182, 512, 0xe6035defea48f933, 0x00002038e3346658 },	/* 2.691 */
 	{ 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e },	/* 2.170 */
 	{ 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 },	/* 2.600 */
 	{ 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc },	/* 2.391 */
 	{ 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 },	/* 2.677 */
 	{ 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c },	/* 2.410 */
 	{ 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 },	/* 2.776 */
 	{ 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 },	/* 2.266 */
 	{ 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 },	/* 2.717 */
 	{ 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c },	/* 2.474 */
 	{ 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 },	/* 2.673 */
 	{ 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 },	/* 2.420 */
 	{ 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 },	/* 2.898 */
 	{ 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c },	/* 2.363 */
 	{ 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e },	/* 2.747 */
 	{ 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 },	/* 2.531 */
 	{ 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 },	/* 2.707 */
 	{ 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 },	/* 2.315 */
 	{ 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf },	/* 3.012 */
 	{ 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 },	/* 2.378 */
 	{ 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 },	/* 2.969 */
 	{ 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d },	/* 2.594 */
 	{ 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd },	/* 2.763 */
 	{ 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 },	/* 2.457 */
 	{ 206, 512, 0xc02fc96684715a16, 0x0000297515608601 },	/* 3.057 */
 	{ 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 },	/* 2.590 */
 	{ 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b },	/* 3.047 */
 	{ 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 },	/* 2.676 */
 	{ 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 },	/* 2.993 */
 	{ 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 },	/* 2.457 */
 	{ 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 },	/* 3.182 */
 	{ 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 },	/* 2.563 */
 	{ 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 },	/* 3.025 */
 	{ 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f },	/* 2.730 */
 	{ 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 },	/* 3.036 */
 	{ 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 },	/* 2.722 */
 	{ 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 },	/* 3.356 */
 	{ 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 },	/* 2.697 */
 	{ 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 },	/* 2.979 */
 	{ 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 },	/* 2.858 */
 	{ 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e },	/* 3.258 */
 	{ 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 },	/* 2.693 */
 	{ 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 },	/* 3.259 */
 	{ 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c },	/* 2.733 */
 	{ 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 },	/* 3.235 */
 	{ 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 },	/* 2.983 */
 	{ 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e },	/* 3.308 */
 	{ 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 },	/* 2.715 */
 	{ 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f },	/* 3.540 */
 	{ 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 },	/* 2.779 */
 	{ 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c },	/* 3.084 */
 	{ 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc },	/* 2.987 */
 	{ 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae },	/* 3.341 */
 	{ 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 },	/* 2.793 */
 	{ 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 },	/* 3.518 */
 	{ 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 },	/* 2.962 */
 	{ 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 },	/* 3.196 */
 	{ 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 },	/* 2.914 */
 	{ 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 },	/* 3.408 */
 	{ 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 },	/* 2.903 */
 	{ 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 },	/* 3.778 */
 	{ 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c },	/* 3.026 */
 	{ 244, 512, 0xc740263f0301efa8, 0x00003a147146512d },	/* 3.347 */
 	{ 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d },	/* 3.212 */
 	{ 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 },	/* 3.482 */
 	{ 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 },	/* 3.146 */
 	{ 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f },	/* 3.626 */
 	{ 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 },	/* 2.952 */
 	{ 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e },	/* 3.463 */
 	{ 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 },	/* 3.131 */
 	{ 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c },	/* 3.538 */
 	{ 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac },	/* 2.974 */
 	{ 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 },	/* 3.843 */
 	{ 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 },	/* 3.088 */
 };
 
 /*
  * Verify the map is valid. Each device index must appear exactly
  * once in every row, and the permutation array checksum must match.
  */
 static int
 verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms,
     uint64_t checksum)
 {
 	int countssz = sizeof (uint16_t) * children;
 	uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP);
 
 	for (int i = 0; i < nperms; i++) {
 		for (int j = 0; j < children; j++) {
 			uint8_t val = perms[(i * children) + j];
 
 			if (val >= children || counts[val] != i) {
 				kmem_free(counts, countssz);
 				return (EINVAL);
 			}
 
 			counts[val]++;
 		}
 	}
 
 	if (checksum != 0) {
 		int permssz = sizeof (uint8_t) * children * nperms;
 		zio_cksum_t cksum;
 
 		fletcher_4_native_varsize(perms, permssz, &cksum);
 
 		if (checksum != cksum.zc_word[0]) {
 			kmem_free(counts, countssz);
 			return (ECKSUM);
 		}
 	}
 
 	kmem_free(counts, countssz);
 
 	return (0);
 }
 
 /*
  * Generate the permutation array for the draid_map_t.  These maps control
  * the placement of all data in a dRAID.  Therefore it's critical that the
  * seed always generates the same mapping.  We provide our own pseudo-random
  * number generator for this purpose.
  */
 int
 vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
 {
 	VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN);
 	VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN);
 	VERIFY3U(map->dm_seed, !=, 0);
 	VERIFY3U(map->dm_nperms, !=, 0);
 	VERIFY3P(map->dm_perms, ==, NULL);
 
 #ifdef _KERNEL
 	/*
 	 * The kernel code always provides both a map_seed and checksum.
 	 * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide
 	 * a zero checksum when generating new candidate maps.
 	 */
 	VERIFY3U(map->dm_checksum, !=, 0);
 #endif
 	uint64_t children = map->dm_children;
 	uint64_t nperms = map->dm_nperms;
 	int rowsz = sizeof (uint8_t) * children;
 	int permssz = rowsz * nperms;
 	uint8_t *perms;
 
 	/* Allocate the permutation array */
 	perms = vmem_alloc(permssz, KM_SLEEP);
 
 	/* Setup an initial row with a known pattern */
 	uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP);
 	for (int i = 0; i < children; i++)
 		initial_row[i] = i;
 
 	uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed };
 	uint8_t *current_row, *previous_row = initial_row;
 
 	/*
 	 * Perform a Fisher-Yates shuffle of each row using the previous
 	 * row as the starting point.  An initial_row with known pattern
 	 * is used as the input for the first row.
 	 */
 	for (int i = 0; i < nperms; i++) {
 		current_row = &perms[i * children];
 		memcpy(current_row, previous_row, rowsz);
 
 		for (int j = children - 1; j > 0; j--) {
 			uint64_t k = vdev_draid_rand(draid_seed) % (j + 1);
 			uint8_t val = current_row[j];
 			current_row[j] = current_row[k];
 			current_row[k] = val;
 		}
 
 		previous_row = current_row;
 	}
 
 	kmem_free(initial_row, rowsz);
 
 	int error = verify_perms(perms, children, nperms, map->dm_checksum);
 	if (error) {
 		vmem_free(perms, permssz);
 		return (error);
 	}
 
 	*permsp = perms;
 
 	return (0);
 }
 
 /*
  * Find the fixed draid_map_t whose dm_children equals 'children'.
  * Returns 0 and points *mapp at the table entry when one exists;
  * otherwise returns ENOENT and leaves *mapp untouched.
  */
 int
 vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
 {
 	const draid_map_t *found = NULL;
 
 	for (int i = 0; i < VDEV_DRAID_MAX_MAPS && found == NULL; i++) {
 		if (draid_maps[i].dm_children == children)
 			found = &draid_maps[i];
 	}
 
 	if (found == NULL)
 		return (ENOENT);
 
 	*mapp = found;
 
 	return (0);
 }
 
 /*
  * Resolve a permutation index to a row of the permutation array and an
  * iteration id.  The index is first reduced modulo
  * (vdc_nperms * vdc_children); *base is set to the start of the selected
  * row inside vdc_perms and *iter to the position within that row's cycle.
  */
 static void
 vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex,
     uint8_t **base, uint64_t *iter)
 {
 	uint64_t width = vdc->vdc_children;
 	uint64_t cycle = vdc->vdc_nperms * width;
 	uint64_t pos = pindex % cycle;
 	uint64_t within = pos % width;
 
 	/* pos - within is pos rounded down to a multiple of the row width. */
 	*base = &vdc->vdc_perms[pos - within];
 	*iter = within;
 }
 
 /*
  * Translate position 'index' of the permutation row 'base' into a child
  * id: the stored value is advanced by 'iter' and wrapped modulo the
  * number of children.
  */
 static inline uint64_t
 vdev_draid_permute_id(vdev_draid_config_t *vdc,
     uint8_t *base, uint64_t iter, uint64_t index)
 {
 	uint64_t rotated = base[index] + iter;
 
 	return (rotated % vdc->vdc_children);
 }
 
 /*
  * Convert a psize to an asize by rounding it up to a whole number of
  * full-width groups, i.e. vdev_draid_psize_to_asize().  The txg argument
  * is ignored.
  *
  * psize must be non-zero: the computation uses (psize - 1), which wraps
  * for a zero psize.
  */
 static uint64_t
 vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	(void) txg;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_ashift;
 
 	/* Bytes of data carried by one stripe across the data columns. */
 	uint64_t stripe_data = vdc->vdc_ndata << ashift;
 	uint64_t nrows = 1 + (psize - 1) / stripe_data;
 	uint64_t asize = (nrows * vdc->vdc_groupwidth) << ashift;
 
 	ASSERT3U(asize, !=, 0);
 	ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0);
 
 	return (asize);
 }
 
 /*
  * Deflate an asize to the corresponding psize by removing the parity
  * share: each vdc_groupwidth units of asize contribute vdc_ndata units
  * of psize.  The asize must be a multiple of vdc_groupwidth.
  */
 uint64_t
 vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT0(asize % vdc->vdc_groupwidth);
 
 	uint64_t per_column = asize / vdc->vdc_groupwidth;
 
 	return (per_column * vdc->vdc_ndata);
 }
 
 /*
  * Return the number of the group containing the given logical offset,
  * which is the offset divided by the group size (vdc_groupsz).
  */
 static uint64_t
 vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t groupsz = vdc->vdc_groupsz;
 
 	return (offset / groupsz);
 }
 
 /*
  * Return the logical offset at which the given group begins, which is
  * the group number multiplied by the group size (vdc_groupsz).
  */
 static uint64_t
 vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t groupsz = vdc->vdc_groupsz;
 
 	return (group * groupsz);
 }
 
 /*
  * Full stripe writes.  When writing, all columns (D+P) are required.  Parity
  * is calculated over all the columns, including empty zero filled sectors,
  * and each is written to disk.  While only the data columns are needed for
  * a normal read, all of the columns are required for reconstruction when
  * performing a sequential resilver.
  *
  * For "big columns" it's sufficient to map the correct range of the zio ABD.
  * Partial columns require allocating a gang ABD in order to zero fill the
  * empty sectors.  When the column is empty a zero filled sector must be
  * mapped.  In all cases the data ABDs must be the same size as the parity
  * ABDs (e.g. rc->rc_size == parity_size).
  *
  * zio		write zio whose io_abd supplies the data
  * abd_offset	offset into zio->io_abd at which this row's data begins
  * rr		row whose data columns [rr_firstdatacol, rr_cols) are mapped;
  *		the parity column rr_col[0] must already have its ABD
  *
  * On return every data column has rc_abd set and rc_size raised to
  * parity_size.
  */
 static void
 vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	/* One sector, i.e. the size of a single skip (padding) sector. */
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	/* Column 0 is used as the reference size for every column. */
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* Running read position within zio->io_abd. */
 	uint64_t abd_off = abd_offset;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small write), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			rc->rc_abd = abd_get_zeros(skip_size);
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column" */
 			rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 			    zio->io_abd, abd_off, rc->rc_size);
 		} else {
 			/* short data column, add a skip sector */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			/*
 			 * Gang = [data from the zio ABD][one zero sector].
 			 * NOTE(review): B_TRUE presumably lets the gang free
 			 * the child ABDs -- confirm against abd_gang_add().
 			 */
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    zio->io_abd, abd_off, rc->rc_size), B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size),
 			    B_TRUE);
 		}
 
 		ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size);
 
 		/*
 		 * Advance by the column's original data length before
 		 * rc_size is overwritten; zero padding consumes nothing
 		 * from the zio ABD.
 		 */
 		abd_off += rc->rc_size;
 		rc->rc_size = parity_size;
 	}
 
 	/* A row that starts mid-ABD must run to the end of the zio. */
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 }
 
 /*
  * Scrub/resilver reads.  In order to store the contents of the skip sectors
  * an additional ABD is allocated.  The columns are handled in the same way
  * as a full stripe write except instead of using the zero ABD the newly
  * allocated skip ABD is used to back the skip sectors.  In all cases the
  * data ABD must be the same size as the parity ABDs.
  *
  * zio		read zio whose io_abd receives the data
  * abd_offset	offset into zio->io_abd at which this row's data begins
  * rr		row to map; rr_abd_empty must be NULL on entry and is
  *		allocated here when rr_nempty > 0
  *
  * On return every data column has rc_abd set and rc_size equal to the
  * size of that ABD.
  */
 static void
 vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	/* One sector, i.e. the size of a single skip sector. */
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* Running position within zio->io_abd. */
 	uint64_t abd_off = abd_offset;
 	/* Running position within rr->rr_abd_empty. */
 	uint64_t skip_off = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, ==, NULL);
 
 	/* One linear buffer holds all of this row's skip sectors. */
 	if (rr->rr_nempty > 0) {
 		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
 		    B_FALSE);
 	}
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small read), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
 			    skip_off, skip_size);
 			skip_off += skip_size;
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column" */
 			rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 			    zio->io_abd, abd_off, rc->rc_size);
 		} else {
 			/* short data column, add a skip sector */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			/*
 			 * Gang = [data in the zio ABD][next skip sector].
 			 * NOTE(review): B_TRUE presumably lets the gang free
 			 * the child ABDs -- confirm against abd_gang_add().
 			 */
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    zio->io_abd, abd_off, rc->rc_size), B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
 			skip_off += skip_size;
 		}
 
 		uint64_t abd_size = abd_get_size(rc->rc_abd);
 		ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
 
 		/*
 		 * Increase rc_size so the skip ABD is included in subsequent
 		 * parity calculations.  abd_off is advanced first, using
 		 * the column's original data length.
 		 */
 		abd_off += rc->rc_size;
 		rc->rc_size = abd_size;
 	}
 
 	/* A row that starts mid-ABD must run to the end of the zio. */
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 	/* Every skip sector in rr_abd_empty must have been handed out. */
 	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
 }
 
 /*
  * Normal reads.  This is the common case: only columns that actually hold
  * data are mapped onto the zio ABD, back to back starting at abd_offset.
  * Parity columns and empty skip sectors are not read unless checksum
  * verification fails, in which case vdev_raidz_read_all() calls
  * vdev_draid_map_alloc_empty() to expand the raid map so that
  * reconstruction can use the parity data and skip sectors.
  */
 static void
 vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	uint64_t cursor = abd_offset;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	uint64_t col = rr->rr_firstdatacol;
 	while (col < rr->rr_cols) {
 		raidz_col_t *rc = &rr->rr_col[col++];
 		uint64_t len = rc->rc_size;
 
 		/* Empty columns get no ABD on a normal read. */
 		if (len == 0)
 			continue;
 
 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, cursor, len);
 		cursor += len;
 	}
 
 	/* A row that starts mid-ABD must run to the end of the zio. */
 	IMPLY(abd_offset != 0, cursor == zio->io_size);
 }
 
 /*
  * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
  * difference is that an ABD is allocated to back skip sectors so they may
  * be read in to memory, verified, and repaired if needed.
  *
  * zio		read zio the row belongs to
  * rr		row previously mapped for a normal read; rr_abd_empty must be
  *		NULL on entry and is allocated here when rr_nempty > 0
  *
  * On return every data column has an rc_abd and rc_size == parity_size.
  */
 void
 vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
 {
 	/* One sector, i.e. the size of a single skip sector. */
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* Running position within rr->rr_abd_empty. */
 	uint64_t skip_off = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, ==, NULL);
 
 	/* One linear buffer holds all of this row's skip sectors. */
 	if (rr->rr_nempty > 0) {
 		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
 		    B_FALSE);
 	}
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small read), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			ASSERT3P(rc->rc_abd, ==, NULL);
 			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
 			    skip_off, skip_size);
 			skip_off += skip_size;
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column", nothing to add */
 			ASSERT3P(rc->rc_abd, !=, NULL);
 		} else {
 			/*
 			 * short data column, add a skip sector and clear
 			 * rc_tried to force the entire column to be re-read
 			 * thereby including the missing skip sector data
 			 * which is needed for reconstruction.
 			 */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			ASSERT3P(rc->rc_abd, !=, NULL);
 			ASSERT(!abd_is_gang(rc->rc_abd));
 			/*
 			 * Wrap the existing data ABD and the next skip
 			 * sector in a new gang ABD.
 			 * NOTE(review): B_TRUE presumably lets the gang free
 			 * the child ABDs -- confirm against abd_gang_add().
 			 */
 			abd_t *read_abd = rc->rc_abd;
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
 			skip_off += skip_size;
 			rc->rc_tried = 0;
 		}
 
 		/*
 		 * Increase rc_size so the empty ABD is included in subsequent
 		 * parity calculations.
 		 */
 		rc->rc_size = parity_size;
 	}
 
 	/* Every skip sector in rr_abd_empty must have been handed out. */
 	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
 }
 
 /*
  * Check that every empty (skip) sector is zero filled before it is used
  * in a parity calculation.  The block checksum covers only the data, so
  * damage in a skip sector cannot be detected by it; left unchecked it
  * would produce bad parity which would then be treated as authoritative
  * and written over the good parity on disk.
  *
  * The last sector of each column from rr_bigcols onward is compared with
  * zeros.  A column that fails is reported via vdev_raidz_checksum_error(),
  * has that sector zeroed in memory, and has rc_error set to ECKSUM.
  * Returns the number of columns that failed.
  */
 int
 vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* Offset of the trailing sector within a full-size column. */
 	uint64_t tail_off = parity_size - skip_size;
 	uint64_t checked = 0;
 	int nbad = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, !=, NULL);
 	ASSERT3U(rr->rr_bigcols, >, 0);
 
 	/* Reference sector of zeros to compare against. */
 	void *zeros = kmem_zalloc(skip_size, KM_SLEEP);
 
 	for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		ASSERT3P(rc->rc_abd, !=, NULL);
 		ASSERT3U(rc->rc_size, ==, parity_size);
 
 		checked += skip_size;
 
 		if (abd_cmp_buf_off(rc->rc_abd, zeros, tail_off,
 		    skip_size) == 0)
 			continue;
 
 		/* Non-zero skip sector: report it and repair it in memory. */
 		vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
 		abd_zero_off(rc->rc_abd, tail_off, skip_size);
 		rc->rc_error = SET_ERROR(ECKSUM);
 		nbad++;
 	}
 
 	ASSERT3U(checked, ==, abd_get_size(rr->rr_abd_empty));
 
 	kmem_free(zeros, skip_size);
 
 	return (nbad);
 }
 
 /*
  * Given a logical address within a dRAID configuration, return the physical
  * address on the first drive in the group that this address maps to
  * (at position 'start' in permutation number 'perm').
  *
  * vd			the dRAID vdev (its vdev_tsd is the draid config)
  * logical_offset	byte offset in the dRAID's logical address space
  * perm			out: permutation number, group / vdc_ngroups
  * start		out: index in the permutation row where the group
  *			begins, (group * groupwidth) % ndisks
  *
  * Returns a byte offset (sector count shifted left by ashift).
  */
 static uint64_t
 vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
     uint64_t *perm, uint64_t *start)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	/* b is the dRAID (parent) sector offset. */
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t b_offset = logical_offset >> ashift;
 
 	/*
 	 * The height of a row in units of the vdev's minimum sector size.
 	 * This is the amount of data written to each disk of each group
 	 * in a given permutation.
 	 */
 	uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
 
 	/*
 	 * We cycle through a disk permutation every groupsz * ngroups chunk
 	 * of address space. Note that ngroups * groupsz must be a multiple
 	 * of the number of data drives (ndisks) in order to guarantee
 	 * alignment. So, for example, if our row height is 16MB, our group
 	 * size is 10, and there are 13 data drives in the draid, then ngroups
 	 * will be 13, we will change permutation every 2.08GB and each
 	 * disk will have 160MB of data per chunk.
 	 */
 	uint64_t groupwidth = vdc->vdc_groupwidth;
 	uint64_t ngroups = vdc->vdc_ngroups;
 	uint64_t ndisks = vdc->vdc_ndisks;
 
 	/*
 	 * groupstart is where the group this IO will land in "starts" in
 	 * the permutation array.
 	 */
 	uint64_t group = logical_offset / vdc->vdc_groupsz;
 	uint64_t groupstart = (group * groupwidth) % ndisks;
 	/* Equivalent to asserting groupwidth <= ndisks. */
 	ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
 	*start = groupstart;
 
 	/* b_offset is the sector offset within a group chunk */
 	b_offset = b_offset % (rowheight_sectors * groupwidth);
 	/* The offset is expected to fall on a full-stripe boundary. */
 	ASSERT0(b_offset % groupwidth);
 
 	/*
 	 * Find the starting byte offset on each child vdev:
 	 * - within a permutation there are ngroups groups spread over the
 	 *   rows, where each row covers a slice portion of the disk
 	 * - each permutation has (groupwidth * ngroups) / ndisks rows
 	 * - so each permutation covers rows * slice portion of the disk
 	 * - so we need to find the row where this IO group target begins
 	 */
 	*perm = group / ngroups;
 	uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
 	    (((group % ngroups) * groupwidth) / ndisks);
 
 	/*
 	 * Sectors to the start of the row plus the stripe index within the
 	 * group chunk, converted back to bytes.
 	 */
 	return (((rowheight_sectors * row) +
 	    (b_offset / groupwidth)) << ashift);
 }
 
 /*
  * Allocate and populate one raidz_row_t for the portion of a dRAID I/O
  * which lies within the redundancy group containing io_offset.
  *
  * zio		the I/O being mapped
  * rrp		out: the newly allocated row
  * io_offset	logical offset at which this row begins
  * abd_offset	offset into zio->io_abd of this row's first data byte
  * abd_size	bytes of zio data not yet mapped, starting at abd_offset
  *
  * Returns the number of data bytes covered by the row.  This is less
  * than abd_size when the request would cross into the next group, in
  * which case the caller maps the remainder as a second row.
  */
 static uint64_t
 vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
     uint64_t abd_offset, uint64_t abd_size)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t io_size = abd_size;
 	uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
 	uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
 	/* Logical offset at which the following group begins. */
 	uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
 
 	/*
 	 * Limit the io_size to the space remaining in the group.  A second
 	 * row in the raidz_map_t is created for the remainder.
 	 */
 	if (io_offset + io_asize > start_offset) {
 		io_size = vdev_draid_asize_to_psize(vd,
 		    start_offset - io_offset);
 	}
 
 	/*
 	 * At most a block may span the logical end of one group and the start
 	 * of the next group. Therefore, at the end of a group the io_size must
 	 * span the group width evenly and the remainder must be aligned to the
 	 * start of the next group.
 	 */
 	IMPLY(abd_offset == 0 && io_size < zio->io_size,
 	    (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
 	IMPLY(abd_offset != 0,
 	    vdev_draid_group_to_offset(vd, group) == io_offset);
 
 	/* Lookup starting byte offset on each child vdev */
 	uint64_t groupstart, perm;
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    io_offset, &perm, &groupstart);
 
 	/*
 	 * If there is less than groupwidth drives available after the group
 	 * start, the group is going to wrap onto the next row. 'wrap' is the
 	 * group disk number that starts on the next row.
 	 */
 	uint64_t ndisks = vdc->vdc_ndisks;
 	uint64_t groupwidth = vdc->vdc_groupwidth;
 	/* groupwidth is never reached by the loop index: no wrap. */
 	uint64_t wrap = groupwidth;
 
 	if (groupstart + groupwidth > ndisks)
 		wrap = ndisks - groupstart;
 
 	/* The io size in units of the vdev's minimum sector size. */
 	const uint64_t psize = io_size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = psize / vdc->vdc_ndata;
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = psize - q * vdc->vdc_ndata;
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
 	ASSERT3U(bc, <, groupwidth);
 
 	/* The total number of data and parity sectors for this I/O. */
 	uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
 
 	ASSERT3U(vdc->vdc_nparity, >, 0);
 
 	raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth);
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = vdc->vdc_nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = io_offset;
 	rr->rr_size = io_size;
 #endif
 	*rrp = rr;
 
 	/*
 	 * Assign each column its child device, physical offset, and size.
 	 * asize accumulates the column sizes for the sanity check below.
 	 */
 	uint8_t *base;
 	uint64_t iter, asize = 0;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 	for (uint64_t i = 0; i < groupwidth; i++) {
 		raidz_col_t *rc = &rr->rr_col[i];
 		/* Position of this column in the permutation row. */
 		uint64_t c = (groupstart + i) % ndisks;
 
 		/* increment the offset if we wrap to the next row */
 		if (i == wrap)
 			physical_offset += VDEV_DRAID_ROWHEIGHT;
 
 		rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
 		rc->rc_offset = physical_offset;
 
 		/*
 		 * Big columns carry q + 1 sectors and the rest carry q;
 		 * when q is zero the non-big columns are empty.
 		 */
 		if (q == 0 && i >= bc)
 			rc->rc_size = 0;
 		else if (i < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	/* Sectors needed to pad the I/O out to a multiple of groupwidth. */
 	rr->rr_nempty = roundup(tot, groupwidth) - tot;
 	IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
 
 	/* Allocate buffers for the parity columns */
 	for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 	}
 
 	/*
 	 * Map buffers for data columns and allocate/map buffers for skip
 	 * sectors.  There are three distinct cases for dRAID which are
 	 * required to support sequential rebuild.
 	 */
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_draid_map_alloc_write(zio, abd_offset, rr);
 	} else if ((rr->rr_nempty > 0) &&
 	    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 		vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
 	} else {
 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 		vdev_draid_map_alloc_read(zio, abd_offset, rr);
 	}
 
 	return (io_size);
 }
 
 /*
  * Allocate the raidz mapping to be applied to the dRAID I/O.  Parity is
  * computed exactly as for raidz, but the layout differs in two ways.
  *
  * - dRAID always allocates a full stripe width.  The extra padding
  *   sectors are zero filled and written to disk, and are read back
  *   during a scrub or repair because they take part in the parity
  *   calculation.  This property enables sequential resilvering.
  *
  * - A block whose logical range spans two redundancy groups gets two
  *   rows in the raidz_map_t: one at the end of the first group and one
  *   at the start of the following group.
  */
 static raidz_map_t *
 vdev_draid_map_alloc(zio_t *zio)
 {
 	raidz_row_t *rows[2];
 	uint64_t io_offset = zio->io_offset;
 	uint64_t abd_offset = 0;
 	uint64_t remaining = zio->io_size;
 	int nrows = 1;
 
 	uint64_t mapped = vdev_draid_map_alloc_row(zio, &rows[0], io_offset,
 	    abd_offset, remaining);
 
 	if (mapped < remaining) {
 		vdev_t *vd = zio->io_vd;
 
 		/*
 		 * The first row stopped at the end of its group.  Step past
 		 * it in both the logical address space and the zio ABD and
 		 * map what is left as a second row.
 		 */
 		io_offset += vdev_draid_asize(vd, mapped, 0);
 		abd_offset += mapped;
 		remaining -= mapped;
 		nrows = 2;
 
 		ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
 		    vd, vdev_draid_offset_to_group(vd, io_offset)));
 		ASSERT3U(abd_offset, <, zio->io_size);
 		ASSERT3U(remaining, !=, 0);
 
 		mapped = vdev_draid_map_alloc_row(zio, &rows[1],
 		    io_offset, abd_offset, remaining);
 		VERIFY3U(mapped, ==, remaining);
 	}
 
 	/* The map is sized for exactly the rows that were created. */
 	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]),
 	    KM_SLEEP);
 
 	rm->rm_ops = vdev_raidz_math_get_ops();
 	rm->rm_nrows = nrows;
 	for (int r = 0; r < nrows; r++)
 		rm->rm_row[r] = rows[r];
 
 	return (rm);
 }
 
 /*
  * Round an offset into a dRAID up to the next group width aligned
  * offset (a multiple of vdc_groupwidth << vdev_ashift), which can be
  * used to start an allocation.
  */
 static uint64_t
 vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t align = vdc->vdc_groupwidth << vd->vdev_ashift;
 
 	return (roundup(start, align));
 }
 
 /*
  * Allocatable space for dRAID is (children - nspares) * sizeof(smallest
  * child) rounded down to the last full slice, so each child must provide
  * at least 1 / (children - nspares) of its asize.
  *
  * The result is vd->vdev_min_asize divided by vdc_ndisks, rounded up,
  * plus VDEV_DRAID_REFLOW_RESERVE.
  */
 static uint64_t
 vdev_draid_min_asize(vdev_t *vd)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t per_child = (vd->vdev_min_asize + vdc->vdc_ndisks - 1) /
 	    (vdc->vdc_ndisks);
 
 	return (VDEV_DRAID_REFLOW_RESERVE + per_child);
 }
 
 /*
  * dRAID always uses full stripes, so the minimum allocation size is set
  * by the number of data disks in the redundancy group: one sector
  * (1 << vdev_ashift) on each of the vdc_ndata data columns.
  */
 static uint64_t
 vdev_draid_min_alloc(vdev_t *vd)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ndata = vdc->vdc_ndata;
 
 	return (ndata << vd->vdev_ashift);
 }
 
 /*
  * Returns true if the txg range does not exist on any leaf vdev.
  *
  * A dRAID spare does not fit the DTL model: it has child vdevs but no
  * redundancy among them, and which child is in effect depends on the
  * offset.  In essence a vdev_dtl_reassess() is performed on the fly by
  * substituting the child vdev under the offset for the dRAID spare.
  * The process is recursive because that child may itself be another
  * dRAID spare, and so on.
  */
 boolean_t
 vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
     uint64_t size)
 {
 	boolean_t is_mirror = (vd->vdev_ops == &vdev_spare_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops);
 
 	if (is_mirror) {
 		/*
 		 * The data is not missing if any readable child contains
 		 * the txg range; unreadable children are skipped.
 		 */
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			if (vdev_readable(cvd) && !vdev_draid_missing(cvd,
 			    physical_offset, txg, size))
 				return (B_FALSE);
 		}
 
 		return (B_TRUE);
 	}
 
 	/* Anything but a distributed spare is answered by its own DTL. */
 	if (vd->vdev_ops != &vdev_draid_spare_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	/*
 	 * A sequential resilver has no proper txg range, so every txg is
 	 * presumed missing on this vdev until it completes.  Otherwise
 	 * consult DTL_MISSING, which is set for all prior txgs when a
 	 * resilver is started in spa_vdev_attach().
 	 */
 	if (vd->vdev_rebuild_txg != 0 ||
 	    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 		return (B_TRUE);
 
 	/*
 	 * Resolve the spare to the vdev backing this offset.  Either a
 	 * leaf or a spare/replace mirror child may be returned, so the
 	 * answer is obtained by recursing on it.
 	 */
 	vdev_t *cvd = vdev_draid_spare_get_child(vd, physical_offset);
 	if (cvd == NULL)
 		return (B_TRUE);
 
 	return (vdev_draid_missing(cvd, physical_offset, txg, size));
 }
 
 /*
  * Returns true if the txg is only partially replicated on the leaf vdevs,
  * i.e. at least one readable vdev under 'vd' at the given physical offset
  * is missing the txg range.
  */
 static boolean_t
 vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
     uint64_t size)
 {
 	boolean_t is_mirror = (vd->vdev_ops == &vdev_spare_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops);
 
 	if (is_mirror) {
 		/*
 		 * The range is partially replicated if any readable child
 		 * is missing it; unreadable children are skipped.
 		 */
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			if (vdev_readable(cvd) && vdev_draid_partial(cvd,
 			    physical_offset, txg, size))
 				return (B_TRUE);
 		}
 
 		return (B_FALSE);
 	}
 
 	/* Anything but a distributed spare is answered by its own DTL. */
 	if (vd->vdev_ops != &vdev_draid_spare_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	/*
 	 * A sequential resilver has no proper txg range, so every txg is
 	 * presumed missing on this vdev until it completes.  Otherwise
 	 * consult DTL_MISSING, which is set for all prior txgs when a
 	 * resilver is started in spa_vdev_attach().
 	 */
 	if (vd->vdev_rebuild_txg != 0 ||
 	    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 		return (B_TRUE);
 
 	/*
 	 * Resolve the spare to the vdev backing this offset.  Either a
 	 * leaf or a spare/replace mirror child may be returned, so the
 	 * answer is obtained by recursing on it.
 	 */
 	vdev_t *cvd = vdev_draid_spare_get_child(vd, physical_offset);
 	if (cvd == NULL)
 		return (B_TRUE);
 
 	return (vdev_draid_partial(cvd, physical_offset, txg, size));
 }
 
 /*
  * Report whether vd can be read at the given physical offset.  A
  * distributed spare is first resolved to the vdev backing the offset.
  */
 boolean_t
 vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
 {
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		vd = vdev_draid_spare_get_child(vd, physical_offset);
 
 		/* The spare could not be resolved, nothing to read from. */
 		if (vd == NULL)
 			return (B_FALSE);
 	}
 
 	/* Plain vdevs simply report their own readability. */
 	if (vd->vdev_ops != &vdev_spare_ops &&
 	    vd->vdev_ops != &vdev_replacing_ops) {
 		return (vdev_readable(vd));
 	}
 
 	/* Spare/replacing vdev: one readable child is sufficient. */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (vdev_readable(child) &&
 		    vdev_draid_readable(child, physical_offset)) {
 			return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Depth-first search of the vdev tree rooted at vd.  Returns the first
  * distributed spare encountered (vd itself included), or NULL if the
  * tree contains none.
  */
 static vdev_t *
 vdev_draid_find_spare(vdev_t *vd)
 {
 	vdev_t *found = NULL;
 
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (vd);
 
 	/* Stop descending as soon as any subtree yields a spare. */
 	for (int i = 0; found == NULL && i < vd->vdev_children; i++)
 		found = vdev_draid_find_spare(vd->vdev_child[i]);
 
 	return (found);
 }
 
 /*
  * Returns B_TRUE when the passed vdev is "faulted", which here means it
  * is (or, for a distributed spare, resolves beneath) a replacing or
  * sparing vdev.
  */
 static boolean_t
 vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
 {
 	vdev_t *target = vd;
 
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		vdev_t *resolved = vdev_draid_spare_get_child(vd,
 		    physical_offset);
 		if (resolved == NULL)
 			return (B_FALSE);
 
 		/*
 		 * For a distributed spare it is the parent of the vdev it
 		 * resolves to which determines the "faulted" state.
 		 */
 		target = resolved->vdev_parent;
 	}
 
 	if (target->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	return (target->vdev_ops == &vdev_spare_ops);
 }
 
 /*
  * Sequential resilver helper.  Decides whether the redundancy group
  * containing the (group aligned) logical offset is degraded.
  */
 static boolean_t
 vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
 
 	/* Translate to the physical offset and locate the permutation. */
 	uint64_t groupstart, perm;
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    offset, &perm, &groupstart);
 
 	uint8_t *base;
 	uint64_t iter;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	/* Walk every column of the group and inspect the mapped child. */
 	for (uint64_t col = 0; col < vdc->vdc_groupwidth; col++) {
 		uint64_t slot = (groupstart + col) % vdc->vdc_ndisks;
 		uint64_t child_id = vdev_draid_permute_id(vdc, base, iter,
 		    slot);
 		vdev_t *cvd = vd->vdev_child[child_id];
 
 		/*
 		 * The group is degraded when it contains a faulted vdev.
 		 * Groups with an active distributed spare are always
 		 * reported as degraded as well, because any vdev failure
 		 * in the pool will affect them.
 		 */
 		if (vdev_draid_faulted(cvd, physical_offset) ||
 		    vdev_draid_find_spare(cvd) != NULL) {
 			return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Healing resilver helper.  Decides whether the txg range is missing
  * from the redundancy group containing the (group aligned) logical
  * offset.
  */
 static boolean_t
 vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
     uint64_t size)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
 
 	/* Translate to the physical offset and locate the permutation. */
 	uint64_t groupstart, perm;
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    offset, &perm, &groupstart);
 
 	uint8_t *base;
 	uint64_t iter;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	/* Walk every column of the group and inspect the mapped child. */
 	for (uint64_t col = 0; col < vdc->vdc_groupwidth; col++) {
 		uint64_t slot = (groupstart + col) % vdc->vdc_ndisks;
 		uint64_t child_id = vdev_draid_permute_id(vdc, base, iter,
 		    slot);
 		vdev_t *cvd = vd->vdev_child[child_id];
 
 		/*
 		 * Missing when the txg is known to be partially replicated
 		 * on this child.  Groups with an active distributed spare
 		 * are always reported as missing too, because any vdev
 		 * failure in the pool will affect them.
 		 */
 		if (vdev_draid_partial(cvd, physical_offset, txg, size) ||
 		    vdev_draid_find_spare(cvd) != NULL) {
 			return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Compute the smallest child asize / max_asize and the largest logical
  * ashift over the children of a top-level dRAID, then derive the physical
  * ashift from the same children.  Distributed spares are skipped since
  * their capacity is also based off the minimum child size in the
  * top-level dRAID.
  */
 static void
 vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
     uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
 {
 	uint64_t min_asize = 0, min_max_asize = 0;
 	uint64_t lashift = 0, pashift = 0;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	/*
 	 * First pass: sizes and logical ashift.  Subtracting one before the
 	 * MIN() and adding it back afterwards lets the initial value of zero
 	 * wrap to the largest uint64_t, so the first child always wins.
 	 */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (child->vdev_ops != &vdev_draid_spare_ops) {
 			min_asize = MIN(min_asize - 1,
 			    child->vdev_asize - 1) + 1;
 			min_max_asize = MIN(min_max_asize - 1,
 			    child->vdev_max_asize - 1) + 1;
 			lashift = MAX(lashift, child->vdev_ashift);
 		}
 	}
 
 	/*
 	 * Second pass: the physical ashift is chosen by vdev_best_ashift()
 	 * which takes the final logical ashift as an input, hence the
 	 * separate loop.
 	 */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (child->vdev_ops != &vdev_draid_spare_ops) {
 			pashift = vdev_best_ashift(lashift, pashift,
 			    child->vdev_physical_ashift);
 		}
 	}
 
 	*asizep = min_asize;
 	*max_asizep = min_max_asize;
 	*logical_ashiftp = lashift;
 	*physical_ashiftp = pashift;
 }
 
 /*
  * Child filter for vdev_open_children_subset(): selects distributed
  * spares as well as replacing and spare vdevs.
  */
 static boolean_t
 vdev_draid_open_spares(vdev_t *vd)
 {
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (B_TRUE);
 
 	if (vd->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	return (vd->vdev_ops == &vdev_spare_ops);
 }
 
 /*
  * Child filter for vdev_open_children_subset(): selects every child
  * which vdev_draid_open_spares() does not.
  */
 static boolean_t
 vdev_draid_open_children(vdev_t *vd)
 {
 	if (vdev_draid_open_spares(vd))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Open a top-level dRAID vdev.
  *
  * Opens the children, verifies no more than nparity of them failed to
  * open, and reports the allocatable size through asize / max_asize.
  * Returns 0 on success, EINVAL for an inconsistent configuration and
  * ENXIO when too few children are usable.
  */
 static int
 vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t nparity = vdc->vdc_nparity;
 
 	/* A parity level this large, or this few children, is invalid. */
 	if (nparity > VDEV_DRAID_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The normal children are opened before the distributed spares.
 	 * This ordering matters: it ensures the distributed spares
 	 * calculate the correct psize in the event that the dRAID vdevs
 	 * were expanded.
 	 */
 	vdev_open_children_subset(vd, vdev_draid_open_children);
 	vdev_open_children_subset(vd, vdev_draid_open_spares);
 
 	/* More than nparity failed children leaves too few replicas. */
 	int failed = 0;
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (vd->vdev_child[i]->vdev_open_error == 0)
 			continue;
 
 		failed++;
 		if (failed > nparity) {
 			vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 			return (SET_ERROR(ENXIO));
 		}
 	}
 
 	/*
 	 * Allocatable capacity is derived from the smallest child: the
 	 * VDEV_DRAID_REFLOW_RESERVE scratch area at the end of each child
 	 * is removed, the remainder is rounded down to a full row, and the
 	 * total over vdc_ndisks is rounded down to a full group.
 	 */
 	uint64_t per_child, per_child_max;
 	vdev_draid_calculate_asize(vd, &per_child, &per_child_max,
 	    logical_ashift, physical_ashift);
 
 	/* Guard the subtractions below against underflow. */
 	if (per_child < VDEV_DRAID_REFLOW_RESERVE ||
 	    per_child_max < VDEV_DRAID_REFLOW_RESERVE) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	per_child -= VDEV_DRAID_REFLOW_RESERVE;
 	per_child -= per_child % VDEV_DRAID_ROWHEIGHT;
 
 	per_child_max -= VDEV_DRAID_REFLOW_RESERVE;
 	per_child_max -= per_child_max % VDEV_DRAID_ROWHEIGHT;
 
 	uint64_t total = per_child * vdc->vdc_ndisks;
 	uint64_t total_max = per_child_max * vdc->vdc_ndisks;
 
 	*asize = total - (total % vdc->vdc_groupsz);
 	*max_asize = total_max - (total_max % vdc->vdc_groupsz);
 
 	return (0);
 }
 
 /*
  * Close a top-level dRAID vdev by closing each child that is present.
  */
 static void
 vdev_draid_close(vdev_t *vd)
 {
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		/* Skip empty child slots. */
 		if (child == NULL)
 			continue;
 
 		vdev_close(child);
 	}
 }
 
 /*
  * Return the maximum asize for a rebuild zio in the provided range
  * given the following constraints.  A dRAID chunk may not:
  *
  * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
  * - Span dRAID redundancy groups.
  */
 static uint64_t
 vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
     uint64_t max_segment)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t ashift = vd->vdev_ashift;
 	uint64_t ndata = vdc->vdc_ndata;
 
 	/* Start from max_segment per data column, capped at a max block. */
 	uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
 	    SPA_MAXBLOCKSIZE);
 
 	ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
 	ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
 
 	/*
 	 * Chunks must evenly span all data columns in the group, so trim
 	 * the sector count down to a multiple of ndata.
 	 */
 	uint64_t sectors = psize >> ashift;
 	sectors -= sectors % ndata;
 	psize = sectors << ashift;
 
 	/* Never return more than the caller supplied range. */
 	uint64_t limit = vdev_psize_to_asize(vd, psize);
 	uint64_t chunk_size = (asize < limit) ? asize : limit;
 
 	/* Nor more than what remains of the group containing start. */
 	uint64_t group = vdev_draid_offset_to_group(vd, start);
 	uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
 	if (left < chunk_size)
 		chunk_size = left;
 
 	ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
 	ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
 	    vdev_draid_offset_to_group(vd, start + chunk_size - 1));
 
 	return (chunk_size);
 }
 
 /*
  * Adjust a metaslab for dRAID: move its start up to the group aligned
  * offset and shrink its size to a multiple of the group width.  dRAID
  * requires full stripe writes so the trimmed space is unallocable, and
  * vdev initialize and TRIM operate on metaslab boundaries which
  * vdev_xlate() expects to be aligned.
  */
 static void
 vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	/* Bytes in one full-width stripe across the group. */
 	uint64_t stripe = vdc->vdc_groupwidth << vd->vdev_ashift;
 	uint64_t aligned = vdev_draid_get_astart(vd, *ms_start);
 
 	/* Space left after skipping the unaligned head of the metaslab. */
 	uint64_t usable = *ms_size - (aligned - *ms_start);
 	usable -= usable % stripe;
 
 	*ms_start = aligned;
 	*ms_size = usable;
 
 	ASSERT0(*ms_start % stripe);
 	ASSERT0(*ms_size % stripe);
 }
 
 /*
  * Append the virtual dRAID spares to ZPOOL_CONFIG_SPARES in nvroot.
  *
  * One entry is generated per distributed spare of every dRAID child of
  * vd.  The existing spares array is duplicated, extended and then
  * installed in place of the old one.  The number of dRAID children found
  * is returned through ndraidp.  Always returns 0.
  */
 int
 vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
     uint64_t next_vdev_id)
 {
 	uint64_t dspare_total = 0;
 	uint64_t draid_count = 0;
 
 	/* Tally the dRAID children and the spares they contribute. */
 	for (uint64_t c = 0; c < vd->vdev_children; c++) {
 		vdev_t *tvd = vd->vdev_child[c];
 
 		if (tvd->vdev_ops != &vdev_draid_ops)
 			continue;
 
 		vdev_draid_config_t *cfg = tvd->vdev_tsd;
 		dspare_total += cfg->vdc_nspares;
 		draid_count++;
 	}
 
 	/* Nothing to add, leave the spares array untouched. */
 	if (dspare_total == 0) {
 		*ndraidp = draid_count;
 		return (0);
 	}
 
 	/* A failed lookup is treated as an empty existing spares array. */
 	nvlist_t **old_spares, **new_spares;
 	uint_t old_nspares;
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &old_spares, &old_nspares) != 0) {
 		old_nspares = 0;
 	}
 
 	/* Room for the old and the new entries; old ones are duplicated. */
 	new_spares = kmem_alloc(sizeof (nvlist_t *) *
 	    (dspare_total + old_nspares), KM_SLEEP);
 	for (uint_t s = 0; s < old_nspares; s++)
 		new_spares[s] = fnvlist_dup(old_spares[s]);
 
 	/* Generate the distributed spare entries after the old ones. */
 	uint64_t n = old_nspares;
 	for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
 		vdev_t *tvd = vd->vdev_child[vdev_id];
 		char path[64];
 
 		if (tvd->vdev_ops != &vdev_draid_ops)
 			continue;
 
 		vdev_draid_config_t *cfg = tvd->vdev_tsd;
 		uint64_t nspares = cfg->vdc_nspares;
 		uint64_t nparity = cfg->vdc_nparity;
 
 		for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
 			/*
 			 * The path encodes the parity level, the id the
 			 * top-level vdev will have and the spare id.
 			 */
 			memset(path, 0, sizeof (path));
 			(void) snprintf(path, sizeof (path) - 1,
 			    "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
 			    (u_longlong_t)nparity,
 			    (u_longlong_t)next_vdev_id + vdev_id,
 			    (u_longlong_t)spare_id);
 
 			nvlist_t *entry = fnvlist_alloc();
 			fnvlist_add_string(entry, ZPOOL_CONFIG_PATH, path);
 			fnvlist_add_string(entry, ZPOOL_CONFIG_TYPE,
 			    VDEV_TYPE_DRAID_SPARE);
 			fnvlist_add_uint64(entry, ZPOOL_CONFIG_TOP_GUID,
 			    tvd->vdev_guid);
 			fnvlist_add_uint64(entry, ZPOOL_CONFIG_SPARE_ID,
 			    spare_id);
 			fnvlist_add_uint64(entry, ZPOOL_CONFIG_IS_LOG, 0);
 			fnvlist_add_uint64(entry, ZPOOL_CONFIG_IS_SPARE, 1);
 			fnvlist_add_uint64(entry, ZPOOL_CONFIG_WHOLE_DISK, 1);
 			fnvlist_add_uint64(entry, ZPOOL_CONFIG_ASHIFT,
 			    tvd->vdev_ashift);
 
 			new_spares[n++] = entry;
 		}
 	}
 
 	/* Swap the combined array in for the previous one. */
 	if (n > 0) {
 		(void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    (const nvlist_t **)new_spares, n);
 	}
 
 	/* Release the temporary entries and the temporary array. */
 	for (int i = 0; i < n; i++)
 		nvlist_free(new_spares[i]);
 
 	kmem_free(new_spares, sizeof (*new_spares) * n);
 	*ndraidp = draid_count;
 
 	return (0);
 }
 
 /*
  * Decide whether the block described by dva / psize needs to be
  * resilvered, i.e. whether any portion of it resides on a child vdev
  * with a dirty DTL.
  */
 static boolean_t
 vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t asize = vdev_draid_asize(vd, psize, 0);
 
 	/*
 	 * Sequential resilver.  With no meaningful phys_birth the only
 	 * available test is whether the block lives in a degraded group.
 	 * The block is expected to be contained in a single group.
 	 */
 	if (phys_birth == TXG_UNKNOWN) {
 		ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
 		    vdev_draid_offset_to_group(vd, offset + asize - 1));
 
 		return (vdev_draid_group_degraded(vd, offset));
 	}
 
 	/*
 	 * Healing resilver.  A txg which is not in DTL_PARTIAL is intact
 	 * and never needs to be resilvered.
 	 */
 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 		return (B_FALSE);
 
 	/* Check the group in which the block starts. */
 	if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
 		return (B_TRUE);
 
 	/*
 	 * The block may span groups, in which case a second group is
 	 * checked as well.
 	 * NOTE(review): the span test uses offset + asize - 1 while the
 	 * lookup uses offset + asize; confirm both always name the same
 	 * group.
 	 */
 	boolean_t spans = (vdev_draid_offset_to_group(vd, offset) !=
 	    vdev_draid_offset_to_group(vd, offset + asize - 1));
 
 	return (spans &&
 	    vdev_draid_group_missing(vd, offset + asize, phys_birth, 1));
 }
 
 /*
  * Returns B_TRUE when vd, or any vdev beneath it, is a leaf with a
  * non-zero vdev_rebuild_txg.
  */
 static boolean_t
 vdev_draid_rebuilding(vdev_t *vd)
 {
 	boolean_t active =
 	    (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg);
 
 	/* Descend only until the first rebuilding leaf is found. */
 	for (int c = 0; !active && c < vd->vdev_children; c++)
 		active = vdev_draid_rebuilding(vd->vdev_child[c]);
 
 	return (active);
 }
 
 /*
  * Debug-only consistency check.  Translates the logical range of the row
  * for the child backing column col and asserts the result matches the
  * offset and size recorded in that column.  Compiles to nothing unless
  * ZFS_DEBUG is defined.
  */
 static void
 vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
 {
 #ifdef ZFS_DEBUG
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 	range_seg64_t logical_rs, physical_rs, remain_rs;
 
 	/* Logical range covered by the whole row. */
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
 	    vdev_draid_asize(vd, rr->rr_size, 0);
 
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 
 	/* Fully translated, and the physical range matches the column. */
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 	ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
 #endif
 }
 
 /*
  * Start the write of a single row:
  * 1. Generate the parity for the row.
  * 2. Issue a child write zio for every column, data and parity alike.
  *    A gang ABD is allocated by vdev_draid_map_alloc() if a skip sector
  *    needs to be added to a column.
  */
 static void
 vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	vdev_t *vd = zio->io_vd;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int col = 0; col < rr->rr_cols; col++) {
 		raidz_col_t *rc = &rr->rr_col[col];
 
 		/*
 		 * No column may be empty here: empty columns are zero
 		 * filled, take part in the parity calculation and must
 		 * therefore be written like any other.
 		 */
 		ASSERT3U(rc->rc_size, !=, 0);
 
 		/* Cross-check the physical to logical translation. */
 		vdev_draid_io_verify(vd, rr, col);
 
 		zio_t *cio = zio_vdev_child_io(zio, NULL,
 		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
 		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
 		    0, vdev_raidz_child_done, rc);
 		zio_nowait(cio);
 	}
 }
 
 /*
  * For read operations:
  * 1. The vdev_draid_map_alloc() function will create a minimal raidz
  *    mapping for the read based on the zio->io_flags.  There are two
  *    possible mappings either 1) a normal read, or 2) a scrub/resilver.
  * 2. Create the zio read operations.  This will include all parity
  *    columns and skip sectors for a scrub/resilver.
  *
  * The work is done in two passes over the columns of the row.  The first
  * pass classifies each column (unreadable, stale, empty, or readable) and
  * sets the repair flags; the second pass issues the child reads for the
  * columns which were not marked with an error.
  */
 static void
 vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/* Sequential rebuild must do IO at redundancy group boundary. */
 	IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last.  Any errors along the way will force us to read the parity.
 	 * For scrub/resilver IOs which verify skip sectors, a gang ABD will
 	 * have been allocated to store them and rc->rc_size is increased.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/*
 		 * The child cannot be read at this offset: count the column
 		 * as missing and mark it as tried so no IO is issued.
 		 */
 		if (!vdev_draid_readable(cvd, rc->rc_offset)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		/*
 		 * The child is readable but is missing this txg, so whatever
 		 * it holds is stale.  Unlike the case above rc_tried is left
 		 * unset.
 		 */
 		if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		/*
 		 * Empty columns may be read during vdev_draid_io_done().
 		 * Only skip them after the readable and missing checks
 		 * verify they are available.
 		 */
 		if (rc->rc_size == 0) {
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		if (zio->io_flags & ZIO_FLAG_RESILVER) {
 			vdev_t *svd;
 
 			/*
 			 * Sequential rebuilds need to always consider the data
 			 * on the child being rebuilt to be stale.  This is
 			 * important when all columns are available to aid
 			 * known reconstruction in identifying which columns
 			 * contain incorrect data.
 			 *
 			 * Furthermore, all repairs need to be constrained to
 			 * the devices being rebuilt because without a checksum
 			 * we cannot verify the data is actually correct and
 			 * performing an incorrect repair could result in
 			 * locking in damage and making the data unrecoverable.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				if (vdev_draid_rebuilding(cvd)) {
 					if (c >= rr->rr_firstdatacol)
 						rr->rr_missingdata++;
 					else
 						rr->rr_missingparity++;
 					rc->rc_error = SET_ERROR(ESTALE);
 					rc->rc_skipped = 1;
 					rc->rc_allow_repair = 1;
 					continue;
 				} else {
 					rc->rc_allow_repair = 0;
 				}
 			} else {
 				/* Healing resilver, repairs are allowed. */
 				rc->rc_allow_repair = 1;
 			}
 
 			/*
 			 * If this child is a distributed spare then the
 			 * offset might reside on the vdev being replaced.
 			 * In which case this data must be written to the
 			 * new device.  Failure to do so would result in
 			 * checksum errors when the old device is detached
 			 * and the pool is scrubbed.
 			 */
 			if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
 				svd = vdev_draid_spare_get_child(svd,
 				    rc->rc_offset);
 				if (svd && (svd->vdev_ops == &vdev_spare_ops ||
 				    svd->vdev_ops == &vdev_replacing_ops)) {
 					rc->rc_force_repair = 1;
 
 					if (vdev_draid_rebuilding(svd))
 						rc->rc_allow_repair = 1;
 				}
 			}
 
 			/*
 			 * Always issue a repair IO to this child when it's
 			 * a spare or replacing vdev with an active rebuild.
 			 */
 			if ((cvd->vdev_ops == &vdev_spare_ops ||
 			    cvd->vdev_ops == &vdev_replacing_ops) &&
 			    vdev_draid_rebuilding(cvd)) {
 				rc->rc_force_repair = 1;
 				rc->rc_allow_repair = 1;
 			}
 		}
 	}
 
 	/*
 	 * Either a parity or data column is missing this means a repair
 	 * may be attempted by vdev_draid_io_done().  Expand the raid map
 	 * to read in empty columns which are needed along with the parity
 	 * during reconstruction.
 	 */
 	if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
 	    rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
 		vdev_draid_map_alloc_empty(zio, rr);
 	}
 
 	/*
 	 * Second pass: issue the child reads.  Columns flagged with an error
 	 * above, or which are still empty, are skipped.  Data columns are
 	 * always read; parity columns are only read when data is missing or
 	 * for scrub/resilver IO.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		if (rc->rc_error || rc->rc_size == 0)
 			continue;
 
 		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 }
 
 /*
  * Start an IO operation to a dRAID vdev.  The raidz map is allocated and
  * attached to the zio, every row is then started as either a write or a
  * read, and finally the zio is executed.
  */
 static void
 vdev_draid_io_start(zio_t *zio)
 {
 	vdev_t *vd __maybe_unused = zio->io_vd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
 
 	raidz_map_t *rm = vdev_draid_map_alloc(zio);
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 
 	boolean_t is_write = (zio->io_type == ZIO_TYPE_WRITE);
 
 	/* Anything which is not a write is expected to be a read. */
 	if (!is_write) {
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 	}
 
 	for (int row = 0; row < rm->rm_nrows; row++) {
 		if (is_write)
 			vdev_draid_io_start_write(zio, rm->rm_row[row]);
 		else
 			vdev_draid_io_start_read(zio, rm->rm_row[row]);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Complete an IO operation on a dRAID vdev.  The raidz logic can be applied
  * to dRAID since the layout is fully described by the raidz_map_t, so this
  * simply forwards the zio to vdev_raidz_io_done().
  */
 static void
 vdev_draid_io_done(zio_t *zio)
 {
 	vdev_raidz_io_done(zio);
 }
 
 /*
  * Update the state of a top-level dRAID vdev from the number of faulted
  * and degraded children:
  * - more faulted children than parity  -> CANT_OPEN (no replicas)
  * - any faulted or degraded child       -> DEGRADED
  * - otherwise                           -> HEALTHY
  */
 static void
 vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	ASSERT(vd->vdev_ops == &vdev_draid_ops);
 
 	if (faulted > vdc->vdc_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded + faulted != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
 /*
  * Translate a logical range on the parent dRAID vdev into the physical
  * range it occupies on the child cvd.
  *
  * At most one redundancy group is translated per call.  physical_rs
  * receives the range on cvd, which is empty (start == end) when cvd is not
  * a member of the group or when the logical start is not group aligned.
  * remain_rs receives whatever part of logical_rs was not translated.
  */
 static void
 vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
 {
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = raidvd->vdev_tsd;
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 
 	/* Make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 
 	uint64_t logical_start = logical_rs->rs_start;
 	uint64_t logical_end = logical_rs->rs_end;
 
 	/*
 	 * Unaligned ranges must be skipped. All metaslabs are correctly
 	 * aligned so this should not happen, but this case is handled in
 	 * case it's needed by future callers.
 	 */
 	uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
 	if (astart != logical_start) {
 		/* Empty physical range; resume from the aligned start. */
 		physical_rs->rs_start = logical_start;
 		physical_rs->rs_end = logical_start;
 		remain_rs->rs_start = MIN(astart, logical_end);
 		remain_rs->rs_end = logical_end;
 		return;
 	}
 
 	/*
 	 * Unlike with mirrors and raidz a dRAID logical range can map
 	 * to multiple non-contiguous physical ranges. This is handled by
 	 * limiting the size of the logical range to a single group and
 	 * setting the remain argument such that it describes the remaining
 	 * unmapped logical range. This is stricter than absolutely
 	 * necessary but helps simplify the logic below.
 	 */
 	uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
 	uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
 	if (logical_end > nextstart)
 		logical_end = nextstart;
 
 	/* Find the starting offset for each vdev in the group */
 	uint64_t perm, groupstart;
 	uint64_t start = vdev_draid_logical_to_physical(raidvd,
 	    logical_start, &perm, &groupstart);
 	uint64_t end = start;
 
 	uint8_t *base;
 	uint64_t iter, id;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	/*
 	 * Check if the passed child falls within the group.  If it does
 	 * update the start and end to reflect the physical range.
 	 * Otherwise, leave them unmodified which will result in an empty
 	 * (zero-length) physical range being returned.
 	 */
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
 
 		if (c == 0 && i != 0) {
 			/* the group wrapped, increment the start */
 			start += VDEV_DRAID_ROWHEIGHT;
 			end = start;
 		}
 
 		id = vdev_draid_permute_id(vdc, base, iter, c);
 		if (id == cvd->vdev_id) {
 			/*
 			 * b_size is the logical length in sectors; dividing
 			 * it by the group width (rounding up) gives the
 			 * number of sectors on this one child.
 			 */
 			uint64_t b_size = (logical_end >> ashift) -
 			    (logical_start >> ashift);
 			ASSERT3U(b_size, >, 0);
 			end = start + ((((b_size - 1) /
 			    vdc->vdc_groupwidth) + 1) << ashift);
 			break;
 		}
 	}
 	physical_rs->rs_start = start;
 	physical_rs->rs_end = end;
 
 	/*
 	 * Only top-level vdevs are allowed to set remain_rs because
 	 * when .vdev_op_xlate() is called for their children the full
 	 * logical range is not provided by vdev_xlate().
 	 */
 	remain_rs->rs_start = logical_end;
 	remain_rs->rs_end = logical_rs->rs_end;
 
 	/* The physical range never starts after, nor exceeds, the logical. */
 	ASSERT3U(physical_rs->rs_start, <=, logical_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_end - logical_start);
 }
 
 /*
  * Store the dRAID specific settings of vd (parity, data, spare and group
  * counts) in the config nvlist.
  */
 static void
 vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	vdev_draid_config_t *cfg;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	cfg = vd->vdev_tsd;
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, cfg->vdc_nparity);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, cfg->vdc_ndata);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, cfg->vdc_nspares);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, cfg->vdc_ngroups);
 }
 
 /*
  * Initialize private dRAID specific fields from the nvlist.
  *
  * The data, parity, spare and group counts as well as the child array are
  * read from nv and validated.  On success a newly allocated
  * vdev_draid_config_t, including its permutation table, is returned
  * through tsd and 0 is returned.  Any missing or inconsistent value
  * results in EINVAL and leaves tsd untouched.
  */
 static int
 vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	(void) spa;
 	uint64_t ndata, nparity, nspares, ngroups;
 	int error;
 
 	/*
 	 * ndata is later used as a divisor and is summed with nparity and
 	 * nspares below.  Reject zero, and bound the value by the maximum
 	 * number of children so those sums cannot wrap.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) ||
 	    ndata == 0 || ndata > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
 	    nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	uint_t children;
 	nvlist_t **child;
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0 || children == 0 ||
 	    children > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Make sure a single group fits in the children before subtracting,
 	 * otherwise the unsigned difference wraps and the upper bound on
 	 * nspares would be meaningless.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
 	    nspares > 100 || (ndata + nparity) > children ||
 	    nspares > (children - (ndata + nparity))) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
 	    ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Validate the minimum number of children exist per group for the
 	 * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
 	 */
 	if (children < (ndata + nparity + nspares))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Create the dRAID configuration using the pool nvlist configuration
 	 * and the fixed mapping for the correct number of children.
 	 */
 	vdev_draid_config_t *vdc;
 	const draid_map_t *map;
 
 	error = vdev_draid_lookup_map(children, &map);
 	if (error)
 		return (SET_ERROR(EINVAL));
 
 	vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
 	vdc->vdc_ndata = ndata;
 	vdc->vdc_nparity = nparity;
 	vdc->vdc_nspares = nspares;
 	vdc->vdc_children = children;
 	vdc->vdc_ngroups = ngroups;
 	vdc->vdc_nperms = map->dm_nperms;
 
 	/* The config is released again if the permutations can't be built. */
 	error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
 	if (error) {
 		kmem_free(vdc, sizeof (*vdc));
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Derived constants.
 	 */
 	vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
 	vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
 	vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
 	vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
 	    vdc->vdc_ndisks;
 
 	ASSERT3U(vdc->vdc_groupwidth, >=, 2);
 	ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
 	ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
 	ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
 	ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
 	ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
 	    vdc->vdc_ndisks, ==, 0);
 
 	*tsd = vdc;
 
 	return (0);
 }
 
 /*
  * Release the private dRAID configuration of vd: first the permutation
  * table (one byte per child for each permutation), then the config.
  */
 static void
 vdev_draid_fini(vdev_t *vd)
 {
 	vdev_draid_config_t *cfg = vd->vdev_tsd;
 	size_t perms_size = sizeof (uint8_t) *
 	    cfg->vdc_children * cfg->vdc_nperms;
 
 	vmem_free(cfg->vdc_perms, perms_size);
 	kmem_free(cfg, sizeof (*cfg));
 }
 
 /*
  * Return the parity level stored in the dRAID configuration of vd.
  */
 static uint64_t
 vdev_draid_nparity(vdev_t *vd)
 {
 	return (((vdev_draid_config_t *)vd->vdev_tsd)->vdc_nparity);
 }
 
 /*
  * Return vdc_ndisks from the dRAID configuration of vd, which is the
  * number of children less the distributed spares.
  */
 static uint64_t
 vdev_draid_ndisks(vdev_t *vd)
 {
 	return (((vdev_draid_config_t *)vd->vdev_tsd)->vdc_ndisks);
 }
 
 /*
  * Operations vector for top-level dRAID vdevs.  The hold, rele and remap
  * operations are not provided, and the vdev is not a leaf.
  */
 vdev_ops_t vdev_draid_ops = {
 	.vdev_op_init = vdev_draid_init,
 	.vdev_op_fini = vdev_draid_fini,
 	.vdev_op_open = vdev_draid_open,
 	.vdev_op_close = vdev_draid_close,
 	.vdev_op_asize = vdev_draid_asize,
 	.vdev_op_min_asize = vdev_draid_min_asize,
 	.vdev_op_min_alloc = vdev_draid_min_alloc,
 	.vdev_op_io_start = vdev_draid_io_start,
 	.vdev_op_io_done = vdev_draid_io_done,
 	.vdev_op_state_change = vdev_draid_state_change,
 	.vdev_op_need_resilver = vdev_draid_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_draid_xlate,
 	.vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
 	.vdev_op_metaslab_init = vdev_draid_metaslab_init,
 	.vdev_op_config_generate = vdev_draid_config_generate,
 	.vdev_op_nparity = vdev_draid_nparity,
 	.vdev_op_ndisks = vdev_draid_ndisks,
 	.vdev_op_type = VDEV_TYPE_DRAID,
 	.vdev_op_leaf = B_FALSE,
 };
 
 
 /*
  * A dRAID distributed spare is a virtual leaf vdev which is included in the
  * parent dRAID configuration.  The last N columns of the dRAID permutation
  * table are used to determine on which dRAID children a specific offset
  * should be written.  These spare leaf vdevs can only be used to replace
  * faulted children in the same dRAID configuration.
  */
 
 /*
  * Distributed spare state, stored in vdev_tsd of the distributed spare.
  * Note that vds_draid_vdev is reset to NULL by vdev_draid_spare_close(),
  * which is how a closed spare is recognized.
  */
 typedef struct {
 	vdev_t *vds_draid_vdev;		/* top-level parent dRAID vdev */
 	uint64_t vds_top_guid;		/* top-level parent dRAID guid */
 	uint64_t vds_spare_id;		/* spare id (0 - vdc->vdc_nspares-1) */
 } vdev_draid_spare_t;
 
 /*
  * Returns the parent dRAID vdev to which the distributed spare belongs.
  * The cached pointer is used when set; otherwise the parent is looked up
  * by guid from the root vdev, which makes this safe to call even when
  * the spare is not open.
  */
 vdev_t *
 vdev_draid_spare_get_parent(vdev_t *vd)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	vdev_t *parent = vds->vds_draid_vdev;
 	if (parent == NULL) {
 		parent = vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
 		    vds->vds_top_guid);
 	}
 
 	return (parent);
 }
 
 /*
  * A dRAID spare is active when it's the child of a vdev using the
  * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.  A spare without
  * a parent is never active.
  */
 static boolean_t
 vdev_draid_spare_is_active(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	if (parent == NULL)
 		return (B_FALSE);
 
 	return (parent->vdev_ops == &vdev_spare_ops ||
 	    parent->vdev_ops == &vdev_replacing_ops ||
 	    parent->vdev_ops == &vdev_draid_ops);
 }
 
 /*
  * Given a dRAID distributed spare vdev, returns the child vdev of the
  * parent dRAID on which the provided offset resides.  If the selected
  * child is itself a distributed spare the lookup is repeated on it with
  * the same offset until a child of another type is found.  Returns NULL
  * if any spare visited along the way is closed.  Note that offset is
  * relative to this vdev.
  */
 vdev_t *
 vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
 {
 	for (;;) {
 		vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 		ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 		/* No cached parent means the vdev is closed */
 		vdev_t *tvd = vds->vds_draid_vdev;
 		if (tvd == NULL)
 			return (NULL);
 
 		vdev_draid_config_t *vdc = tvd->vdev_tsd;
 
 		ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
 		ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
 
 		uint8_t *base;
 		uint64_t iter;
 		uint64_t perm = physical_offset / vdc->vdc_devslicesz;
 
 		vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 		/* Spares are counted back from the last child index. */
 		uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
 		    (tvd->vdev_children - 1) - vds->vds_spare_id);
 		vdev_t *cvd = tvd->vdev_child[cid];
 
 		if (cvd->vdev_ops != &vdev_draid_spare_ops)
 			return (cvd);
 
 		/* Landed on another distributed spare, resolve it too. */
 		vd = cvd;
 	}
 }
 
 /*
  * Close the distributed spare by dropping the cached pointer to its
  * parent dRAID vdev.
  */
 static void
 vdev_draid_spare_close(vdev_t *vd)
 {
 	((vdev_draid_spare_t *)vd->vdev_tsd)->vds_draid_vdev = NULL;
 }
 
 /*
  * Open a dRAID spare device.  The parent dRAID top-level vdev is found by
  * looking up the guid stored in the spare's private state beneath the
  * pool's root vdev.  On success the reported sizes are derived from the
  * parent's freshly calculated asize and the parent pointer is cached.
  * Returns 0 on success or EINVAL if the parent is missing or invalid.
  */
 static int
 vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 	vdev_t *tvd = vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
 	    vds->vds_top_guid);
 
 	if (tvd == NULL) {
 		/* A spare with a parent must be able to find its dRAID. */
 		if (vd->vdev_parent != NULL)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * When spa_vdev_add() is labeling new spares the
 		 * associated dRAID is not attached to the root vdev
 		 * nor does this spare have a parent.  Simulate a valid
 		 * device in order to allow the label to be initialized
 		 * and the distributed spare added to the configuration.
 		 */
 		*max_psize = SPA_MINDEVSIZE;
 		*psize = SPA_MINDEVSIZE;
 		*physical_ashift = ASHIFT_MIN;
 		*logical_ashift = ASHIFT_MIN;
 		return (0);
 	}
 
 	vdev_draid_config_t *vdc = tvd->vdev_tsd;
 
 	/* The parent must be a configured dRAID which owns this spare id. */
 	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL ||
 	    vds->vds_spare_id >= vdc->vdc_nspares)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Neither tvd->vdev_asize or tvd->vdev_max_asize can be used here
 	 * because the caller may be vdev_draid_open() in which case the
 	 * values are stale as they haven't yet been updated by vdev_open().
 	 * To avoid this always recalculate the dRAID asize and max_asize.
 	 */
 	uint64_t asize, max_asize;
 
 	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
 	    logical_ashift, physical_ashift);
 
 	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 
 	vds->vds_draid_vdev = tvd;
 
 	return (0);
 }
 
 /*
  * Done callback for a child IO issued on behalf of a distributed spare.
  * The child's result is recorded in the parent zio (passed as io_private)
  * as if the parent had performed the operation itself.  Only the first
  * error is kept when several children report one.
  */
 static void
 vdev_draid_spare_child_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_private;
 
 	/*
 	 * IOs are issued to non-writable vdevs in order to keep their
 	 * DTLs accurate.  However, we don't want to propagate the
 	 * error in to the distributed spare's DTL.  When resilvering
 	 * vdev_draid_need_resilver() will consult the relevant DTL
 	 * to determine if the data is missing and must be repaired.
 	 */
 	if (vdev_writeable(zio->io_vd) && pio->io_error == 0)
 		pio->io_error = zio->io_error;
 }
 
 /*
  * Build and return a label nvlist for the distributed spare vdev.  This
  * is used to bypass the IO pipeline to avoid the complexity of
  * constructing a complete label with valid checksum to return when read.
  * The nvlist is newly allocated here and handed to the caller.
  */
 nvlist_t *
 vdev_draid_read_config_spare(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 	uint64_t guid = vd->vdev_guid;
 	uint64_t pool_state;
 
 	nvlist_t *nv = fnvlist_alloc();
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
 	fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
 
 	if (vdev_draid_spare_is_active(vd))
 		pool_state = POOL_STATE_ACTIVE;
 	else
 		pool_state = POOL_STATE_SPARE;
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, pool_state);
 
 	/*
 	 * Prefer the guid of the distributed spare with the same path in
 	 * the pool's spare list; fall back to this vdev's own guid.
 	 */
 	for (int i = 0; i < sav->sav_count; i++) {
 		vdev_t *svd = sav->sav_vdevs[i];
 
 		if (svd->vdev_ops != &vdev_draid_spare_ops)
 			continue;
 
 		if (strcmp(svd->vdev_path, vd->vdev_path) != 0)
 			continue;
 
 		guid = svd->vdev_guid;
 		break;
 	}
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
 
 	return (nv);
 }
 
 /*
  * Handle any ioctl requested of the distributed spare.  Only flushes
  * are supported in which case all children must be flushed.
  *
  * A child zio mirroring the original offset, buffer, size, type and
  * priority is issued without waiting to each entry of vd->vdev_child.
  * Each child uses vdev_draid_spare_child_done() as its done callback with
  * the original zio as the callback argument.  The value returned here is
  * stored in zio->io_error by vdev_draid_spare_io_start().
  */
 static int
 vdev_draid_spare_ioctl(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	int error = 0;
 
-	if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
-		for (int c = 0; c < vd->vdev_children; c++) {
-			zio_nowait(zio_vdev_child_io(zio, NULL,
-			    vd->vdev_child[c], zio->io_offset, zio->io_abd,
-			    zio->io_size, zio->io_type, zio->io_priority, 0,
-			    vdev_draid_spare_child_done, zio));
-		}
-	} else {
-		error = SET_ERROR(ENOTSUP);
+	for (int c = 0; c < vd->vdev_children; c++) {
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
+		    zio->io_size, zio->io_type, zio->io_priority, 0,
+		    vdev_draid_spare_child_done, zio));
 	}
 
 	return (error);
 }
 
 /*
  * Initiate an IO to the distributed spare.  For normal IOs this entails using
  * the zio->io_offset and permutation table to calculate which child dRAID vdev
  * is responsible for the data.  Then passing along the zio to that child to
  * perform the actual IO.  The label ranges are not stored on disk and require
  * some special handling which is described below.
  *
  * Failures are reported through zio->io_error rather than a return value.
  */
 static void
 vdev_draid_spare_io_start(zio_t *zio)
 {
 	vdev_t *cvd = NULL, *vd = zio->io_vd;
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 	/* Offset with the leading label region removed, used for child IO. */
 	uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
 	 * Nothing to be done here but return failure.
 	 */
 	if (vds == NULL) {
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_IOCTL:
 		zio->io_error = vdev_draid_spare_ioctl(zio);
 		break;
 
 	case ZIO_TYPE_WRITE:
 		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
 			/*
 			 * Accept probe IOs and config writers to simulate the
 			 * existence of an on disk label.  vdev_label_sync(),
 			 * vdev_uberblock_sync() and vdev_copy_uberblocks()
 			 * skip the distributed spares.  This only leaves
 			 * vdev_label_init() which is allowed to succeed to
 			 * avoid adding special cases to the function.
 			 */
 			if (zio->io_flags & ZIO_FLAG_PROBE ||
 			    zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
 				zio->io_error = 0;
 			} else {
 				zio->io_error = SET_ERROR(EIO);
 			}
 		} else {
 			cvd = vdev_draid_spare_get_child(vd, offset);
 
 			/* NULL means a spare in the chain is closed. */
 			if (cvd == NULL) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 				    offset, zio->io_abd, zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_draid_spare_child_done, zio));
 			}
 		}
 		break;
 
 	case ZIO_TYPE_READ:
 		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
 			/*
 			 * Accept probe IOs to simulate the existence of a
 			 * label.  vdev_label_read_config() bypasses the
 			 * pipeline to read the label configuration and
 			 * vdev_uberblock_load() skips distributed spares
 			 * when attempting to locate the best uberblock.
 			 */
 			if (zio->io_flags & ZIO_FLAG_PROBE) {
 				zio->io_error = 0;
 			} else {
 				zio->io_error = SET_ERROR(EIO);
 			}
 		} else {
 			cvd = vdev_draid_spare_get_child(vd, offset);
 
 			/* Unlike writes, reads also require a readable child. */
 			if (cvd == NULL || !vdev_readable(cvd)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 				    offset, zio->io_abd, zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_draid_spare_child_done, zio));
 			}
 		}
 		break;
 
 	case ZIO_TYPE_TRIM:
 		/* The vdev label ranges are never trimmed */
 		ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));
 
 		cvd = vdev_draid_spare_get_child(vd, offset);
 
 		/* The trim is only passed on to a child with vdev_has_trim. */
 		if (cvd == NULL || !cvd->vdev_has_trim) {
 			zio->io_error = SET_ERROR(ENXIO);
 		} else {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    offset, zio->io_abd, zio->io_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_draid_spare_child_done, zio));
 		}
 		break;
 
 	default:
 		zio->io_error = SET_ERROR(ENOTSUP);
 		break;
 	}
 
 	/*
 	 * Every case above has either issued child IO or assigned
 	 * zio->io_error, so hand the zio to zio_execute().
 	 */
 	zio_execute(zio);
 }
 
 /*
  * IO done hook for the distributed spare.  Nothing is done here; child
  * results are recorded by vdev_draid_spare_child_done() instead.
  */
 static void
 vdev_draid_spare_io_done(zio_t *zio)
 {
 	/* The argument is intentionally unused. */
 	(void) zio;
 }
 
 /*
  * Lookup the full spare config in spa->spa_spares.sav_config and
  * return the top_guid and spare_id for the named spare.
  *
  * The spare is identified by the ZPOOL_CONFIG_PATH string in nv and is
  * matched against the distributed spare entries of the pool's spare list.
  *
  * Returns 0 and fills in *top_guidp and *spare_idp on success.  Returns
  * ENOENT when there is no spare list or no matching distributed spare,
  * and EINVAL when nv has no path or the matching entry lacks either the
  * top guid or the spare id.  The outputs are only written on success.
  */
 static int
 vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
     uint64_t *spare_idp)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int error;
 
 	if ((spa->spa_spares.sav_config == NULL) ||
 	    (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) {
 		return (SET_ERROR(ENOENT));
 	}
 
 	const char *spare_name;
 	error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* The index is unsigned to match the type of nspares. */
 	for (uint_t i = 0; i < nspares; i++) {
 		nvlist_t *spare = spares[i];
 		uint64_t top_guid, spare_id;
 		const char *type, *path;
 
 		/* Skip non-distributed spares */
 		error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
 		if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
 			continue;
 
 		/* Skip spares with the wrong name */
 		error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path);
 		if (error != 0 || strcmp(path, spare_name) != 0)
 			continue;
 
 		/* Found the matching spare, both values must be present */
 		error = nvlist_lookup_uint64(spare,
 		    ZPOOL_CONFIG_TOP_GUID, &top_guid);
 		if (error == 0) {
 			error = nvlist_lookup_uint64(spare,
 			    ZPOOL_CONFIG_SPARE_ID, &spare_id);
 		}
 
 		if (error != 0)
 			return (SET_ERROR(EINVAL));
 
 		*top_guidp = top_guid;
 		*spare_idp = spare_id;
 		return (0);
 	}
 
 	return (SET_ERROR(ENOENT));
 }
 
 /*
  * Allocate the private dRAID spare state and fill it in from the nvlist.
  * On success *tsd points at the new state and 0 is returned.  EINVAL is
  * returned, without allocating, when the top guid or the spare id cannot
  * be determined.
  */
 static int
 vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	uint64_t top_guid = 0;
 	uint64_t spare_id;
 
 	/*
 	 * In the normal case check the list of spares stored in the spa
 	 * to lookup the top_guid and spare_id for provided spare config.
 	 * When creating a new pool or adding vdevs the spare list is not
 	 * yet populated and the values are provided in the passed config.
 	 */
 	if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
 		    &spare_id) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	vdev_draid_spare_t *vds = kmem_alloc(sizeof (*vds), KM_SLEEP);
 
 	/* The parent pointer stays NULL until the spare is opened. */
 	vds->vds_draid_vdev = NULL;
 	vds->vds_top_guid = top_guid;
 	vds->vds_spare_id = spare_id;
 
 	*tsd = vds;
 
 	return (0);
 }
 
 /*
  * Free the private dRAID spare state attached to the vdev.
  */
 static void
 vdev_draid_spare_fini(vdev_t *vd)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 	kmem_free(vds, sizeof (*vds));
 }
 
 /*
  * Add the spare's top-level dRAID guid and spare id to the config nvlist.
  */
 static void
 vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	const vdev_draid_spare_t *vds = vd->vdev_tsd;
 	uint64_t top_guid = vds->vds_top_guid;
 	uint64_t spare_id = vds->vds_spare_id;
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, top_guid);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, spare_id);
 }
 
 /*
  * Operations vector for the distributed spare vdev type
  * (VDEV_TYPE_DRAID_SPARE).  It is registered as a leaf vdev.  Hooks left
  * NULL are not provided by this vdev type.
  */
 vdev_ops_t vdev_draid_spare_ops = {
 	.vdev_op_init = vdev_draid_spare_init,
 	.vdev_op_fini = vdev_draid_spare_fini,
 	.vdev_op_open = vdev_draid_spare_open,
 	.vdev_op_close = vdev_draid_spare_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_draid_spare_io_start,
 	.vdev_op_io_done = vdev_draid_spare_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_draid_spare_config_generate,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DRAID_SPARE,
 	.vdev_op_leaf = B_TRUE,
 };
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 481af2ba826b..2f43c4aa41b8 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -1,1598 +1,1595 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012,2021 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 
 #include <sys/fm/fs/zfs.h>
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
 #include <sys/sysevent.h>
 
 /*
  * This general routine is responsible for generating all the different ZFS
  * ereports.  The payload is dependent on the class, and which arguments are
  * supplied to the function:
  *
  * 	EREPORT			POOL	VDEV	IO
  * 	block			X	X	X
  * 	data			X		X
  * 	device			X	X
  * 	pool			X
  *
  * If we are in a loading state, all errors are chained together by the same
  * SPA-wide ENA (Error Numeric Association).
  *
  * For isolated I/O requests, we get the ENA from the zio_t. The propagation
  * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
  * to chain together all ereports associated with a logical piece of data.  For
  * read I/Os, there  are basically three 'types' of I/O, which form a roughly
  * layered diagram:
  *
  * 	+---------------+
  * 	| Aggregate I/O |	No associated logical data or device
  * 	+---------------+
  *              |
  *              V
  * 	+---------------+	Reads associated with a piece of logical data.
  * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
  * 	+---------------+       mirrors, gang blocks, retries, etc.
  *              |
  *              V
  * 	+---------------+	Reads associated with a particular device, but
  * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
  * 	+---------------+	and I/O aggregation.
  *
  * Note that 'physical I/O' here is not the same terminology as used in the rest
  * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
  * blockpointer.  But I/O with no associated block pointer can still be related
  * to a logical piece of data (i.e. RAID-Z requests).
  *
  * Purely physical I/O always have unique ENAs.  They are not related to a
  * particular piece of logical data, and therefore cannot be chained together.
  * We still generate an ereport, but the DE doesn't correlate it with any
  * logical piece of data.  When such an I/O fails, the delegated I/O requests
  * will issue a retry, which will trigger the 'real' ereport with the correct
  * ENA.
  *
  * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
  * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
  * then inherit this pointer, so that when it is first set subsequent failures
  * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
  * this pointer is set to NULL, and no ereport will be generated (since it
  * doesn't actually correspond to any particular device or piece of data,
  * and the caller will always retry without caching or queueing anyway).
  *
  * For checksum errors, we want to include more information about the actual
  * error which occurs.  Accordingly, we build an ereport when the error is
  * noticed, but instead of sending it in immediately, we hang it off of the
  * io_cksum_report field of the logical IO.  When the logical IO completes
  * (successfully or not), zfs_ereport_finish_checksum() is called with the
  * good and bad versions of the buffer (if available), and we annotate the
  * ereport with information about the differences.
  */
 
 #ifdef _KERNEL
 /*
  * Duplicate ereport Detection
  *
  * Some ereports are retained momentarily for detecting duplicates.  These
  * are kept in a recent_events_node_t in both a time-ordered list and an AVL
  * tree of recent unique ereports.
  *
  * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
  * task is used to purge stale entries.
  */
 /* Retained ereports; entries are inserted at the head when (re)seen. */
 static list_t recent_events_list;
 /* The same entries, searched with avl_find() to detect duplicates. */
 static avl_tree_t recent_events_tree;
 /* Held while the list, the tree or the cleaner id is accessed. */
 static kmutex_t recent_events_lock;
 /* Id of the pending cleaner dispatch, 0 when none is scheduled. */
 static taskqid_t recent_events_cleaner_tqid;
 
 /*
  * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
  *
  * This setting can be changed dynamically and setting it to zero
  * disables duplicate detection.
  */
 static unsigned int zfs_zevent_retain_max = 2000;
 
 /*
  * The lifespan for a recent ereport entry. The default of 15 minutes is
  * intended to outlive the zfs diagnosis engine's threshold of 10 errors
  * over a period of 10 minutes.
  */
 static unsigned int zfs_zevent_retain_expire_secs = 900;
 
 /*
  * Ereport subclasses which are candidates for duplicate detection.
  * zfs_ereport_is_duplicate() selects the value from the matching
  * FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA or FM_EREPORT_ZFS_CHECKSUM
  * subclass string.
  */
 typedef enum zfs_subclass {
 	ZSC_IO,
 	ZSC_DATA,
 	ZSC_CHECKSUM
 } zfs_subclass_t;
 
 /*
  * One retained ereport.  The criteria fields form the key compared by
  * recent_events_compare(); the internal state links the node into
  * recent_events_tree and recent_events_list.
  */
 typedef struct {
 	/* common criteria */
 	uint64_t	re_pool_guid;
 	uint64_t	re_vdev_guid;
 	int		re_io_error;
 	uint64_t	re_io_size;
 	uint64_t	re_io_offset;
 	zfs_subclass_t	re_subclass;
 	zio_priority_t	re_io_priority;
 
 	/* logical zio criteria (optional) */
 	zbookmark_phys_t re_io_bookmark;
 
 	/* internal state */
 	avl_node_t	re_tree_link;
 	list_node_t	re_list_link;
 	/* gethrtime() value from when the ereport was last seen */
 	uint64_t	re_timestamp;
 } recent_events_node_t;
 
 /*
  * AVL comparator for recent_events_node_t.  Two nodes compare equal, and
  * are therefore duplicates, only when every criteria field matches.  The
  * order in which the fields are examined is otherwise arbitrary.
  */
 static int
 recent_events_compare(const void *a, const void *b)
 {
 	const recent_events_node_t *lhs = a;
 	const recent_events_node_t *rhs = b;
 	const zbookmark_phys_t *lzb = &lhs->re_io_bookmark;
 	const zbookmark_phys_t *rzb = &rhs->re_io_bookmark;
 	int cmp;
 
 	/* common criteria */
 	cmp = TREE_CMP(lhs->re_subclass, rhs->re_subclass);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lhs->re_pool_guid, rhs->re_pool_guid);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lhs->re_vdev_guid, rhs->re_vdev_guid);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lhs->re_io_error, rhs->re_io_error);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lhs->re_io_priority, rhs->re_io_priority);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lhs->re_io_size, rhs->re_io_size);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lhs->re_io_offset, rhs->re_io_offset);
 	if (cmp != 0)
 		return (cmp);
 
 	/* logical zio criteria */
 	cmp = TREE_CMP(lzb->zb_objset, rzb->zb_objset);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lzb->zb_object, rzb->zb_object);
 	if (cmp != 0)
 		return (cmp);
 
 	cmp = TREE_CMP(lzb->zb_level, rzb->zb_level);
 	if (cmp != 0)
 		return (cmp);
 
 	/* Last criterion, its result is the overall result. */
 	return (TREE_CMP(lzb->zb_blkid, rzb->zb_blkid));
 }
 
 /*
  * workaround: vdev properties don't have inheritance
  *
  * Starting at vd and walking up through vdev_parent, return the first
  * value of the property which differs from the property's default.  If
  * no vdev on the way to the root overrides it the default is returned.
  * Properties not handled below always yield the default.
  */
 static uint64_t
 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
 {
 	const uint64_t propdef = vdev_prop_default_numeric(prop);
 
 	for (vdev_t *cur = vd; ; cur = cur->vdev_parent) {
 		uint64_t propval;
 
 		switch (prop) {
 		case VDEV_PROP_CHECKSUM_N:
 			propval = cur->vdev_checksum_n;
 			break;
 		case VDEV_PROP_CHECKSUM_T:
 			propval = cur->vdev_checksum_t;
 			break;
 		case VDEV_PROP_IO_N:
 			propval = cur->vdev_io_n;
 			break;
 		case VDEV_PROP_IO_T:
 			propval = cur->vdev_io_t;
 			break;
 		case VDEV_PROP_SLOW_IO_N:
 			propval = cur->vdev_slow_io_n;
 			break;
 		case VDEV_PROP_SLOW_IO_T:
 			propval = cur->vdev_slow_io_t;
 			break;
 		default:
 			propval = propdef;
 			break;
 		}
 
 		/* An explicit setting on this vdev wins. */
 		if (propval != propdef)
 			return (propval);
 
 		/* Reached the root without finding an override. */
 		if (cur->vdev_parent == NULL)
 			return (propdef);
 	}
 }
 
 static void zfs_ereport_schedule_cleaner(void);
 
 /*
  * Background task which purges stale recent event nodes.  Entries are
  * examined from the tail of the list and freed until one is found which
  * is no older than zfs_zevent_retain_expire_secs.  The task reschedules
  * itself while entries remain.
  */
 static void
 zfs_ereport_cleaner(void *arg)
 {
 	const uint64_t now = gethrtime();
 
 	mutex_enter(&recent_events_lock);
 
 	for (;;) {
 		recent_events_node_t *oldest = list_tail(&recent_events_list);
 
 		if (oldest == NULL)
 			break;
 
 		uint64_t age = NSEC2SEC(now - oldest->re_timestamp);
 
 		/* Stop at the first entry which has not expired yet. */
 		if (age <= zfs_zevent_retain_expire_secs)
 			break;
 
 		/* remove expired node */
 		avl_remove(&recent_events_tree, oldest);
 		list_remove(&recent_events_list, oldest);
 		kmem_free(oldest, sizeof (*oldest));
 	}
 
 	/* Restart the cleaner if more entries remain */
 	recent_events_cleaner_tqid = 0;
 	if (!list_is_empty(&recent_events_list))
 		zfs_ereport_schedule_cleaner();
 
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Dispatch zfs_ereport_cleaner() to run one second after the retention
  * period has elapsed and remember the dispatch id.  The caller must hold
  * recent_events_lock.
  */
 static void
 zfs_ereport_schedule_cleaner(void)
 {
 	const uint64_t delay_ns = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
 
 	ASSERT(MUTEX_HELD(&recent_events_lock));
 
 	recent_events_cleaner_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    zfs_ereport_cleaner, NULL, TQ_SLEEP,
 	    ddi_get_lbolt() + NSEC_TO_TICK(delay_ns));
 }
 
 /*
  * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL
  *
  * Whichever guid is not being matched on is set to 0 for the comparison.
  */
 void
 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 {
 	ASSERT(vd != NULL || spa != NULL);
 
 	uint64_t vdev_guid = (vd == NULL) ? 0 : vd->vdev_guid;
 	uint64_t pool_guid = (vd == NULL) ? spa_guid(spa) : 0;
 
 	mutex_enter(&recent_events_lock);
 
 	recent_events_node_t *entry, *next;
 
 	for (entry = list_head(&recent_events_list); entry != NULL;
 	    entry = next) {
 		/* Fetch the successor first since entry may be freed. */
 		next = list_next(&recent_events_list, entry);
 
 		if (entry->re_vdev_guid != vdev_guid &&
 		    entry->re_pool_guid != pool_guid)
 			continue;
 
 		avl_remove(&recent_events_tree, entry);
 		list_remove(&recent_events_list, entry);
 		kmem_free(entry, sizeof (*entry));
 	}
 
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Check if an ereport would be a duplicate of one recently posted.
  *
  * An ereport is considered a duplicate if the set of criteria in
  * recent_events_node_t all match.
  *
  * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
  * are candidates for duplicate checking.
  *
  * Returns B_TRUE when a matching entry was last seen no more than
  * zfs_zevent_retain_expire_secs ago.  Otherwise returns B_FALSE; when no
  * matching entry exists the ereport is recorded for future checks.
  * B_FALSE is also returned when vd or zio is NULL or when duplicate
  * detection is disabled (zfs_zevent_retain_max == 0).
  */
 static boolean_t
 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
 {
 	recent_events_node_t search = {0}, *entry;
 
 	if (vd == NULL || zio == NULL)
 		return (B_FALSE);
 
 	if (zfs_zevent_retain_max == 0)
 		return (B_FALSE);
 
 	if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
 		search.re_subclass = ZSC_IO;
 	else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
 		search.re_subclass = ZSC_DATA;
 	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
 		search.re_subclass = ZSC_CHECKSUM;
 	else
 		return (B_FALSE);
 
 	search.re_pool_guid = spa_guid(spa);
 	search.re_vdev_guid = vd->vdev_guid;
 	search.re_io_error = zio->io_error;
 	search.re_io_priority = zio->io_priority;
 	/* if size is supplied use it over what's in zio */
 	if (size) {
 		search.re_io_size = size;
 		search.re_io_offset = offset;
 	} else {
 		search.re_io_size = zio->io_size;
 		search.re_io_offset = zio->io_offset;
 	}
 
 	/* grab optional logical zio criteria */
 	if (zb != NULL) {
 		search.re_io_bookmark.zb_objset = zb->zb_objset;
 		search.re_io_bookmark.zb_object = zb->zb_object;
 		search.re_io_bookmark.zb_level = zb->zb_level;
 		search.re_io_bookmark.zb_blkid = zb->zb_blkid;
 	}
 
 	uint64_t now = gethrtime();
 
 	mutex_enter(&recent_events_lock);
 
 	/* check if we have seen this one recently */
 	entry = avl_find(&recent_events_tree, &search, NULL);
 	if (entry != NULL) {
 		/* Age must be computed before the timestamp is refreshed. */
 		uint64_t age = NSEC2SEC(now - entry->re_timestamp);
 
 		/*
 		 * There is still an active cleaner (since we're here).
 		 * Reset the last seen time for this duplicate entry
 		 * so that its lifespan gets extended.
 		 */
 		list_remove(&recent_events_list, entry);
 		list_insert_head(&recent_events_list, entry);
 		entry->re_timestamp = now;
 
 		zfs_zevent_track_duplicate();
 		mutex_exit(&recent_events_lock);
 
 		return (age <= zfs_zevent_retain_expire_secs);
 	}
 
 	if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
 		/* recycle oldest node */
 		entry = list_tail(&recent_events_list);
 		ASSERT(entry != NULL);
 		list_remove(&recent_events_list, entry);
 		avl_remove(&recent_events_tree, entry);
 	} else {
 		entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
 	}
 
 	/* record this as a recent ereport */
 	*entry = search;
 	avl_add(&recent_events_tree, entry);
 	list_insert_head(&recent_events_list, entry);
 	entry->re_timestamp = now;
 
 	/* Start a cleaner if not already scheduled */
 	if (recent_events_cleaner_tqid == 0)
 		zfs_ereport_schedule_cleaner();
 
 	mutex_exit(&recent_events_lock);
 	return (B_FALSE);
 }
 
 /*
  * Destroy the ereport and detector nvlists.  Either argument may be NULL
  * in which case it is skipped.
  */
 void
 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
 {
 	if (nvl != NULL)
 		fm_nvlist_destroy(nvl, FM_NVA_FREE);
 
 	if (detector != NULL)
 		fm_nvlist_destroy(detector, FM_NVA_FREE);
 }
 
 /*
  * We want to rate limit ZIO delay, deadman, and checksum events so as to not
  * flood zevent consumers when a disk is acting up.  Other subclasses are
  * never rate limited here.
  *
  * Returns 1 if we're ratelimiting, 0 if not.  The dropped ereport counter
  * is incremented whenever 1 is returned.
  */
 static int
 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
 {
 	int limited;
 
 	/*
 	 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
 	 * are.  Invert it to get our return value.
 	 */
 	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0)
 		limited = !zfs_ratelimit(&vd->vdev_delay_rl);
 	else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0)
 		limited = !zfs_ratelimit(&vd->vdev_deadman_rl);
 	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
 		limited = !zfs_ratelimit(&vd->vdev_checksum_rl);
 	else
 		limited = 0;
 
 	if (!limited)
 		return (0);
 
 	/* We're rate limiting */
 	fm_erpt_dropped_increment();
 
 	return (limited);
 }
 
 /*
  * Return B_TRUE if the event actually posted, B_FALSE if not.
  */
 static boolean_t
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
     const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     zio_t *zio, uint64_t stateoroffset, uint64_t size)
 {
 	nvlist_t *ereport, *detector;
 
 	uint64_t ena;
 	char class[64];
 
 	if ((ereport = fm_nvlist_create(NULL)) == NULL)
 		return (B_FALSE);
 
 	if ((detector = fm_nvlist_create(NULL)) == NULL) {
 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
 		return (B_FALSE);
 	}
 
 	/*
 	 * Serialize ereport generation
 	 */
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * Determine the ENA to use for this event.  If we are in a loading
 	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
 	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
 	 */
 	if (spa_load_state(spa) != SPA_LOAD_NONE) {
 		if (spa->spa_ena == 0)
 			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
 		ena = spa->spa_ena;
 	} else if (zio != NULL && zio->io_logical != NULL) {
 		if (zio->io_logical->io_ena == 0)
 			zio->io_logical->io_ena =
 			    fm_ena_generate(0, FM_ENA_FMT1);
 		ena = zio->io_logical->io_ena;
 	} else {
 		ena = fm_ena_generate(0, FM_ENA_FMT1);
 	}
 
 	/*
 	 * Construct the full class, detector, and other standard FMA fields.
 	 */
 	(void) snprintf(class, sizeof (class), "%s.%s",
 	    ZFS_ERROR_CLASS, subclass);
 
 	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
 	    vd != NULL ? vd->vdev_guid : 0);
 
 	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
 
 	/*
 	 * Construct the per-ereport payload, depending on which parameters are
 	 * passed in.
 	 */
 
 	/*
 	 * Generic payload members common to all ereports.
 	 */
 	fm_payload_set(ereport,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
 	    (uint64_t)spa_state(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
 	    (int32_t)spa_load_state(spa), NULL);
 
 	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
 	    DATA_TYPE_STRING,
 	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
 	    FM_EREPORT_FAILMODE_WAIT :
 	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
 	    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
 	    NULL);
 
 	if (vd != NULL) {
 		vdev_t *pvd = vd->vdev_parent;
 		vdev_queue_t *vq = &vd->vdev_queue;
 		vdev_stat_t *vs = &vd->vdev_stat;
 		vdev_t *spare_vd;
 		uint64_t *spare_guids;
 		char **spare_paths;
 		int i, spare_count;
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 		    DATA_TYPE_UINT64, vd->vdev_guid,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
 		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
 		if (vd->vdev_path != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
 			    DATA_TYPE_STRING, vd->vdev_path, NULL);
 		if (vd->vdev_devid != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
 			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
 		if (vd->vdev_fru != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
 			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
 		if (vd->vdev_enc_sysfs_path != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
 		if (vd->vdev_ashift)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
 			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
 
 		if (vq != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
 			    DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
 			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
 		}
 
 		if (vs != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_read_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_write_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
 			    DATA_TYPE_UINT64, vs->vs_slow_ios,
 			    NULL);
 		}
 
 		if (pvd != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
 			    DATA_TYPE_UINT64, pvd->vdev_guid,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
 			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
 			    NULL);
 			if (pvd->vdev_path)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
 				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
 			if (pvd->vdev_devid)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
 				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
 		}
 
 		spare_count = spa->spa_spares.sav_count;
 		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
 		    KM_SLEEP);
 		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
 		    KM_SLEEP);
 
 		for (i = 0; i < spare_count; i++) {
 			spare_vd = spa->spa_spares.sav_vdevs[i];
 			if (spare_vd) {
 				spare_paths[i] = spare_vd->vdev_path;
 				spare_guids[i] = spare_vd->vdev_guid;
 			}
 		}
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
 		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
 		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
 
 		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
 		kmem_free(spare_paths, sizeof (char *) * spare_count);
 	}
 
 	if (zio != NULL) {
 		/*
 		 * Payload common to all I/Os.
 		 */
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
 		    DATA_TYPE_INT32, zio->io_error, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
 		    DATA_TYPE_INT32, zio->io_flags, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
 		    DATA_TYPE_UINT32, zio->io_stage, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
 		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
 		    DATA_TYPE_UINT64, zio->io_delay, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
 		    DATA_TYPE_UINT64, zio->io_timestamp, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
 		    DATA_TYPE_UINT64, zio->io_delta, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
 		    DATA_TYPE_UINT32, zio->io_priority, NULL);
 
 		/*
 		 * If the 'size' parameter is non-zero, it indicates this is a
 		 * RAID-Z or other I/O where the physical offset and length are
 		 * provided for us, instead of within the zio_t.
 		 */
 		if (vd != NULL) {
 			if (size)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 				    DATA_TYPE_UINT64, stateoroffset,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 				    DATA_TYPE_UINT64, size, NULL);
 			else
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 				    DATA_TYPE_UINT64, zio->io_offset,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 				    DATA_TYPE_UINT64, zio->io_size, NULL);
 		}
 	} else if (vd != NULL) {
 		/*
 		 * If we have a vdev but no zio, this is a device fault, and the
 		 * 'stateoroffset' parameter indicates the previous state of the
 		 * vdev.
 		 */
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
 		    DATA_TYPE_UINT64, stateoroffset, NULL);
 	}
 
 	/*
 	 * Payload for I/Os with corresponding logical information.
 	 */
 	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
 		    DATA_TYPE_UINT64, zb->zb_objset,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
 		    DATA_TYPE_UINT64, zb->zb_object,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
 		    DATA_TYPE_INT64, zb->zb_level,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
 		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
 	}
 
 	/*
 	 * Payload for tuning the zed
 	 */
 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
 		uint64_t cksum_n, cksum_t;
 
 		cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
 		if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
 			    DATA_TYPE_UINT64,
 			    cksum_n,
 			    NULL);
 
 		cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
 		if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
 			    DATA_TYPE_UINT64,
 			    cksum_t,
 			    NULL);
 	}
 
 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
 		uint64_t io_n, io_t;
 
 		io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
 		if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
 			    DATA_TYPE_UINT64,
 			    io_n,
 			    NULL);
 
 		io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
 		if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
 			    DATA_TYPE_UINT64,
 			    io_t,
 			    NULL);
 	}
 
 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
 		uint64_t slow_io_n, slow_io_t;
 
 		slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
 		if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
 			    DATA_TYPE_UINT64,
 			    slow_io_n,
 			    NULL);
 
 		slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
 		if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
 			    DATA_TYPE_UINT64,
 			    slow_io_t,
 			    NULL);
 	}
 
 	mutex_exit(&spa->spa_errlist_lock);
 
 	*ereport_out = ereport;
 	*detector_out = detector;
 	return (B_TRUE);
 }
 
 /*
  * if it's <= 128 bytes, save the corruption directly.  The limit is
  * expressed as a count of uint64_t words.
  */
 #define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))
 
 /* Capacity of the per-report range arrays below. */
 #define	MAX_RANGES		16
 
 /*
  * Working state built by annotate_ecksum() while comparing a good and a bad
  * buffer for a checksum ereport.
  */
 typedef struct zfs_ecksum_info {
 	/* inline arrays of bits set and cleared. */
 	uint64_t zei_bits_set[ZFM_MAX_INLINE];
 	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
 
 	/*
 	 * for each range, the number of bits set and cleared.  The Hamming
 	 * distance between the good and bad buffers is the sum of them all.
 	 */
 	uint32_t zei_range_sets[MAX_RANGES];
 	uint32_t zei_range_clears[MAX_RANGES];
 
 	/*
 	 * Differing ranges, each covering [zr_start, zr_end).  They are
 	 * built as uint64_t indices and converted to byte offsets by
 	 * annotate_ecksum() before the ereport is filled in.
 	 */
 	struct zei_ranges {
 		uint32_t	zr_start;
 		uint32_t	zr_end;
 	} zei_ranges[MAX_RANGES];
 
 	/* number of entries of zei_ranges currently in use */
 	size_t	zei_range_count;
 	/* smallest gap seen between two adjacent ranges */
 	uint32_t zei_mingap;
 	/* a new range closer than this to the last one is merged into it */
 	uint32_t zei_allowed_mingap;
 
 } zfs_ecksum_info_t;
 
 /*
  * Add the number of 1 bits in 'value_arg' to '*count'.
  *
  * A population count does not depend on byte order, so the value is
  * counted as-is; no endian conversion is needed.
  */
 static void
 update_bad_bits(uint64_t value_arg, uint32_t *count)
 {
 	uint64_t value = value_arg;
 	uint32_t bits = 0;
 
 	/* Each pass clears the lowest set bit, so we loop once per 1 bit. */
 	while (value != 0) {
 		value &= value - 1;
 		bits++;
 	}
 
 	/* update the count of bits changed */
 	*count += bits;
 }
 
 /*
  * We've now filled up the range array, and need to increase "mingap" and
  * shrink the range list accordingly.  zei_mingap is always the smallest
  * distance between array entries, so we set the new_allowed_gap to be
  * one greater than that.  We then go through the list, joining together
  * any ranges which are closer than the new_allowed_gap.
  *
  * By construction, there will be at least one.  We also update zei_mingap
  * to the new smallest gap, to prepare for our next invocation.
  */
 static void
 zei_shrink_ranges(zfs_ecksum_info_t *eip)
 {
 	uint32_t mingap = UINT32_MAX;
 	uint32_t new_allowed_gap = eip->zei_mingap + 1;
 
 	size_t idx, output;
 	size_t max = eip->zei_range_count;
 
 	struct zei_ranges *r = eip->zei_ranges;
 
 	ASSERT3U(eip->zei_range_count, >, 0);
 	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
 
 	/*
 	 * Compact the array in place.  Each pass of the outer loop takes
 	 * r[idx] as the start of an output range and absorbs every following
 	 * range that lies closer than new_allowed_gap.  The loop runs until
 	 * every input range has been consumed, so a final range that is not
 	 * absorbed by its predecessor is still written out rather than
 	 * silently discarded.  'output' never overtakes 'idx', so entries are
 	 * only overwritten after they have been read.
 	 */
 	output = idx = 0;
 	while (idx < max) {
 		uint32_t start = r[idx].zr_start;
 		uint32_t end = r[idx].zr_end;
 
 		for (idx++; idx < max; idx++) {
 			uint32_t gap = r[idx].zr_start - end;
 
 			if (gap >= new_allowed_gap) {
 				/* r[idx] begins the next output range */
 				if (gap < mingap)
 					mingap = gap;
 				break;
 			}
 			/* too close: fold r[idx] into the current range */
 			end = r[idx].zr_end;
 		}
 		r[output].zr_start = start;
 		r[output].zr_end = end;
 		output++;
 	}
 
 	/*
 	 * The pair that produced zei_mingap is always merged, so the list
 	 * must have become shorter.  This relies on the list holding at
 	 * least two ranges, which is the case for the full-array caller.
 	 */
 	ASSERT3U(output, <, eip->zei_range_count);
 	eip->zei_range_count = output;
 	eip->zei_mingap = mingap;
 	eip->zei_allowed_mingap = new_allowed_gap;
 }
 
 /*
  * Record the range [start, end) at the tail of eip's range list.  A full
  * list is compacted with zei_shrink_ranges() first.  A range that begins
  * closer than zei_allowed_mingap to the current last range is folded into
  * that range instead of taking a new slot.
  */
 static void
 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
 {
 	/* Make room first; shrinking updates zei_range_count. */
 	if (eip->zei_range_count >= MAX_RANGES)
 		zei_shrink_ranges(eip);
 
 	size_t nranges = eip->zei_range_count;
 
 	if (nranges != 0) {
 		struct zei_ranges *last = &eip->zei_ranges[nranges - 1];
 		int gap = start - last->zr_end;
 
 		/* Too close to the previous range: just extend it. */
 		if (gap < eip->zei_allowed_mingap) {
 			last->zr_end = end;
 			return;
 		}
 		if (gap < eip->zei_mingap)
 			eip->zei_mingap = gap;
 	} else {
 		/* The list is empty: reset the gap bookkeeping. */
 		eip->zei_mingap = UINT32_MAX;
 		eip->zei_allowed_mingap = 1;
 	}
 
 	eip->zei_ranges[nranges].zr_start = start;
 	eip->zei_ranges[nranges].zr_end = end;
 	eip->zei_range_count = nranges + 1;
 }
 
 /*
  * Return the combined length of all recorded ranges, i.e. the sum of
  * (zr_end - zr_start) over the zei_range_count entries in use.
  */
 static size_t
 zei_range_total_size(zfs_ecksum_info_t *eip)
 {
 	const struct zei_ranges *cur = eip->zei_ranges;
 	const struct zei_ranges *stop = cur + eip->zei_range_count;
 	size_t total = 0;
 
 	while (cur < stop) {
 		total += (cur->zr_end - cur->zr_start);
 		cur++;
 	}
 
 	return (total);
 }
 
 /*
  * Compare the good and bad copies of a block and add a description of the
  * differences to 'ereport'.
  *
  * ereport           - nvlist that receives the payload
  * info              - optional checksum details (algorithm name, byteswap
  *                     and injected flags); may be NULL
  * goodabd, badabd   - the two buffers to compare; if either is NULL no
  *                     comparison is done
  * size              - length in bytes of both buffers
  * drop_if_identical - when set, identical buffers produce a NULL return
  *
  * Returns a newly allocated zfs_ecksum_info_t which the caller must release
  * with kmem_free(), or NULL when the buffers were identical and
  * drop_if_identical was set (nothing is left allocated in that case).
  */
 static zfs_ecksum_info_t *
 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
     const abd_t *goodabd, const abd_t *badabd, size_t size,
     boolean_t drop_if_identical)
 {
 	const uint64_t *good;
 	const uint64_t *bad;
 
 	/* buffer length in 64-bit words; the comparison works word by word */
 	size_t nui64s = size / sizeof (uint64_t);
 
 	size_t inline_size;
 	int no_inline = 0;
 	size_t idx;
 	size_t range;
 
 	size_t offset = 0;
 	/* first word of the current run of differences, or -1 if none */
 	ssize_t start = -1;
 
 	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
 
 	/* don't do any annotation for injected checksum errors */
 	if (info != NULL && info->zbc_injected)
 		return (eip);
 
 	if (info != NULL && info->zbc_has_cksum) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
 		    DATA_TYPE_STRING,
 		    info->zbc_checksum_name,
 		    NULL);
 
 		if (info->zbc_byteswapped) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
 			    DATA_TYPE_BOOLEAN, 1,
 			    NULL);
 		}
 	}
 
 	/* without both buffers there is nothing to compare */
 	if (badabd == NULL || goodabd == NULL)
 		return (eip);
 
 	ASSERT3U(nui64s, <=, UINT32_MAX);
 	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, <=, UINT32_MAX);
 
 	/*
 	 * Both borrowed buffers are handed back with abd_return_buf() on
 	 * every path that leaves the function from here on.
 	 */
 	good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
 	bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
 
 	/* build up the range list by comparing the two buffers. */
 	for (idx = 0; idx < nui64s; idx++) {
 		if (good[idx] == bad[idx]) {
 			if (start == -1)
 				continue;
 
 			/* a run of differences ended just before idx */
 			zei_add_range(eip, start, idx);
 			start = -1;
 		} else {
 			if (start != -1)
 				continue;
 
 			start = idx;
 		}
 	}
 	/* close a run that extends to the end of the buffer */
 	if (start != -1)
 		zei_add_range(eip, start, idx);
 
 	/* See if it will fit in our inline buffers */
 	inline_size = zei_range_total_size(eip);
 	if (inline_size > ZFM_MAX_INLINE)
 		no_inline = 1;
 
 	/*
 	 * If there is no change and we want to drop if the buffers are
 	 * identical, do so.
 	 */
 	if (inline_size == 0 && drop_if_identical) {
 		kmem_free(eip, sizeof (*eip));
 		abd_return_buf((abd_t *)goodabd, (void *)good, size);
 		abd_return_buf((abd_t *)badabd, (void *)bad, size);
 		return (NULL);
 	}
 
 	/*
 	 * Now walk through the ranges, filling in the details of the
 	 * differences.  Also convert our uint64_t-array offsets to byte
 	 * offsets.
 	 */
 	for (range = 0; range < eip->zei_range_count; range++) {
 		/* these shadow the outer 'start'; units are still words */
 		size_t start = eip->zei_ranges[range].zr_start;
 		size_t end = eip->zei_ranges[range].zr_end;
 
 		for (idx = start; idx < end; idx++) {
 			uint64_t set, cleared;
 
 			// bits set in bad, but not in good
 			set = ((~good[idx]) & bad[idx]);
 			// bits set in good, but not in bad
 			cleared = (good[idx] & (~bad[idx]));
 
 			/* 'offset' packs the words of all ranges back to back */
 			if (!no_inline) {
 				ASSERT3U(offset, <, inline_size);
 				eip->zei_bits_set[offset] = set;
 				eip->zei_bits_cleared[offset] = cleared;
 				offset++;
 			}
 
 			update_bad_bits(set, &eip->zei_range_sets[range]);
 			update_bad_bits(cleared, &eip->zei_range_clears[range]);
 		}
 
 		/* convert to byte offsets */
 		eip->zei_ranges[range].zr_start	*= sizeof (uint64_t);
 		eip->zei_ranges[range].zr_end	*= sizeof (uint64_t);
 	}
 
 	abd_return_buf((abd_t *)goodabd, (void *)good, size);
 	abd_return_buf((abd_t *)badabd, (void *)bad, size);
 
 	/* the remaining word counts become byte counts as well */
 	eip->zei_allowed_mingap	*= sizeof (uint64_t);
 	inline_size		*= sizeof (uint64_t);
 
 	/* fill in ereport */
 	fm_payload_set(ereport,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
 	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
 	    (uint32_t *)eip->zei_ranges,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
 	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
 	    NULL);
 
 	/* the raw set/cleared bits are attached only when they fit inline */
 	if (!no_inline) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_set,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_cleared,
 		    NULL);
 	}
 	return (eip);
 }
 #else
 /*
  * No-op variant of zfs_ereport_clear(), compiled on the #else side of the
  * enclosing conditional.  Both arguments are deliberately ignored.
  */
 void
 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 {
 	(void) spa;
 	(void) vd;
 }
 #endif
 
 /*
  * Make sure our event is still valid for the given zio/vdev/pool.  For example,
  * we don't want to keep logging events for a faulted or missing vdev.
  *
  * subclass - ereport subclass about to be posted
  * spa      - pool the event belongs to
  * vd       - vdev involved, or NULL
  * zio      - I/O involved, or NULL
  *
  * Returns B_FALSE when the event should be suppressed and B_TRUE otherwise.
  * Outside of _KERNEL builds every event is reported as valid.
  */
 boolean_t
 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 {
 #ifdef _KERNEL
 	/*
 	 * If we are doing a spa_tryimport() or in recovery mode,
 	 * ignore errors.
 	 */
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
 	    spa_load_state(spa) == SPA_LOAD_RECOVER)
 		return (B_FALSE);
 
 	/*
 	 * If we are in the middle of opening a pool, and the previous attempt
 	 * failed, don't bother logging any new ereports - we're just going to
 	 * get the same diagnosis anyway.
 	 */
 	if (spa_load_state(spa) != SPA_LOAD_NONE &&
 	    spa->spa_last_open_failed)
 		return (B_FALSE);
 
 	if (zio != NULL) {
-		/*
-		 * If this is not a read or write zio, ignore the error.  This
-		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
-		 */
+		/* If this is not a read or write zio, ignore the error */
 		if (zio->io_type != ZIO_TYPE_READ &&
 		    zio->io_type != ZIO_TYPE_WRITE)
 			return (B_FALSE);
 
 		if (vd != NULL) {
 			/*
 			 * If the vdev has already been marked as failing due
 			 * to a failed probe, then ignore any subsequent I/O
 			 * errors, as the DE will automatically fault the vdev
 			 * on the first such failure.  This also catches cases
 			 * where vdev_remove_wanted is set and the device has
 			 * not yet been asynchronously placed into the REMOVED
 			 * state.
 			 */
 			if (zio->io_vd == vd && !vdev_accessible(vd, zio))
 				return (B_FALSE);
 
 			/*
 			 * Ignore checksum errors for reads from DTL regions of
 			 * leaf vdevs.
 			 */
 			if (zio->io_type == ZIO_TYPE_READ &&
 			    zio->io_error == ECKSUM &&
 			    vd->vdev_ops->vdev_op_leaf &&
 			    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
 				return (B_FALSE);
 		}
 	}
 
 	/*
 	 * For probe failure, we want to avoid posting ereports if we've
 	 * already removed the device in the meantime.
 	 */
 	if (vd != NULL &&
 	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
 	    (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
 		return (B_FALSE);
 
 	/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
 	if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
 	    (zio != NULL) && (!zio->io_timestamp)) {
 		return (B_FALSE);
 	}
 #else
 	(void) subclass, (void) spa, (void) vd, (void) zio;
 #endif
 	return (B_TRUE);
 }
 
 /*
  * Post an ereport for the given subclass
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
 {
 	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 
 	/* Events filtered out as no longer valid are reported as EINVAL. */
 	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
 		return (EINVAL);
 
 	/* No physical offset/size applies here, so both are passed as 0. */
 	if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(subclass, vd))
 		return (SET_ERROR(EBUSY));
 
 	/* 'state' travels in the stateoroffset argument; size is 0. */
 	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
 	    zb, zio, state, 0))
 		return (SET_ERROR(EINVAL));	/* couldn't post event */
 
 	if (ereport == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* Cleanup is handled by the callback function */
 	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #else
 	/* Nothing is posted outside of _KERNEL builds; 0 is returned. */
 	(void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
 	    (void) state;
 #endif
 	return (rc);
 }
 
 /*
  * Prepare a checksum ereport
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
 {
 	zio_cksum_report_t *report;
 
 #ifdef _KERNEL
 	/* Same gatekeeping order as the other post routines in this file. */
 	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 	    offset, length))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 		return (SET_ERROR(EBUSY));
 #else
 	(void) zb, (void) offset;
 #endif
 
 	report = kmem_zalloc(sizeof (*report), KM_SLEEP);
 
 	zio_vsd_default_cksum_report(zio, report);
 
 	/* copy the checksum failure information if it was provided */
 	if (info != NULL) {
 		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
 		memcpy(report->zcr_ckinfo, info, sizeof (*info));
 	}
 
 	/* sector size of the top-level vdev, derived from its ashift */
 	report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
 	report->zcr_align =
 	    vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
 	report->zcr_length = length;
 
 #ifdef _KERNEL
 	(void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
 	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
 
 	/* No ereport was built: discard the report; 0 is still returned. */
 	if (report->zcr_ereport == NULL) {
 		zfs_ereport_free_checksum(report);
 		return (0);
 	}
 #endif
 
 	/*
 	 * Push the report onto the head of the logical zio's
 	 * io_cksum_report list, under spa_errlist_lock.
 	 */
 	mutex_enter(&spa->spa_errlist_lock);
 	report->zcr_next = zio->io_logical->io_cksum_report;
 	zio->io_logical->io_cksum_report = report;
 	mutex_exit(&spa->spa_errlist_lock);
 	return (0);
 }
 
 /*
  * Complete a checksum report started earlier: annotate its ereport with the
  * good/bad buffer comparison and post it.  If the annotation comes back NULL
  * the nvlists are handed directly to the cleanup callback instead of being
  * posted.  Either way the report no longer references them afterwards.
  */
 void
 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
     const abd_t *bad_data, boolean_t drop_if_identical)
 {
 #ifdef _KERNEL
 	zfs_ecksum_info_t *annotation = annotate_ecksum(report->zcr_ereport,
 	    report->zcr_ckinfo, good_data, bad_data, report->zcr_length,
 	    drop_if_identical);
 
 	if (annotation == NULL) {
 		zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
 	} else {
 		zfs_zevent_post(report->zcr_ereport,
 		    report->zcr_detector, zfs_zevent_post_cb);
 	}
 
 	/* ownership of both nvlists has been passed on */
 	report->zcr_detector = NULL;
 	report->zcr_ereport = NULL;
 
 	if (annotation != NULL)
 		kmem_free(annotation, sizeof (*annotation));
 #else
 	(void) report;
 	(void) good_data;
 	(void) bad_data;
 	(void) drop_if_identical;
 #endif
 }
 
 /*
  * Release a checksum report: any ereport/detector nvlists it still holds,
  * the callback data (via the report's own zcr_free hook), the copied
  * checksum info, and finally the report itself.
  */
 void
 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 {
 #ifdef _KERNEL
 	if (rpt->zcr_ereport != NULL) {
 		nvlist_t *ereport = rpt->zcr_ereport;
 		nvlist_t *detector = rpt->zcr_detector;
 
 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
 		fm_nvlist_destroy(detector, FM_NVA_FREE);
 	}
 #endif
 	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
 
 	if (rpt->zcr_ckinfo != NULL) {
 		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
 	}
 
 	kmem_free(rpt, sizeof (*rpt));
 }
 
 /*
  * Post a checksum ereport
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length,
     const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
 {
 	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 	zfs_ecksum_info_t *info;
 
 	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 	    offset, length))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 		return (SET_ERROR(EBUSY));
 
 	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
 	    spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* B_FALSE: report the error even if the buffers turn out identical */
 	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 	    B_FALSE);
 
 	if (info == NULL) {
 		/*
 		 * Nothing will be posted, so release the nvlists through the
 		 * cleanup callback rather than leaking them; this mirrors
 		 * zfs_ereport_finish_checksum().  rc stays 0.
 		 */
 		zfs_zevent_post_cb(ereport, detector);
 		return (rc);
 	}
 
 	/* Cleanup of the nvlists is handled by the callback function */
 	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 	kmem_free(info, sizeof (*info));
 #else
 	(void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
 	    (void) length, (void) good_data, (void) bad_data, (void) zbc;
 #endif
 	return (rc);
 }
 
 /*
  * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
  * change in the pool.  All sysevents are listed in sys/sysevent/eventdefs.h
  * and are designed to be consumed by the ZFS Event Daemon (ZED).  For
  * additional details refer to the zed(8) man page.
  */
 /*
  * Build the nvlist for an event of class "<type>.<ZFS_ERROR_CLASS>.<name>".
  *
  * The list always carries the pool name, guid, state and load state.  When
  * 'vd' is given its guid and state are added, plus whichever of path, devid,
  * fru and enclosure sysfs path are set.  Every pair of 'aux', if given, is
  * copied in as well.
  *
  * Returns the new nvlist, or NULL if the pool is in SPA_LOAD_TRYIMPORT, if
  * the nvlist could not be created, or in non-_KERNEL builds.
  */
 nvlist_t *
 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
     nvlist_t *aux)
 {
 	nvlist_t *resource = NULL;
 #ifdef _KERNEL
 	char class[64];
 
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return (NULL);
 
 	if ((resource = fm_nvlist_create(NULL)) == NULL)
 		return (NULL);
 
 	/* snprintf() bounds the class name to the 64-byte buffer */
 	(void) snprintf(class, sizeof (class), "%s.%s.%s", type,
 	    ZFS_ERROR_CLASS, name);
 	VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
 	VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
 	VERIFY0(nvlist_add_string(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
 	VERIFY0(nvlist_add_uint64(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
 	VERIFY0(nvlist_add_uint64(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
 	VERIFY0(nvlist_add_int32(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
 
 	if (vd) {
 		VERIFY0(nvlist_add_uint64(resource,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
 		VERIFY0(nvlist_add_uint64(resource,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
 		if (vd->vdev_path != NULL)
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
 		if (vd->vdev_devid != NULL)
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
 		if (vd->vdev_fru != NULL)
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
 		if (vd->vdev_enc_sysfs_path != NULL)
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    vd->vdev_enc_sysfs_path));
 	}
 
 	/* also copy any optional payload data */
 	if (aux) {
 		nvpair_t *elem = NULL;
 
 		/* a failure to copy an individual pair is ignored */
 		while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
 			(void) nvlist_add_nvpair(resource, elem);
 	}
 #else
 	(void) spa, (void) vd, (void) type, (void) name, (void) aux;
 #endif
 	return (resource);
 }
 
 /*
  * Shared helper for the zfs_post_*() routines: build the event with
  * zfs_event_create() and, if one was produced, post it.  Does nothing in
  * non-_KERNEL builds.
  */
 static void
 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
     nvlist_t *aux)
 {
 #ifdef _KERNEL
 	nvlist_t *event = zfs_event_create(spa, vd, type, name, aux);
 
 	if (event == NULL)
 		return;
 
 	zfs_zevent_post(event, NULL, zfs_zevent_post_cb);
 #else
 	(void) spa;
 	(void) vd;
 	(void) type;
 	(void) name;
 	(void) aux;
 #endif
 }
 
 /*
  * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
  * has been removed from the system.  This will cause the DE to ignore any
  * recent I/O errors, inferring that they are due to the asynchronous device
  * removal.
  */
 void
 zfs_post_remove(spa_t *spa, vdev_t *vd)
 {
 	/* Posted in the FM_RSRC_CLASS class with no auxiliary payload. */
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
 }
 
 /*
  * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
  * has the 'autoreplace' property set, and therefore any broken vdevs will be
  * handled by higher level logic, and no vdev fault should be generated.
  */
 void
 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
 {
 	/* Posted in the FM_RSRC_CLASS class with no auxiliary payload. */
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
 }
 
 /*
  * The 'resource.fs.zfs.statechange' event is an internal signal that the
  * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
  * cause the retire agent to repair any outstanding fault management cases
  * open because the device was not found (fault.fs.zfs.device).
  */
 /*
  * Post a statechange resource event for 'vd'.  When a vdev is supplied and
  * the auxiliary nvlist could be created, the event also carries the vdev's
  * physical path and enclosure sysfs path (when set) and 'laststate'.  The
  * event is posted even if no auxiliary nvlist is available.
  */
 void
 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
 {
 #ifdef _KERNEL
 	nvlist_t *extras = fm_nvlist_create(NULL);
 
 	/*
 	 * Add optional supplemental keys to payload
 	 */
 	if (extras != NULL && vd != NULL) {
 		if (vd->vdev_physpath != NULL)
 			fnvlist_add_string(extras,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
 			    vd->vdev_physpath);
 
 		if (vd->vdev_enc_sysfs_path != NULL)
 			fnvlist_add_string(extras,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    vd->vdev_enc_sysfs_path);
 
 		fnvlist_add_uint64(extras,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
 	}
 
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
 	    extras);
 
 	if (extras != NULL)
 		fm_nvlist_destroy(extras, FM_NVA_FREE);
 #else
 	(void) spa;
 	(void) vd;
 	(void) laststate;
 #endif
 }
 
 #ifdef _KERNEL
 /*
  * Initialize the recent-events bookkeeping: its lock, the list, and the AVL
  * tree (ordered by recent_events_compare).  Both containers hold
  * recent_events_node_t entries, linked through re_list_link and
  * re_tree_link respectively.
  */
 void
 zfs_ereport_init(void)
 {
 	mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&recent_events_list, sizeof (recent_events_node_t),
 	    offsetof(recent_events_node_t, re_list_link));
 	avl_create(&recent_events_tree,  recent_events_compare,
 	    sizeof (recent_events_node_t), offsetof(recent_events_node_t,
 	    re_tree_link));
 }
 
 /*
  * This 'early' fini needs to run before zfs_fini() which on Linux waits
  * for the system_delay_taskq to drain.
  */
 void
 zfs_ereport_taskq_fini(void)
 {
 	mutex_enter(&recent_events_lock);
 	/* a non-zero id means a task is still registered on the taskq */
 	if (recent_events_cleaner_tqid != 0) {
 		taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
 		/* forget the id so it is not cancelled a second time */
 		recent_events_cleaner_tqid = 0;
 	}
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Tear down the recent-events bookkeeping: free every remaining node, then
  * destroy the tree, the list and the lock.
  */
 void
 zfs_ereport_fini(void)
 {
 	for (;;) {
 		recent_events_node_t *node =
 		    list_remove_head(&recent_events_list);
 
 		if (node == NULL)
 			break;
 
 		/* each node sits in both containers; unlink it from the tree */
 		avl_remove(&recent_events_tree, node);
 		kmem_free(node, sizeof (*node));
 	}
 
 	avl_destroy(&recent_events_tree);
 	list_destroy(&recent_events_list);
 	mutex_destroy(&recent_events_lock);
 }
 
 /*
  * Post a resource event of the given subclass for a snapshot.
  *
  * subclass - event name appended to the resource class
  * spa      - pool the snapshot belongs to
  * name     - snapshot name, carried as FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME
  *
  * If the auxiliary nvlist cannot be created, no event is posted.
  */
 void
 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
 {
 	nvlist_t *aux;
 
 	/*
 	 * fm_nvlist_create() can return NULL (the other callers in this file
 	 * check for it); adding to a NULL list must be avoided.
 	 */
 	aux = fm_nvlist_create(NULL);
 	if (aux == NULL)
 		return;
 
 	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
 
 	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
 	fm_nvlist_destroy(aux, FM_NVA_FREE);
 }
 
 /*
  * Post an event when a zvol is created or removed
  *
  * This is currently only used by macOS, since it uses the event to create
  * symlinks between the volume name (mypool/myvol) and the actual /dev
  * device (/dev/disk3).  For example:
  *
  * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
  *
  * name: The full name of the zvol ("mypool/myvol")
  * dev_name: The full /dev name for the zvol ("/dev/disk3")
  * raw_name: The raw  /dev name for the zvol ("/dev/rdisk3")
  */
 void
 zfs_ereport_zvol_post(const char *subclass, const char *name,
     const char *dev_name, const char *raw_name)
 {
 	nvlist_t *aux;
 	const char *r;
 	spa_t *spa;
 
 	/* Take spa_namespace_lock for the lookup unless we already own it. */
 	boolean_t locked = mutex_owned(&spa_namespace_lock);
 	if (!locked)
 		mutex_enter(&spa_namespace_lock);
 	spa = spa_lookup(name);
 	if (!locked)
 		mutex_exit(&spa_namespace_lock);
 
 	if (spa == NULL)
 		return;
 
 	/*
 	 * fm_nvlist_create() can return NULL (the other callers in this file
 	 * check for it); without the payload there is nothing useful to
 	 * post, so give up.
 	 */
 	aux = fm_nvlist_create(NULL);
 	if (aux == NULL)
 		return;
 
 	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
 	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
 	    raw_name);
 
 	/* The volume is whatever follows the first '/' in the name. */
 	r = strchr(name, '/');
 	if (r && r[1])
 		fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
 
 	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
 	fm_nvlist_destroy(aux, FM_NVA_FREE);
 }
 
 /* Entry points made available outside this file via EXPORT_SYMBOL. */
 EXPORT_SYMBOL(zfs_ereport_post);
 EXPORT_SYMBOL(zfs_ereport_is_valid);
 EXPORT_SYMBOL(zfs_ereport_post_checksum);
 EXPORT_SYMBOL(zfs_post_remove);
 EXPORT_SYMBOL(zfs_post_autoreplace);
 EXPORT_SYMBOL(zfs_post_state_change);
 
 /* Tunables for the recent-zevent records; both are declared ZMOD_RW. */
 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
 	"Maximum recent zevents records to retain for duplicate checking");
 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
 	"Expiration time for recent zevents records");
 #endif /* _KERNEL */
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 1af357c58006..34be54b337fd 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1,4383 +1,4378 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2018 Datto Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/wmsum.h>
 
 /*
  * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
  * calls that change the file system. Each itx has enough information to
  * be able to replay them after a system crash, power loss, or
  * equivalent failure mode. These are stored in memory until either:
  *
  *   1. they are committed to the pool by the DMU transaction group
  *      (txg), at which point they can be discarded; or
  *   2. they are committed to the on-disk ZIL for the dataset being
  *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
  *      requirement).
  *
  * In the event of a crash or power loss, the itxs contained by each
  * dataset's on-disk ZIL will be replayed when that dataset is first
  * instantiated (e.g. if the dataset is a normal filesystem, when it is
  * first mounted).
  *
  * As hinted at above, there is one ZIL per dataset (both the in-memory
  * representation, and the on-disk representation). The on-disk format
  * consists of 3 parts:
  *
  * 	- a single, per-dataset, ZIL header; which points to a chain of
  * 	- zero or more ZIL blocks; each of which contains
  * 	- zero or more ZIL records
  *
  * A ZIL record holds the information necessary to replay a single
  * system call transaction. A ZIL block can hold many ZIL records, and
  * the blocks are chained together, similarly to a singly linked list.
  *
  * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
  * block in the chain, and the ZIL header points to the first block in
  * the chain.
  *
  * Note, there is not a fixed place in the pool to hold these ZIL
  * blocks; they are dynamically allocated and freed as needed from the
  * blocks available on the pool, though they can be preferentially
  * allocated from a dedicated "log" vdev.
  */
 
 /*
  * This controls the amount of time that a ZIL block (lwb) will remain
  * "open" when it isn't "full", and it has a thread waiting for it to be
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
 static uint_t zfs_commit_timeout_pct = 10;
 
 /*
  * Named kstat values for the ZIL; every entry is a KSTAT_DATA_UINT64.
  * The values are filled in by zil_kstat_values_update() from the matching
  * zil_sums_t counters.  See zil.h for more information about these fields.
  */
 static zil_kstat_values_t zil_stats = {
 	{ "zil_commit_count",			KSTAT_DATA_UINT64 },
 	{ "zil_commit_writer_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_count",			KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_alloc",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_alloc",	KSTAT_DATA_UINT64 },
 };
 
 /* Counters that zil_kstats_global_update() copies into zil_stats. */
 static zil_sums_t zil_sums_global;
 /*
  * NOTE(review): presumably the kstat handle backing zil_stats; its setup is
  * not in this part of the file -- confirm.
  */
 static kstat_t *zil_kstats_global;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
 int zil_replay_disable = 0;
 
 /*
- * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
- * the disk(s) by the ZIL after an LWB write has completed. Setting this
- * will cause ZIL corruption on power loss if a volatile out-of-order
- * write cache is enabled.
+ * Disable the flush commands that are normally sent to the disk(s) by the ZIL
+ * after an LWB write has completed. Setting this will cause ZIL corruption on
+ * power loss if a volatile out-of-order write cache is enabled.
  */
 static int zil_nocacheflush = 0;
 
 /*
  * Limit SLOG write size per commit executed with synchronous priority.
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
 static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
 
 /* Cache that lwb_t objects come from (zil_alloc_lwb()/zil_free_lwb()). */
 static kmem_cache_t *zil_lwb_cache;
 /*
  * NOTE(review): by its name, the cache for zil commit waiters; its users are
  * not in this part of the file -- confirm.
  */
 static kmem_cache_t *zil_zcw_cache;
 
 /* Forward declarations for functions defined later in this file. */
 static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
 static itx_t *zil_itx_clone(itx_t *oitx);
 static uint64_t zil_max_waste_space(zilog_t *zilog);
 
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (likely(cmp))
 		return (cmp);
 
 	return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
 }
 
 /*
  * Set up zl_bp_tree as an AVL tree of zil_bp_node_t ordered by
  * zil_bp_compare().
  */
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 
 	avl_create(tree, zil_bp_compare, sizeof (zil_bp_node_t),
 	    offsetof(zil_bp_node_t, zn_node));
 }
 
 /*
  * Free every node held in zl_bp_tree and then destroy the tree itself.
  */
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	void *cookie = NULL;
 
 	for (zil_bp_node_t *node = avl_destroy_nodes(tree, &cookie);
 	    node != NULL; node = avl_destroy_nodes(tree, &cookie)) {
 		kmem_free(node, sizeof (*node));
 	}
 
 	avl_destroy(tree);
 }
 
 /*
  * Record the identity DVA of bp in zl_bp_tree.
  *
  * Returns 0 when the DVA was added or when bp is embedded (nothing is
  * tracked for it), and EEXIST when the DVA is already in the tree.
  */
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	avl_index_t where;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	const dva_t *key = BP_IDENTITY(bp);
 	if (avl_find(tree, key, &where) == NULL) {
 		zil_bp_node_t *node = kmem_alloc(sizeof (*node), KM_SLEEP);
 
 		node->zn_dva = *key;
 		avl_insert(tree, node, where);
 		return (0);
 	}
 
 	return (SET_ERROR(EEXIST));
 }
 
 /*
  * Return zilog->zl_header cast to a plain (non-const) zil_header_t pointer.
  */
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	zil_header_t *zh = (zil_header_t *)zilog->zl_header;
 
 	return (zh);
 }
 
 /*
  * Seed the checksum words of the first block pointer of a log chain: the
  * objset id, a starting sequence number of 1, and two pseudo-random GUID
  * words.
  */
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	zio_cksum_t *seed = &bp->blk_cksum;
 
 	seed->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	seed->zc_word[ZIL_ZC_SEQ] = 1ULL;
 
 	(void) random_get_pseudo_bytes((void *)&seed->zc_word[ZIL_ZC_GUID_0],
 	    sizeof (seed->zc_word[ZIL_ZC_GUID_0]));
 	(void) random_get_pseudo_bytes((void *)&seed->zc_word[ZIL_ZC_GUID_1],
 	    sizeof (seed->zc_word[ZIL_ZC_GUID_1]));
 }
 
 /*
  * kstat update callback for the global ZIL statistics.  Reads refresh
  * zil_stats from zil_sums_global; writes are rejected with EACCES.
  */
 static int
 zil_kstats_global_update(kstat_t *ksp, int rw)
 {
 	zil_kstat_values_t *values = ksp->ks_data;
 
 	ASSERT3P(&zil_stats, ==, values);
 
 	if (rw != KSTAT_WRITE) {
 		zil_kstat_values_update(values, &zil_sums_global);
 		return (0);
 	}
 
 	return (SET_ERROR(EACCES));
 }
 
 /*
  * Read a log block and make sure it's valid.
  *
  * bp is the block to read and decrypt selects a non-raw read.  On success
  * *begin and *end delimit the record area inside the ARC buffer returned in
  * *abuf, and *nbp receives the pointer to the next block in the chain.
  *
  * Returns 0, the error from arc_read(), or ECKSUM when the chain linkage or
  * the used-size field of the block does not validate.  The outputs other
  * than *abuf are only written on success.
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
     blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	zbookmark_phys_t zb;
 	int error;
 
 	/* An unclaimed log is read speculatively, with the scrub flag set. */
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	/* So is a log whose header lacks ZIL_CLAIM_LR_SEQ_VALID. */
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	if (!decrypt)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	/* The bookmark is built from the objset and sequence words of bp. */
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
 	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		uint64_t size = BP_GET_LSIZE(bp);
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 			/*
 			 * ZILOG2 layout: the zil_chain_t sits at the start of
 			 * the buffer, the records follow it, and zc_nused
 			 * counts the chain header as well.
 			 */
 			zil_chain_t *zilc = (*abuf)->b_data;
 			char *lr = (char *)(zilc + 1);
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    zilc->zc_nused < sizeof (*zilc) ||
 			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused - sizeof (*zilc);
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
 			/*
 			 * Other layout: the records start at the beginning of
 			 * the buffer, the zil_chain_t occupies its last bytes,
 			 * and zc_nused counts record bytes only.
 			 */
 			char *lr = (*abuf)->b_data;
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Read the data block referenced by a TX_WRITE log record.
  *
  * When wbuf is non-NULL it receives the block contents (zeros if the block
  * pointer is a hole).  When wbuf is NULL the block is only read, in raw
  * form, to check that it is readable.  Returns 0 or the error from
  * arc_read().
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	const blkptr_t *bp = &lr->lr_blkptr;
 	const boolean_t want_data = (wbuf != NULL);
 
 	/* A hole has nothing to read; hand back zeros if data is wanted. */
 	if (BP_IS_HOLE(bp)) {
 		if (want_data) {
 			memset(wbuf, 0,
 			    MAX(BP_GET_LSIZE(bp), lr->lr_length));
 		}
 		return (0);
 	}
 
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	/*
 	 * A caller that does not want the data is only checking that it
 	 * has not been corrupted, so skip the CPU cost of decompressing
 	 * and decrypting it.
 	 */
 	if (!want_data)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 
 	zbookmark_phys_t zb;
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	int error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 	if (error != 0)
 		return (error);
 
 	if (want_data)
 		memcpy(wbuf, abuf->b_data, arc_buf_size(abuf));
 	arc_buf_destroy(abuf, &abuf);
 
 	return (0);
 }
 
 /*
  * Initialize every wmsum counter in zs with a starting value of 0.
  */
 void
 zil_sums_init(zil_sums_t *zs)
 {
 	wmsum_init(&zs->zil_commit_count, 0);
 	wmsum_init(&zs->zil_commit_writer_count, 0);
 	wmsum_init(&zs->zil_itx_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_bytes, 0);
 	wmsum_init(&zs->zil_itx_copied_count, 0);
 	wmsum_init(&zs->zil_itx_copied_bytes, 0);
 	wmsum_init(&zs->zil_itx_needcopy_count, 0);
 	wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
 }
 
 /*
  * Tear down every wmsum counter in zs; the counterpart of zil_sums_init().
  */
 void
 zil_sums_fini(zil_sums_t *zs)
 {
 	wmsum_fini(&zs->zil_commit_count);
 	wmsum_fini(&zs->zil_commit_writer_count);
 	wmsum_fini(&zs->zil_itx_count);
 	wmsum_fini(&zs->zil_itx_indirect_count);
 	wmsum_fini(&zs->zil_itx_indirect_bytes);
 	wmsum_fini(&zs->zil_itx_copied_count);
 	wmsum_fini(&zs->zil_itx_copied_bytes);
 	wmsum_fini(&zs->zil_itx_needcopy_count);
 	wmsum_fini(&zs->zil_itx_needcopy_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_count);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_write);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_count);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_write);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
 }
 
 /*
  * Refresh the kstat values in zs from the wmsum counters in zil_sums.  Each
  * kstat takes the current value of the counter with the same field name.
  */
 void
 zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
 {
 	/* Copy one counter; the same name is used on both sides. */
 #define	ZIL_KSTAT_SYNC(name)	\
 	zs->name.value.ui64 = wmsum_value(&zil_sums->name)
 
 	ZIL_KSTAT_SYNC(zil_commit_count);
 	ZIL_KSTAT_SYNC(zil_commit_writer_count);
 	ZIL_KSTAT_SYNC(zil_itx_count);
 	ZIL_KSTAT_SYNC(zil_itx_indirect_count);
 	ZIL_KSTAT_SYNC(zil_itx_indirect_bytes);
 	ZIL_KSTAT_SYNC(zil_itx_copied_count);
 	ZIL_KSTAT_SYNC(zil_itx_copied_bytes);
 	ZIL_KSTAT_SYNC(zil_itx_needcopy_count);
 	ZIL_KSTAT_SYNC(zil_itx_needcopy_bytes);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_normal_count);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_normal_bytes);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_normal_write);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_normal_alloc);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_slog_count);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_slog_bytes);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_slog_write);
 	ZIL_KSTAT_SYNC(zil_itx_metaslab_slog_alloc);
 
 #undef	ZIL_KSTAT_SYNC
 }
 
 /*
  * Parse the intent log, calling parse_blk_func for each block pointer in
  * the chain and parse_lr_func for each record within a valid block.
  *
  * arg and txg are passed through to both callbacks; decrypt is passed to
  * zil_read_log_block().
  *
  * Parsing ends at the first hole, at the first block that fails to read or
  * validate, at the first callback error, or -- for a claimed log -- once
  * the claimed block/record sequence numbers are reached.  A record whose
  * header does not fit in the block, or whose lrc_reclen is out of range,
  * ends parsing with ECKSUM instead of being walked.
  *
  * The final error (0 if none) is returned and is also stored, together with
  * the highest sequence numbers seen and the block/record counts, in the
  * zl_parse_* fields of zilog.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
     boolean_t decrypt)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk = {{{{0}}}};
 	int error = 0;
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		uint64_t reclen;
 		char *lrp, *end;
 		arc_buf_t *abuf = NULL;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 
 		error = parse_blk_func(zilog, &blk, arg, txg);
 		if (error != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		/* Everything that was claimed has now been visited. */
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
 		    &lrp, &end, &abuf);
 		if (error != 0) {
 			if (abuf)
 				arc_buf_destroy(abuf, &abuf);
 			/* Only a claimed log is expected to read cleanly. */
 			if (claimed) {
 				char name[ZFS_MAX_DATASET_NAME_LEN];
 
 				dmu_objset_name(zilog->zl_os, name);
 
 				cmn_err(CE_WARN, "ZFS read log block error %d, "
 				    "dataset %s, seq 0x%llx\n", error, name,
 				    (u_longlong_t)blk_seq);
 			}
 			break;
 		}
 
 		for (; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			/* lrp < end holds here, so this cannot be negative. */
 			uint64_t remain = end - lrp;
 
 			/*
 			 * The record header must fit in what is left of
 			 * the block before lrc_reclen can be read.
 			 */
 			if (remain < sizeof (lr_t)) {
 				cmn_err(CE_WARN, "zil_parse: lr_t overrun");
 				error = SET_ERROR(ECKSUM);
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 
 			/*
 			 * Validate the record length at run time: a length
 			 * below the header size (including 0) would stall or
 			 * misalign the walk, and one past the end of the
 			 * block would walk out of the buffer.
 			 */
 			reclen = lr->lrc_reclen;
 			if (reclen < sizeof (lr_t) || reclen > remain) {
 				cmn_err(CE_WARN,
 				    "zil_parse: lr_t has an invalid reclen");
 				error = SET_ERROR(ECKSUM);
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 
 			if (lr->lrc_seq > claim_lr_seq) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 
 			error = parse_lr_func(zilog, lr, arg, txg);
 			if (error != 0) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 		arc_buf_destroy(abuf, &abuf);
 	}
 done:
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	zil_bp_tree_fini(zilog);
 
 	return (error);
 }
 
 /*
  * zil_parse() block callback that frees each log block of the chain.  The
  * tx argument is unused.
  */
 static int
 zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	(void) tx;
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * As we call this function from the context of a rewind to a
 	 * checkpoint, each ZIL block whose txg is later than the txg
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
 	if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
 		return (-1);
 
 	/* Free the block only the first time its DVA is seen. */
 	if (zil_bp_tree_add(zilog, bp) == 0)
 		zio_free(zilog->zl_spa, first_txg, bp);
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback that ignores the record and reports success.
  */
 static int
 zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	(void) zilog;
 	(void) lrc;
 	(void) tx;
 	(void) first_txg;
 
 	return (0);
 }
 
 /*
  * Claim a log block if it is not already committed and not already claimed.
  * If tx == NULL, just verify that the block is claimable.
  *
  * Returns 0 when there is nothing to claim, otherwise the result of waiting
  * on the claim zio.
  */
 static int
 zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	/* Holes and blocks born before first_txg need no claim. */
 	if (BP_IS_HOLE(bp))
 		return (0);
 	if (BP_GET_LOGICAL_BIRTH(bp) < first_txg)
 		return (0);
 
 	/* A DVA that is already in the tree has been handled before. */
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	/* A txg of 0 is passed when there is no tx (verification only). */
 	uint64_t claim_txg = (tx == NULL) ? 0 : first_txg;
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa, claim_txg, bp,
 	    spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
 /*
  * Claim the data block referenced by a TX_WRITE record.  Returns 0, the
  * error from reading the data block, or the result of
  * zil_claim_log_block().
  */
 static int
 zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	int error = 0;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
 	if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
 		error = zil_read_log_data(zilog, lr, NULL);
 
 	if (error == 0)
 		error = zil_claim_log_block(zilog, bp, tx, first_txg);
 
 	return (error);
 }
 
 /*
  * Claim the blocks referenced by a TX_CLONE_RANGE record by adding each of
  * them as a pending BRT entry.  Nothing is done when tx is NULL.
  *
  * Returns 0, or ENOENT if any referenced block has a birth txg at or after
  * first_txg; in that case no block of the record is added.
  */
 static int
 zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	spa_t *spa = zilog->zl_spa;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
 	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL)
 		return (0);
 
 	/*
 	 * XXX: Do we need to byteswap lr?
 	 */
 
 	/* First pass: check every block before any of them is referenced. */
 	for (uint_t i = 0; i < lr->lr_nbps; i++) {
 		const blkptr_t *bp = &lr->lr_bps[i];
 
 		/*
 		 * When data is embedded into the BP there is no need to create
 		 * BRT entry as there is no data block.  Just copy the BP as it
 		 * contains the data.
 		 */
 		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 			continue;
 
 		/*
 		 * We can not handle block pointers from the future, since they
 		 * are not yet allocated.  It should not normally happen, but
 		 * just in case lets be safe and just stop here now instead of
 		 * corrupting the pool.
 		 */
 		if (BP_GET_BIRTH(bp) >= first_txg)
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * Assert the block is really allocated before we reference it.
 		 */
 		metaslab_check_free(spa, bp);
 	}
 
 	/* Second pass: queue a BRT entry for every real data block. */
 	for (uint_t i = 0; i < lr->lr_nbps; i++) {
 		const blkptr_t *bp = &lr->lr_bps[i];
 
 		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 			continue;
 		brt_pending_add(spa, bp, tx);
 	}
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback used when claiming a log: dispatches
  * TX_WRITE and TX_CLONE_RANGE records to their claim handlers and returns 0
  * for every other record type.
  */
 static int
 zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	if (lrc->lrc_txtype == TX_WRITE)
 		return (zil_claim_write(zilog, lrc, tx, first_txg));
 
 	if (lrc->lrc_txtype == TX_CLONE_RANGE)
 		return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
 
 	return (0);
 }
 
 /*
  * zil_parse() block callback that frees bp in the txg of tx.  claim_txg is
  * unused.  Always returns 0.
  */
 static int
 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t claim_txg)
 {
 	(void) claim_txg;
 
 	uint64_t free_txg = dmu_tx_get_txg(tx);
 	zio_free(zilog->zl_spa, free_txg, bp);
 
 	return (0);
 }
 
 /*
  * Free the data block referenced by a TX_WRITE record if it was previously
  * claimed.  Always returns 0.
  */
 static int
 zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/* Blocks born before claim_txg are left alone. */
 	if (BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
 		return (0);
 
 	/* A DVA that is already in the tree has been handled before. */
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
 	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 /*
  * Free every non-hole block referenced by a TX_CLONE_RANGE record in the
  * txg of tx.  Nothing is done when tx is NULL.  Always returns 0.
  */
 static int
 zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
 	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL)
 		return (0);
 
 	spa_t *spa = zilog->zl_spa;
 
 	for (uint_t i = 0; i < lr->lr_nbps; i++) {
 		const blkptr_t *bp = &lr->lr_bps[i];
 
 		if (BP_IS_HOLE(bp))
 			continue;
 		zio_free(spa, dmu_tx_get_txg(tx), bp);
 	}
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback used when freeing a log.  Does nothing when
  * claim_txg is 0; otherwise dispatches TX_WRITE and TX_CLONE_RANGE records
  * to their free handlers and returns 0 for every other record type.
  */
 static int
 zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t claim_txg)
 {
 	if (claim_txg == 0)
 		return (0);
 
 	if (lrc->lrc_txtype == TX_WRITE)
 		return (zil_free_write(zilog, lrc, tx, claim_txg));
 
 	if (lrc->lrc_txtype == TX_CLONE_RANGE)
 		return (zil_free_clone_range(zilog, lrc, tx));
 
 	return (0);
 }
 
 static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	return (TREE_CMP(v1, v2));
 }
 
 /*
  * Allocate a new lwb and append it to zl_lwb_list.  We may already have a
  * block pointer for it, in which case the size and the layout version come
  * from the block pointer and sz is ignored.  Or we may not yet, in which
  * case they are chosen here (sz as given, layout from the pool version) and
  * the block allocation is made to match later.
  *
  * The new lwb also becomes zl_last_lwb_opened unless state is
  * LWB_STATE_NEW.
  */
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
     uint64_t txg, lwb_state_t state)
 {
 	lwb_t *lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 
 	if (bp == NULL) {
 		BP_ZERO(&lwb->lwb_blk);
 		lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
 		    SPA_VERSION_SLIM_ZIL);
 	} else {
 		lwb->lwb_blk = *bp;
 		lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
 		sz = BP_GET_LSIZE(bp);
 	}
 
 	/*
 	 * A slim lwb counts the chain header as already-used space; the
 	 * other layout takes the header off the usable maximum instead.
 	 */
 	if (!lwb->lwb_slim) {
 		lwb->lwb_nmax = sz - sizeof (zil_chain_t);
 		lwb->lwb_nused = lwb->lwb_nfilled = 0;
 	} else {
 		lwb->lwb_nmax = sz;
 		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
 	}
 	lwb->lwb_sz = sz;
 	lwb->lwb_buf = zio_buf_alloc(sz);
 
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_slog = slog;
 	lwb->lwb_state = state;
 	lwb->lwb_error = 0;
 
 	/* No zios are attached and nothing has been issued yet. */
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
 	lwb->lwb_issued_timestamp = 0;
 	lwb->lwb_issued_txg = 0;
 	lwb->lwb_alloc_txg = txg;
 	lwb->lwb_max_txg = 0;
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
 	if (state != LWB_STATE_NEW)
 		zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 
 	return (lwb);
 }
 
 /*
  * Return an lwb to zil_lwb_cache.  The caller must hold zl_lock.  The
  * assertions below spell out the state the lwb is expected to be in: new or
  * flush-done, no zios attached, no itxs or waiters queued, an empty vdev
  * tree, and txgs no later than the currently syncing txg.
  */
 static void
 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	ASSERT3P(lwb->lwb_child_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 	ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	/* These two are VERIFY, unlike the ASSERTs around them. */
 	VERIFY(list_is_empty(&lwb->lwb_itxs));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
 	 * valid, and prevent use-after-free errors.
 	 */
 	if (zilog->zl_last_lwb_opened == lwb)
 		zilog->zl_last_lwb_opened = NULL;
 
 	kmem_cache_free(zil_lwb_cache, lwb);
 }
 
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
  *
  * Adds the zilog to the pool's dirty list for txg.  When txg_list_add()
  * reports that the zilog was added, an extra reference is taken on the
  * dataset's dbuf and zl_dirty_max_txg is raised to at least txg.  Panics if
  * the dataset is a snapshot.
  */
 static void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (!txg_list_add(&zilog->zl_dmu_pool->dp_dirty_zilogs, zilog, txg))
 		return;
 
 	/* up the hold count until we can be written out */
 	dmu_buf_add_ref(ds->ds_dbuf, zilog);
 
 	zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
 }
 
 /*
  * Determine if the zil is dirty in the specified txg. Callers wanting to
  * ensure that the dirty state does not change must hold the itxg_lock for
  * the specified txg. Holding the lock will ensure that the zil cannot be
  * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
  * state.
  */
 static boolean_t __maybe_unused
 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	/* The dirty list is looked up by txg masked with TXG_MASK. */
 	return (txg_list_member(&dp->dp_dirty_zilogs, zilog,
 	    txg & TXG_MASK) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Determine if the zil is dirty. The zil is considered dirty if it has
  * any pending itx records that have not been cleaned by zil_clean().
  */
 static boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	boolean_t dirty = B_FALSE;
 
 	/* Check each of the TXG_SIZE dirty lists until one has the zilog. */
 	for (int t = 0; !dirty && t < TXG_SIZE; t++) {
 		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
 			dirty = B_TRUE;
 	}
 
 	return (dirty);
 }
 
 /*
  * Called in zil_commit context (zil_process_commit_list()/zil_create()).
  * Activates the SPA_FEATURE_ZILSAXATTR feature on the dataset if the feature
  * is enabled on the pool, the objset is not a zvol, and the feature is not
  * active yet.  dsl_dataset_feature_is_active() is checked first to avoid a
  * txg_wait_synced() on every zil_commit.
  */
 static void
 zil_commit_activate_saxattr_feature(zilog_t *zilog)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 	if (!spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR))
 		return;
 	if (dmu_objset_type(zilog->zl_os) == DMU_OST_ZVOL)
 		return;
 	if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR))
 		return;
 
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(ds, tx);
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/* Flag the feature for activation on the dirtied dataset. */
 	mutex_enter(&ds->ds_lock);
 	ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE;
 	mutex_exit(&ds->ds_lock);
 
 	dmu_tx_commit(tx);
 	txg_wait_synced(zilog->zl_dmu_pool, txg);
 }
 
 /*
  * Create an on-disk intent log.
  *
  * Reuses the block already recorded in the header when there is one of the
  * right endianness; otherwise allocates a first log block and waits for the
  * allocating txg to sync.
  *
  * Returns the lwb for the first log block, or NULL if that block could not
  * be allocated.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 	boolean_t slog = B_FALSE;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(ds, tx);
 		txg = dmu_tx_get_txg(tx);
 
 		/* A wrong-endian block is freed before being replaced. */
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
 		    ZIL_MIN_BLKSZ, &slog);
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block pointer into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		/*
 		 * If "zilsaxattr" feature is enabled on zpool, then activate
 		 * it now when we're creating the ZIL chain. We can't wait with
 		 * this until we write the first xattr log record because we
 		 * need to wait for the feature activation to sync out.
 		 */
 		if (spa_feature_is_enabled(zilog->zl_spa,
 		    SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
 		    DMU_OST_ZVOL) {
 			mutex_enter(&ds->ds_lock);
 			ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 			    (void *)B_TRUE;
 			mutex_exit(&ds->ds_lock);
 		}
 
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	} else {
 		/*
 		 * This branch covers the case where we enable the feature on a
 		 * zpool that has existing ZIL headers.
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 	}
 	IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
 	    dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
 
 	/* On success the header must now point at the block we hold. */
 	ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 	IMPLY(error == 0, lwb != NULL);
 
 	return (lwb);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header. If keep_first
  * is set, then we're replaying a log with no content. We want to keep the
  * first block, however, so that the first synchronous transaction doesn't
  * require a txg_wait_synced() in zil_create(). We don't need to
  * txg_wait_synced() here either when keep_first is set, because both
  * zil_create() and zil_destroy() will wait for any in-progress destroys
  * to complete.
  *
  * Returns B_FALSE when the header has no log block (zh_log is a hole) and
  * B_TRUE otherwise, i.e. when there were any entries to replay.
  */
 boolean_t
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	if (BP_IS_HOLE(&zh->zh_log))
 		return (B_FALSE);
 
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (list_is_empty(&zilog->zl_lwb_list)) {
 		/* No in-memory lwbs: free by walking the on-disk chain. */
 		if (!keep_first)
 			zil_destroy_sync(zilog, tx);
 	} else {
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 
 		/* Drain the lwb list, releasing each buffer and block. */
 		for (lwb_t *lwb = list_remove_head(&zilog->zl_lwb_list);
 		    lwb != NULL;
 		    lwb = list_remove_head(&zilog->zl_lwb_list)) {
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			if (!BP_IS_HOLE(&lwb->lwb_blk))
 				zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
 			zil_free_lwb(zilog, lwb);
 		}
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 
 	return (B_TRUE);
 }
 
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	(void) zil_parse(zilog, zil_free_log_block,
 	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
 }
 
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	zilog_t *zilog;
 	uint64_t first_txg;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
 			    (unsigned long long)ds->ds_object, error);
 		}
 
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
 	first_txg = spa_min_claim_txg(zilog->zl_spa);
 
 	/*
 	 * If the spa_log_state is not set to be cleared, check whether
 	 * the current uberblock is a checkpoint one and if the current
 	 * header has been claimed before moving on.
 	 *
 	 * If the current uberblock is a checkpointed uberblock then
 	 * one of the following scenarios took place:
 	 *
 	 * 1] We are currently rewinding to the checkpoint of the pool.
 	 * 2] We crashed in the middle of a checkpoint rewind but we
 	 *    did manage to write the checkpointed uberblock to the
 	 *    vdev labels, so when we tried to import the pool again
 	 *    the checkpointed uberblock was selected from the import
 	 *    procedure.
 	 *
 	 * In both cases we want to zero out all the ZIL blocks, except
 	 * the ones that have been claimed at the time of the checkpoint
 	 * (their zh_claim_txg != 0). The reason is that these blocks
 	 * may be corrupted since we may have reused their locations on
 	 * disk after we took the checkpoint.
 	 *
 	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
 	 * when we first figure out whether the current uberblock is
 	 * checkpointed or not. Unfortunately, that would discard all
 	 * the logs, including the ones that are claimed, and we would
 	 * leak space.
 	 */
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
 	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 	    zh->zh_claim_txg == 0)) {
 		if (!BP_IS_HOLE(&zh->zh_log)) {
 			(void) zil_parse(zilog, zil_clear_log_block,
 			    zil_noop_log_record, tx, first_txg, B_FALSE);
 		}
 		BP_ZERO(&zh->zh_log);
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, B_FALSE, FTAG);
 		return (0);
 	}
 
 	/*
 	 * If we are not rewinding and opening the pool normally, then
 	 * the min_claim_txg should be equal to the first txg of the pool.
 	 */
 	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg, B_FALSE);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, B_FALSE, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	(void) dp;
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		/*
 		 * Check the first block and determine if it's on a log device
 		 * which may have been removed or faulted prior to loading this
 		 * pool.  If so, there's no point in checking the rest of the
 		 * log as its content should have already been synced to the
 		 * pool.
 		 */
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 
 		/*
 		 * Check whether the current uberblock is checkpointed (e.g.
 		 * we are rewinding) and whether the current header has been
 		 * claimed or not. If it hasn't then skip verifying it. We
 		 * do this because its ZIL blocks may be part of the pool's
 		 * state before the rewind, which is no longer valid.
 		 */
 		zil_header_t *zh = zil_header_in_syncing_context(zilog);
 		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 		    zh->zh_claim_txg == 0)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL :
 	    spa_min_claim_txg(os->os_spa), B_FALSE);
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 /*
  * When an itx is "skipped", this function is used to properly mark the
  * waiter as "done, and signal any thread(s) waiting on it. An itx can
  * be skipped (and not committed to an lwb) for a variety of reasons,
  * one of them being that the itx was committed via spa_sync(), prior to
  * it being committed to an lwb; this can happen if a thread calling
  * zil_commit() is racing with spa_sync().
  */
 static void
 zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
 {
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 	zcw->zcw_done = B_TRUE;
 	cv_broadcast(&zcw->zcw_cv);
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * This function is used when the given waiter is to be linked into an
  * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
  * At this point, the waiter will no longer be referenced by the itx,
  * and instead, will be referenced by the lwb.
  */
 static void
 zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
 {
 	/*
 	 * The lwb_waiters field of the lwb is protected by the zilog's
 	 * zl_issuer_lock while the lwb is open and zl_lock otherwise.
 	 * zl_issuer_lock also protects leaving the open state.
 	 * zcw_lwb setting is protected by zl_issuer_lock and state !=
 	 * flush_done, which transition is protected by zl_lock.
 	 */
 	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
 	IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
 	    MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	list_insert_tail(&lwb->lwb_waiters, zcw);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	zcw->zcw_lwb = lwb;
 }
 
 /*
  * This function is used when zio_alloc_zil() fails to allocate a ZIL
  * block, and the given waiter must be linked to the "nolwb waiters"
  * list inside of zil_process_commit_list().
  */
 static void
 zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
 {
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	list_insert_tail(nolwb, zcw);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 }
 
 void
 zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 {
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	avl_index_t where;
 	zil_vdev_node_t *zv, zvsearch;
 	int ndvas = BP_GET_NDVAS(bp);
 	int i;
 
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	if (zil_nocacheflush)
 		return;
 
 	mutex_enter(&lwb->lwb_vdev_lock);
 	for (i = 0; i < ndvas; i++) {
 		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		if (avl_find(t, &zvsearch, &where) == NULL) {
 			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
 			zv->zv_vdev = zvsearch.zv_vdev;
 			avl_insert(t, zv, where);
 		}
 	}
 	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
 static void
 zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 {
 	avl_tree_t *src = &lwb->lwb_vdev_tree;
 	avl_tree_t *dst = &nlwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/*
 	 * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
 	 * not need the protection of lwb_vdev_lock (it will only be modified
 	 * while holding zilog->zl_lock) as its writes and those of its
 	 * children have all completed.  The younger 'nlwb' may be waiting on
 	 * future writes to additional vdevs.
 	 */
 	mutex_enter(&nlwb->lwb_vdev_lock);
 	/*
 	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
 	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
 	 */
 	while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
 		avl_index_t where;
 
 		if (avl_find(dst, zv, &where) == NULL) {
 			avl_insert(dst, zv, where);
 		} else {
 			kmem_free(zv, sizeof (*zv));
 		}
 	}
 	mutex_exit(&nlwb->lwb_vdev_lock);
 }
 
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 }
 
 /*
- * This function is a called after all vdevs associated with a given lwb
- * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set. Further,
- * all "previous" lwb's will have completed before this function is
- * called; i.e. this function is called for all previous lwbs before
- * it's called for "this" lwb (enforced via zio the dependencies
- * configured in zil_lwb_set_zio_dependency()).
+ * This function is called after all vdevs associated with a given lwb write
+ * have completed their flush command; or as soon as the lwb write completes,
+ * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have
+ * completed before this function is called; i.e. this function is called for
+ * all previous lwbs before it's called for "this" lwb (enforced via the zio
+ * dependencies configured in zil_lwb_set_zio_dependency()).
  *
- * The intention is for this function to be called as soon as the
- * contents of an lwb are considered "stable" on disk, and will survive
- * any sudden loss of power. At this point, any threads waiting for the
- * lwb to reach this state are signalled, and the "waiter" structures
- * are marked "done".
+ * The intention is for this function to be called as soon as the contents of
+ * an lwb are considered "stable" on disk, and will survive any sudden loss of
+ * power. At this point, any threads waiting for the lwb to reach this state
+ * are signalled, and the "waiter" structures are marked "done".
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
 
 	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
 
 	lwb->lwb_root_zio = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
 
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
 		 * for ztest. We only update this value when all the log
 		 * writes succeeded, because ztest wants to ASSERT that
 		 * it got the whole log chain.
 		 */
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
 	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
 
 	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
 		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
 		 * propagated "up" to this specific LWB's root ZIO, in
 		 * order for this error handling to work correctly. This
 		 * includes ZIO errors from either this LWB's write or
 		 * flush, as well as any errors from other dependent LWBs
 		 * (e.g. a root LWB ZIO that might be a child of this LWB).
 		 *
 		 * With that said, it's important to note that LWB flush
 		 * errors are not propagated up to the LWB root ZIO.
 		 * This is incorrect behavior, and results in VDEV flush
 		 * errors not being handled correctly here. See the
 		 * comment above the call to "zio_flush" for details.
 		 */
 
 		zcw->zcw_zio_error = zio->io_error;
 
 		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 		zcw->zcw_done = B_TRUE;
 		cv_broadcast(&zcw->zcw_cv);
 
 		mutex_exit(&zcw->zcw_lock);
 	}
 
 	uint64_t txg = lwb->lwb_issued_txg;
 
 	/* Once we drop the lock, lwb may be freed by zil_sync(). */
 	mutex_exit(&zilog->zl_lock);
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
 	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
 	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
 		cv_broadcast(&zilog->zl_lwb_io_cv);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 }
 
 /*
  * Wait for the completion of all issued write/flush of that txg provided.
  * It guarantees zil_lwb_flush_vdevs_done() is called and returned.
  */
 static void
 zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 {
 	ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa));
 
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
 		cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 #ifdef ZFS_DEBUG
 	mutex_enter(&zilog->zl_lock);
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb_t *lwb = list_head(&zilog->zl_lwb_list);
 	while (lwb != NULL) {
 		if (lwb->lwb_issued_txg <= txg) {
 			ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
 			ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
 			IMPLY(lwb->lwb_issued_txg > 0,
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 		}
 		IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 		    lwb->lwb_state == LWB_STATE_FLUSH_DONE,
 		    lwb->lwb_buf == NULL);
 		lwb = list_next(&zilog->zl_lwb_list, lwb);
 	}
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lock);
 #endif
 }
 
 /*
- * This is called when an lwb's write zio completes. The callback's
- * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
- * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
- * in writing out this specific lwb's data, and in the case that cache
- * flushes have been deferred, vdevs involved in writing the data for
- * previous lwbs. The writes corresponding to all the vdevs in the
- * lwb_vdev_tree will have completed by the time this is called, due to
- * the zio dependencies configured in zil_lwb_set_zio_dependency(),
- * which takes deferred flushes into account. The lwb will be "done"
- * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
- * completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's purpose is
+ * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The
+ * tree will contain the vdevs involved in writing out this specific lwb's
+ * data, and in the case that cache flushes have been deferred, vdevs involved
+ * in writing the data for previous lwbs. The writes corresponding to all the
+ * vdevs in the lwb_vdev_tree will have completed by the time this is called,
+ * due to the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done" once
+ * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion
+ * callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 	lwb_t *nlwb;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
 	abd_free(zio->io_abd);
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	lwb->lwb_buf = NULL;
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 
 	/*
 	 * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not
 	 * called for it yet, and when it will be, it won't be able to make
 	 * its write ZIO a parent this ZIO.  In such case we can not defer
 	 * our flushes or below may be a race between the done callbacks.
 	 */
 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
 		nlwb = NULL;
 	mutex_exit(&zilog->zl_lock);
 
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/*
 	 * If there was an IO error, we're not going to call zio_flush()
 	 * on these vdevs, so we simply empty the tree and free the
 	 * nodes. We avoid calling zio_flush() since there isn't any
 	 * good reason for doing so, after the lwb block failed to be
 	 * written out.
 	 *
 	 * Additionally, we don't perform any further error handling at
 	 * this point (e.g. setting "zcw_zio_error" appropriately), as
 	 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
 	 * we expect any error seen here, to have been propagated to
 	 * that function).
 	 */
 	if (zio->io_error != 0) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
 	}
 
 	/*
-	 * If this lwb does not have any threads waiting for it to
-	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
-	 * command to the vdevs written to by "this" lwb, and instead
-	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
-	 * command for those vdevs. Thus, we merge the vdev tree of
-	 * "this" lwb with the vdev tree of the "next" lwb in the list,
-	 * and assume the "next" lwb will handle flushing the vdevs (or
-	 * deferring the flush(s) again).
+	 * If this lwb does not have any threads waiting for it to complete, we
+	 * want to defer issuing the flush command to the vdevs written to by
+	 * "this" lwb, and instead rely on the "next" lwb to handle the flush
+	 * command for those vdevs. Thus, we merge the vdev tree of "this" lwb
+	 * with the vdev tree of the "next" lwb in the list, and assume the
+	 * "next" lwb will handle flushing the vdevs (or deferring the flush(s)
+	 * again).
 	 *
-	 * This is a useful performance optimization, especially for
-	 * workloads with lots of async write activity and few sync
-	 * write and/or fsync activity, as it has the potential to
-	 * coalesce multiple flush commands to a vdev into one.
+	 * This is a useful performance optimization, especially for workloads
+	 * with lots of async write activity and little sync write and/or fsync
+	 * activity, as it has the potential to coalesce multiple flush
+	 * commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 		return;
 	}
 
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
 		if (vd != NULL) {
 			/*
 			 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
 			 * always used within "zio_flush". This means,
 			 * any errors when flushing the vdev(s), will
 			 * (unfortunately) not be handled correctly,
 			 * since these "zio_flush" errors will not be
 			 * propagated up to "zil_lwb_flush_vdevs_done".
 			 */
 			zio_flush(lwb->lwb_root_zio, vd);
 		}
 		kmem_free(zv, sizeof (*zv));
 	}
 }
 
 /*
  * Build the zio dependency chain, which is used to preserve the ordering of
  * lwb completions that is required by the semantics of the ZIL. Each new lwb
  * zio becomes a parent of the previous lwb zio, such that the new lwb's zio
  * cannot complete until the previous lwb's zio completes.
  *
  * This is required by the semantics of zil_commit(): the commit waiters
  * attached to the lwbs will be woken in the lwb zio's completion callback,
  * so this zio dependency graph ensures the waiters are woken in the correct
  * order (the same order the lwbs were created).
  */
 static void
 zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 
 	lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb);
 	if (prev_lwb == NULL ||
 	    prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE)
 		return;
 
 	/*
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
-	 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+	 * how we can defer the flush commands for each lwb.
 	 *
-	 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
-	 * lwb will rely on this lwb to flush the vdevs written to by that
-	 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
-	 * flush until after the previous lwb's write completes. We ensure
-	 * this ordering by setting the zio parent/child relationship here.
+	 * When the flush commands are deferred, the previous lwb will rely on
+	 * this lwb to flush the vdevs written to by that previous lwb. Thus,
+	 * we need to ensure this lwb doesn't issue the flush until after the
+	 * previous lwb's write completes. We ensure this ordering by setting
+	 * the zio parent/child relationship here.
 	 *
-	 * Without this relationship on the lwb's write zio, it's possible
-	 * for this lwb's write to complete prior to the previous lwb's write
+	 * Without this relationship on the lwb's write zio, it's possible for
+	 * this lwb's write to complete prior to the previous lwb's write
 	 * completing; and thus, the vdevs for the previous lwb would be
 	 * flushed prior to that lwb's data being written to those vdevs (the
 	 * vdevs are flushed in the lwb write zio's completion handler,
 	 * zil_lwb_write_done()).
 	 */
 	if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
 		ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
 		zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
 	} else {
 		ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	}
 
 	ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL);
 	zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio);
 }
 
 
 /*
  * This function's purpose is to "open" an lwb such that it is ready to
  * accept new itxs being committed to it. This function is idempotent; if
  * the passed in lwb has already been opened, it is essentially a no-op.
  */
 static void
 zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (lwb->lwb_state != LWB_STATE_NEW) {
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 		return;
 	}
 
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_OPENED;
 	zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Maximum block size used by the ZIL.  This is picked up when the ZIL is
  * initialized.  Otherwise this should not be used directly; see
  * zl_max_block_size instead.
  */
 static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * Plan splitting of the provided burst size between several blocks.
  */
 static uint_t
 zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
 {
 	uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
 
 	if (size <= md) {
 		/*
 		 * Small bursts are written as-is in one block.
 		 */
 		*minsize = size;
 		return (size);
 	} else if (size > 8 * md) {
 		/*
 		 * Big bursts use maximum blocks.  The first block size
 		 * is hard to predict, but it does not really matter.
 		 */
 		*minsize = 0;
 		return (md);
 	}
 
 	/*
 	 * Medium bursts try to divide evenly to better utilize several SLOG
 	 * VDEVs.  The first block size we predict assuming the worst case of
 	 * maxing out others.  Fall back to using maximum blocks if due to
 	 * large records or wasted space we can not predict anything better.
 	 */
 	uint_t s = size;
 	uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t));
 	uint_t chunk = DIV_ROUND_UP(s, n);
 	uint_t waste = zil_max_waste_space(zilog);
 	waste = MAX(waste, zilog->zl_cur_max);
 	if (chunk <= md - waste) {
 		*minsize = MAX(s - (md - waste) * (n - 1), waste);
 		return (chunk);
 	} else {
 		*minsize = 0;
 		return (md);
 	}
 }
 
 /*
  * Try to predict next block size based on previous history.  Make prediction
  * sufficient for 7 of 8 previous bursts.  Don't try to save if the saving is
  * less then 50%, extra writes may cost more, but we don't want single spike
  * to badly affect our predictions.
  */
 static uint_t
 zil_lwb_predict(zilog_t *zilog)
 {
 	uint_t m, o;
 
 	/* If we are in the middle of a burst, take it into account also. */
 	if (zilog->zl_cur_size > 0) {
 		o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m);
 	} else {
 		o = UINT_MAX;
 		m = 0;
 	}
 
 	/* Find minimum optimal size.  We don't need to go below that. */
 	for (int i = 0; i < ZIL_BURSTS; i++)
 		o = MIN(o, zilog->zl_prev_opt[i]);
 
 	/* Find two biggest minimal first block sizes above the optimal. */
 	uint_t m1 = MAX(m, o), m2 = o;
 	for (int i = 0; i < ZIL_BURSTS; i++) {
 		m = zilog->zl_prev_min[i];
 		if (m >= m1) {
 			m2 = m1;
 			m1 = m;
 		} else if (m > m2) {
 			m2 = m;
 		}
 	}
 
 	/*
 	 * If second minimum size gives 50% saving -- use it.  It may cost us
 	 * one additional write later, but the space saving is just too big.
 	 */
 	return ((m1 < m2 * 2) ? m1 : m2);
 }
 
 /*
  * Close the log block for being issued and allocate the next one.
  * Has to be called under zl_issuer_lock to chain more lwbs.
  */
 static lwb_t *
 zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
 {
 	uint64_t blksz, plan, plan2;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 	lwb->lwb_state = LWB_STATE_CLOSED;
 
 	/*
 	 * If there was an allocation failure then returned NULL will trigger
 	 * zil_commit_writer_stall() at the caller.  This is inherently racy,
 	 * since allocation may not have happened yet.
 	 */
 	if (lwb->lwb_error != 0)
 		return (NULL);
 
 	/*
 	 * Log blocks are pre-allocated.  Here we select the size of the next
 	 * block, based on what's left of this burst and the previous history.
 	 * While we try to only write used part of the block, we can't just
 	 * always allocate the maximum block size because we can exhaust all
 	 * available pool log space, so we try to be reasonable.
 	 */
 	if (zilog->zl_cur_left > 0) {
 		/*
 		 * We are in the middle of a burst and know how much is left.
 		 * But if workload is multi-threaded there may be more soon.
 		 * Try to predict what can it be and plan for the worst case.
 		 */
 		uint_t m;
 		plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
 		if (zilog->zl_parallel) {
 			plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
 			    zil_lwb_predict(zilog), &m);
 			if (plan < plan2)
 				plan = plan2;
 		}
 	} else {
 		/*
 		 * The previous burst is done and we can only predict what
 		 * will come next.
 		 */
 		plan = zil_lwb_predict(zilog);
 	}
 	blksz = plan + sizeof (zil_chain_t);
 	blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
 	blksz = MIN(blksz, zilog->zl_max_block_size);
 	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
 	    uint64_t, plan);
 
 	return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
 }
 
 /*
  * Finalize previously closed block and issue the write zio.
  */
 static void
 zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	spa_t *spa = zilog->zl_spa;
 	zil_chain_t *zilc;
 	boolean_t slog;
 	zbookmark_phys_t zb;
 	zio_priority_t prio;
 	int error;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	/* Actually fill the lwb with the data. */
 	for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
 	    itx = list_next(&lwb->lwb_itxs, itx))
 		zil_lwb_commit(zilog, lwb, itx);
 	lwb->lwb_nused = lwb->lwb_nfilled;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 
 	lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * The lwb is now ready to be issued, but it can be only if it already
 	 * got its block pointer allocated or the allocation has failed.
 	 * Otherwise leave it as-is, relying on some other thread to issue it
 	 * after allocating its block pointer via calling zil_lwb_write_issue()
 	 * for the previous lwb(s) in the chain.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_READY;
 	if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
 		mutex_exit(&zilog->zl_lock);
 		return;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 next_lwb:
 	if (lwb->lwb_slim)
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 	else
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
 	int wsz = lwb->lwb_sz;
 	if (lwb->lwb_error == 0) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
 		if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
 			prio = ZIO_PRIORITY_SYNC_WRITE;
 		else
 			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
 		    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
 		    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 		zil_lwb_add_block(lwb, &lwb->lwb_blk);
 
 		if (lwb->lwb_slim) {
 			/* For Slim ZIL only write what is used. */
 			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
 			    int);
 			ASSERT3S(wsz, <=, lwb->lwb_sz);
 			zio_shrink(lwb->lwb_write_zio, wsz);
 			wsz = lwb->lwb_write_zio->io_size;
 		}
 		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
 		zilc->zc_pad = 0;
 		zilc->zc_nused = lwb->lwb_nused;
 		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 	} else {
 		/*
 		 * We can't write the lwb if there was an allocation failure,
 		 * so create a null zio instead just to maintain dependencies.
 		 */
 		lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
 		    zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
 		lwb->lwb_write_zio->io_error = lwb->lwb_error;
 	}
 	if (lwb->lwb_child_zio)
 		zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
 
 	/*
 	 * Open transaction to allocate the next block pointer.
 	 */
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * Allocate the next block pointer unless we are already in error.
 	 */
 	lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	blkptr_t *bp = &zilc->zc_next_blk;
 	BP_ZERO(bp);
 	error = lwb->lwb_error;
 	if (error == 0) {
 		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
 		    &slog);
 	}
 	if (error == 0) {
 		ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
 		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
 		    ZIO_CHECKSUM_ZILOG);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 	}
 
 	/*
 	 * Reduce TXG open time by incrementing inflight counter and committing
 	 * the transaction.  zil_sync() will wait for it to return to zero.
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb->lwb_issued_txg = txg;
 	zilog->zl_lwb_inflight[txg & TXG_MASK]++;
 	zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	dmu_tx_commit(tx);
 
 	spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
 
 	/*
 	 * We've completed all potentially blocking operations.  Update the
 	 * nlwb and allow it to proceed without possible lock order reversals.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	zil_lwb_set_zio_dependency(zilog, lwb);
 	lwb->lwb_state = LWB_STATE_ISSUED;
 
 	if (nlwb) {
 		nlwb->lwb_blk = *bp;
 		nlwb->lwb_error = error;
 		nlwb->lwb_slog = slog;
 		nlwb->lwb_alloc_txg = txg;
 		if (nlwb->lwb_state != LWB_STATE_READY)
 			nlwb = NULL;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	if (lwb->lwb_slog) {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	} else {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	}
 	lwb->lwb_issued_timestamp = gethrtime();
 	if (lwb->lwb_child_zio)
 		zio_nowait(lwb->lwb_child_zio);
 	zio_nowait(lwb->lwb_write_zio);
 	zio_nowait(lwb->lwb_root_zio);
 
 	/*
 	 * If nlwb was ready when we gave it the block pointer,
 	 * it is on us to issue it and possibly following ones.
 	 */
 	lwb = nlwb;
 	if (lwb)
 		goto next_lwb;
 }
 
 /*
  * Return how many bytes of log data fit into a single log block: the
  * zilog's maximum block size less the zil_chain_t header and less a
  * record header of "hdrsize" bytes.
  */
 uint64_t
 zil_max_log_data(zilog_t *zilog, size_t hdrsize)
 {
 	uint64_t room = zilog->zl_max_block_size;
 
 	room -= sizeof (zil_chain_t);
 	room -= hdrsize;
 	return (room);
 }
 
 /*
  * Upper bound on the log space we agree to leave unused in a block in
  * order to reduce the number of WR_NEED_COPY chunks, and with it the
  * zl_get_data() overhead: one sixteenth (~6%) of the maximum write
  * payload of a log block.
  */
 static inline uint64_t
 zil_max_waste_space(zilog_t *zilog)
 {
 	uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 
 	return (max_data / 16);
 }
 
 /*
  * Largest amount of write data that may be logged as WR_COPIED.  Every
  * WR_COPIED record has to fit into a single log block, so for correctness
  * consumers must fall back to WR_NEED_COPY whenever the whole record would
  * not fit into one maximum sized log block.  Below that limit the choice
  * is a tradeoff: an extra memory copy and possibly worse log space
  * efficiency on one side, an extra range lock/unlock on the other.
  */
 static uint_t zil_maxcopied = 7680;
 
 /*
  * Return the effective WR_COPIED limit for this zilog: zil_maxcopied,
  * capped by what a maximum sized log block can hold after an lr_write_t
  * header.
  */
 uint64_t
 zil_max_copied_data(zilog_t *zilog)
 {
 	uint64_t limit = zil_maxcopied;
 	uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 
 	return (max_data < limit ? max_data : limit);
 }
 
 /*
  * Return the length of the itx's log record as stored in lrc_reclen.
  * A TX_COMMIT itx counts as zero bytes.
  */
 static uint64_t
 zil_itx_record_size(itx_t *itx)
 {
 	lr_t *lr = &itx->itx_lr;
 	uint64_t size = 0;
 
 	if (lr->lrc_txtype != TX_COMMIT) {
 		ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
 		size = lr->lrc_reclen;
 	}
 	return (size);
 }
 
 /*
  * Return the number of data bytes the itx needs in addition to its log
  * record: lr_length rounded up to a multiple of 8 for a TX_WRITE itx in
  * the WR_NEED_COPY state, zero for everything else.
  */
 static uint64_t
 zil_itx_data_size(itx_t *itx)
 {
 	lr_t *lr = &itx->itx_lr;
 
 	if (lr->lrc_txtype != TX_WRITE || itx->itx_wr_state != WR_NEED_COPY)
 		return (0);
 
 	ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t));
 	lr_write_t *lrw = (lr_write_t *)lr;
 	return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), uint64_t));
 }
 
 /*
  * Return the total space the itx accounts for: its log record plus the
  * data size reported by zil_itx_data_size().  A TX_COMMIT itx counts as
  * zero bytes.
  */
 static uint64_t
 zil_itx_full_size(itx_t *itx)
 {
 	lr_t *lr = &itx->itx_lr;
 	uint64_t total = 0;
 
 	if (lr->lrc_txtype != TX_COMMIT) {
 		ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t));
 		total = lr->lrc_reclen + zil_itx_data_size(itx);
 	}
 	return (total);
 }
 
 /*
  * Estimate space needed in the lwb for the itx.  Allocate more lwbs or
  * split the itx as needed, but don't touch the actual transaction data.
  * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
  * to chain more lwbs.
  *
  * Every lwb that gets closed along the way is appended to "ilwbs".
  * Returns the lwb that is current once the itx has been assigned, or
  * NULL if zil_lwb_write_close() did not return a next lwb.
  */
 static lwb_t *
 zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 {
 	itx_t *citx;
 	lr_t *lr, *clr;
 	lr_write_t *lrw;
 	uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 * it's simply used as a place holder on the commit list, and
 	 * provides a mechanism for attaching a "commit waiter" onto the
 	 * correct lwb (such that the waiter can be signalled upon
 	 * completion of that lwb). Thus, we don't process this itx's
 	 * log record if it's a commit itx (these itx's don't have log
 	 * records), and instead link the itx's waiter onto the lwb's
 	 * list of waiters.
 	 *
 	 * For more details, see the comment above zil_commit().
 	 */
 	if (lr->lrc_txtype == TX_COMMIT) {
 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
 		list_insert_tail(&lwb->lwb_itxs, itx);
 		return (lwb);
 	}
 
 	/*
 	 * reclen is the size of the log record itself; dlen is the amount
 	 * of WR_NEED_COPY data still to be placed (zero for anything else).
 	 */
 	reclen = lr->lrc_reclen;
 	ASSERT3U(reclen, >=, sizeof (lr_t));
 	ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
 	dlen = zil_itx_data_size(itx);
 
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < zil_max_waste_space(zilog) &&
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
 		/* Hand the full lwb to the caller and continue in the next. */
 		list_insert_tail(ilwbs, lwb);
 		lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
 		if (lwb == NULL)
 			return (NULL);
 		lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	}
 
 	/*
 	 * There must be enough space in the log block to hold reclen.
 	 * For WR_COPIED, we need to fit the whole record in one block,
 	 * and reclen is the write record header size + the data size.
 	 * For WR_NEED_COPY, we can create multiple records, splitting
 	 * the data into multiple blocks, so we only need to fit one
 	 * word of data per block; in this case reclen is just the header
 	 * size (no data).
 	 */
 	ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 
 	/* dnow is the part of the remaining data that goes into this lwb. */
 	dnow = MIN(dlen, lwb_sp - reclen);
 	if (dlen > dnow) {
 		/*
 		 * Only part of the data fits.  Queue a clone describing the
 		 * first dnow bytes and advance the original itx past them,
 		 * so the next pass handles the remainder.
 		 */
 		ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
 		ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
 		citx = zil_itx_clone(itx);
 		clr = &citx->itx_lr;
 		lr_write_t *clrw = (lr_write_t *)clr;
 		clrw->lr_length = dnow;
 		lrw->lr_offset += dnow;
 		lrw->lr_length -= dnow;
 		zilog->zl_cur_left -= dnow;
 	} else {
 		/* Everything that is left fits; queue the itx itself. */
 		citx = itx;
 		clr = lr;
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
 	clr->lrc_seq = ++zilog->zl_lr_seq;
 
 	/* Reserve the space; the data is filled in later. */
 	lwb->lwb_nused += reclen + dnow;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	zil_lwb_add_txg(lwb, lr->lrc_txg);
 	list_insert_tail(&lwb->lwb_itxs, citx);
 
 	dlen -= dnow;
 	if (dlen > 0)
 		goto cont;
 
 	/*
 	 * NOTE(review): a write whose txg lies beyond the pool's freeze txg
 	 * waits here for that txg to sync; presumably this is part of the
 	 * ziltest (frozen pool) support -- confirm.
 	 */
 	if (lr->lrc_txtype == TX_WRITE &&
 	    lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
 		txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
 
 	return (lwb);
 }
 
 /*
  * Fill the actual transaction data into the lwb, following zil_lwb_assign().
  * Does not require locking.
  *
  * The log record is copied to lwb_buf at offset lwb_nfilled.  For a
  * TX_WRITE that is not WR_COPIED the data (WR_NEED_COPY) or the block
  * pointer (WR_INDIRECT) is obtained through zilog->zl_get_data().
  * TX_COMMIT itxs are ignored.  If zl_get_data() fails, the function
  * returns without advancing lwb_nfilled.
  */
 static void
 zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
 {
 	lr_t *lr, *lrb;
 	lr_write_t *lrw, *lrwb;
 	char *lr_buf;
 	uint64_t dlen, reclen;
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	if (lr->lrc_txtype == TX_COMMIT)
 		return;
 
 	reclen = lr->lrc_reclen;
 	dlen = zil_itx_data_size(itx);
 	/* The space must have been reserved in lwb_nused beforehand. */
 	ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
 
 	lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
 	memcpy(lr_buf, lr, reclen);
 	lrb = (lr_t *)lr_buf;		/* Like lr, but inside lwb. */
 	lrwb = (lr_write_t *)lrb;	/* Like lrw, but inside lwb. */
 
 	ZIL_STAT_BUMP(zilog, zil_itx_count);
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lr->lrc_txtype == TX_WRITE) {
 		if (itx->itx_wr_state == WR_COPIED) {
 			/* Only statistics are updated for WR_COPIED here. */
 			ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
 			ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
 			    lrw->lr_length);
 		} else {
 			char *dbuf;
 			int error;
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				/*
 				 * The data goes right behind the record, and
 				 * the in-lwb record length grows to cover it.
 				 */
 				dbuf = lr_buf + reclen;
 				lrb->lrc_reclen += dlen;
 				ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
 				ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
 				    dlen);
 			} else {
 				ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
 				/* No data buffer is passed for WR_INDIRECT. */
 				dbuf = NULL;
 				ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
 				ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
 				    lrw->lr_length);
 				/* Create the lwb's child zio on first use. */
 				if (lwb->lwb_child_zio == NULL) {
 					lwb->lwb_child_zio = zio_null(NULL,
 					    zilog->zl_spa, NULL, NULL, NULL,
 					    ZIO_FLAG_CANFAIL);
 				}
 			}
 
 			/*
 			 * The "lwb_child_zio" we pass in will become a child of
 			 * "lwb_write_zio", when one is created, so one will be
 			 * a parent of any zio's created by the "zl_get_data".
 			 * This way "lwb_write_zio" will first wait for children
 			 * block pointers before own writing, and then for their
 			 * writing completion before the vdev cache flushing.
 			 */
 			error = zilog->zl_get_data(itx->itx_private,
 			    itx->itx_gen, lrwb, dbuf, lwb,
 			    lwb->lwb_child_zio);
 			if (dbuf != NULL && error == 0) {
 				/* Zero any padding bytes in the last block. */
 				memset((char *)dbuf + lrwb->lr_length, 0,
 				    dlen - lrwb->lr_length);
 			}
 
 			/*
 			 * Typically, the only return values we should see from
 			 * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or
 			 *  EALREADY. However, it is also possible to see other
 			 *  error values such as ENOSPC or EINVAL from
 			 *  dmu_read() -> dnode_hold() -> dnode_hold_impl() or
 			 *  ENXIO as well as a multitude of others from the
 			 *  block layer through dmu_buf_hold() -> dbuf_read()
 			 *  -> zio_wait(), as well as through dmu_read() ->
 			 *  dnode_hold() -> dnode_hold_impl() -> dbuf_read() ->
 			 *  zio_wait(). When these errors happen, we can assume
 			 *  that neither an immediate write nor an indirect
 			 *  write occurred, so we need to fall back to
 			 *  txg_wait_synced(). This is unusual, so we print to
 			 *  dmesg whenever one of these errors occurs.
 			 *
 			 *  The cases below deliberately fall through: an
 			 *  unexpected error warns and then behaves like EIO;
 			 *  EIO waits for the record's txg to sync and then,
 			 *  like ENOENT, EEXIST and EALREADY, returns without
 			 *  accounting the record in lwb_nfilled.
 			 */
 			switch (error) {
 			case 0:
 				break;
 			default:
 				cmn_err(CE_WARN, "zil_lwb_commit() received "
 				    "unexpected error %d from ->zl_get_data()"
 				    ". Falling back to txg_wait_synced().",
 				    error);
 				zfs_fallthrough;
 			case EIO:
 				txg_wait_synced(zilog->zl_dmu_pool,
 				    lr->lrc_txg);
 				zfs_fallthrough;
 			case ENOENT:
 				zfs_fallthrough;
 			case EEXIST:
 				zfs_fallthrough;
 			case EALREADY:
 				return;
 			}
 		}
 	}
 
 	/* The record, and its data if any, is now part of the lwb. */
 	lwb->lwb_nfilled += reclen + dlen;
 	ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
 	ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
 }
 
 itx_t *
 zil_itx_create(uint64_t txtype, size_t olrsize)
 {
 	size_t itxsize, lrsize;
 	itx_t *itx;
 
 	ASSERT3U(olrsize, >=, sizeof (lr_t));
 	lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
 	ASSERT3U(lrsize, >=, olrsize);
 	itxsize = offsetof(itx_t, itx_lr) + lrsize;
 
 	itx = zio_data_buf_alloc(itxsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize);
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	itx->itx_size = itxsize;
 
 	return (itx);
 }
 
 /*
  * Return a newly allocated byte-for-byte copy of "oitx", except that the
  * copy carries no callback and no callback data.
  */
 static itx_t *
 zil_itx_clone(itx_t *oitx)
 {
 	itx_t *copy;
 
 	ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
 	ASSERT3U(oitx->itx_size, ==,
 	    offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
 
 	copy = zio_data_buf_alloc(oitx->itx_size);
 	memcpy(copy, oitx, oitx->itx_size);
 
 	/* The callback stays with the original itx only. */
 	copy->itx_callback_data = NULL;
 	copy->itx_callback = NULL;
 
 	return (copy);
 }
 
 /*
  * Release an itx.  If the itx has a callback, it is invoked with the
  * itx's callback data before the memory is freed.  Commit itxs are
  * expected not to carry a callback.
  */
 void
 zil_itx_destroy(itx_t *itx)
 {
 	ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
 	ASSERT3U(itx->itx_lr.lrc_reclen, ==,
 	    itx->itx_size - offsetof(itx_t, itx_lr));
 	IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
 	IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 
 	const boolean_t notify = (itx->itx_callback != NULL);
 
 	if (notify)
 		itx->itx_callback(itx->itx_callback_data);
 
 	zio_data_buf_free(itx, itx->itx_size);
 }
 
 /*
  * Destroy a detached itxs_t: every itx on its sync list, every per-object
  * async list together with its tree node, and finally the itxs_t itself.
  * The itxs_t has already been detached, so no locks are needed.
  */
 static void
 zil_itxg_clean(void *arg)
 {
 	itxs_t *itxs = arg;
 	list_t *sync_list = &itxs->i_sync_list;
 	avl_tree_t *async_tree = &itxs->i_async_tree;
 	itx_async_node_t *node;
 	void *cookie = NULL;
 	itx_t *itx;
 
 	while ((itx = list_remove_head(sync_list)) != NULL) {
 		/*
 		 * Commit itxs are normally not found here, because
 		 * zil_lwb_assign() hands them over to an lwb.  They can
 		 * still show up due to the following race:
 		 *
 		 *	- a thread calls zil_commit() which assigns the
 		 *	  commit itx to a per-txg i_sync_list
 		 *	- zil_itxg_clean() is called (e.g. via spa_sync())
 		 *	  while the waiter is still on the i_sync_list
 		 *
 		 * Nothing prevents the txg from syncing while the waiter
 		 * sits on the i_sync_list.  Usually spa_sync() is slower
 		 * than zil_commit(), but when zil_commit() ends up in
 		 * txg_wait_synced() (e.g. because zil_create() or
 		 * zil_commit_writer_stall() is called) we get here.  The
 		 * waiter is then marked as skipped.
 		 */
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
 			zil_commit_waiter_skip(itx->itx_private);
 
 		zil_itx_destroy(itx);
 	}
 
 	while ((node = avl_destroy_nodes(async_tree, &cookie)) != NULL) {
 		list_t *async_list = &node->ia_list;
 
 		for (itx = list_remove_head(async_list); itx != NULL;
 		    itx = list_remove_head(async_list)) {
 			/* commit itxs should never be on the async lists. */
 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 		list_destroy(async_list);
 		kmem_free(node, sizeof (itx_async_node_t));
 	}
 	avl_destroy(async_tree);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	return (TREE_CMP(o1, o2));
 }
 
 /*
  * Destroy all async itxs queued for object "oid" (which must be non-zero)
  * in any of the TXG_CONCURRENT_STATES txgs starting right after the last
  * synced one.  The itxs are collected under the itxg locks and destroyed
  * after the locks have been dropped.
  */
 void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	itx_async_node_t *ian, ian_search;
 	avl_index_t where;
 	uint64_t otxg, txg;
 	list_t clean_list;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	/* ziltest support */
 	otxg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			/*
 			 * Locate the object node and take over its list.
 			 */
 			avl_tree_t *t = &itxg->itxg_itxs->i_async_tree;
 
 			ian_search.ia_foid = oid;
 			ian = avl_find(t, &ian_search, &where);
 			if (ian != NULL)
 				list_move_tail(&clean_list, &ian->ia_list);
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 
 	for (itx = list_remove_head(&clean_list); itx != NULL;
 	    itx = list_remove_head(&clean_list)) {
 		/* commit itxs should never be on the async lists. */
 		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
 	}
 	list_destroy(&clean_list);
 }
 
 /*
  * Queue "itx" on the in-memory itx group of the txg that "tx" belongs to
  * (or of ZILTEST_TXG while the pool is frozen) and mark the zilog dirty
  * in the real txg of "tx".
  *
  * Synchronous itxs are appended to the group's sync list.  Asynchronous
  * itxs are kept per object in the group's AVL tree of itx_async_node_t,
  * keyed by the object number taken from the record's lr_foid.
  *
  * A TX_RENAME record (with or without TX_CI) first moves the async itxs
  * of itx->itx_oid to the sync lists, so that the data of the renamed
  * file is committed before the rename.
  */
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", (u_longlong_t)itxg->itxg_txg);
 			clean = itxg->itxg_itxs;
 		}
 		/* Claim the slot for this txg with a fresh, empty itxs_t. */
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
 		    KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
 		itx_async_node_t *ian, ian_search;
 		avl_index_t where;
 
 		/*
 		 * zil_aitx_compare() dereferences both of its arguments as
 		 * itx_async_node_t, so search with a real node as the key
 		 * rather than with a pointer to the bare object number.
 		 */
 		ian_search.ia_foid = foid;
 		ian = avl_find(t, &ian_search, &where);
 		if (ian == NULL) {
 			ian = kmem_alloc(sizeof (itx_async_node_t),
 			    KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	/* The record always carries the real txg, even when frozen. */
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
 	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
 	 * need to be careful to always dirty the ZIL using the "real"
 	 * TXG (not itxg_txg) even when the SPA is frozen.
 	 */
 	zilog_dirty(zilog, dmu_tx_get_txg(tx));
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them. We should only do this after
  * we have written out the uberblocks (i.e. txg has been committed) so that
  * we don't inadvertently clean out in-memory log records that would be
  * required by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *clean_me;
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
 	/* Detach the itxs from the slot while holding its lock. */
 	mutex_enter(&itxg->itxg_lock);
 	clean_me = itxg->itxg_itxs;
 	if (clean_me == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		/* Nothing queued, or the slot is used for ziltest. */
 		mutex_exit(&itxg->itxg_lock);
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT3U(itxg->itxg_txg, !=, 0);
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
 	mutex_exit(&itxg->itxg_lock);
 
 	/*
 	 * Prefer freeing the old itxs from the clean taskq.  Should
 	 * taskq_dispatch() be unable to allocate the resources for that,
 	 * free them in-line instead; this should be rare.  Note, using
 	 * TQ_SLEEP created a bad performance problem.
 	 */
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 	if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
 	    zil_itxg_clean, clean_me, TQ_NOSLEEP) == TASKQID_INVALID)
 		zil_itxg_clean(clean_me);
 }
 
 /*
  * This function will traverse the queue of itxs that need to be
  * committed, and move them onto the ZIL's zl_itx_commit_list.
  *
  * The sizes of the moved itxs are added to zl_cur_size and zl_cur_left,
  * and zl_cur_max tracks the largest record size seen.
  *
  * Must be called with zl_issuer_lock held.  Returns 0, or, if the ZIL
  * is suspended, the highest txg whose sync list was found non-empty
  * (those itxs are left in place and the caller is expected to wait for
  * that txg to sync).
  */
 static uint64_t
 zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg, wtxg = 0;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing. That's okay since we'll
 	 * only commit things in the future.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		/* Skip slots that do not currently hold this txg. */
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If we're adding itx records to the zl_itx_commit_list,
 		 * then the zil better be dirty in this "txg". We can assert
 		 * that here since we're holding the itxg_lock which will
 		 * prevent spa_sync from cleaning it. Once we add the itxs
 		 * to the zl_itx_commit_list we must commit it to disk even
 		 * if it's unnecessary (i.e. the txg was synced).
 		 */
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 		itx_t *itx = NULL;
 		if (unlikely(zilog->zl_suspend > 0)) {
 			/*
 			 * ZIL was just suspended, but we lost the race.
 			 * Allow all earlier itxs to be committed, but ask
 			 * caller to do txg_wait_synced(txg) for any new.
 			 */
 			if (!list_is_empty(sync_list))
 				wtxg = MAX(wtxg, txg);
 		} else {
 			/*
 			 * Remember the first itx being moved, so the
 			 * accounting below can start from it.
 			 */
 			itx = list_head(sync_list);
 			list_move_tail(commit_list, sync_list);
 		}
 
 		mutex_exit(&itxg->itxg_lock);
 
 		/*
 		 * Account for the itxs just moved: walk the commit list
 		 * from the former head of the sync list to its tail.
 		 */
 		while (itx != NULL) {
 			uint64_t s = zil_itx_full_size(itx);
 			zilog->zl_cur_size += s;
 			zilog->zl_cur_left += s;
 			s = zil_itx_record_size(itx);
 			zilog->zl_cur_max = MAX(zilog->zl_cur_max, s);
 			itx = list_next(commit_list, itx);
 		}
 	}
 	return (wtxg);
 }
 
 /*
  * Move async itxs onto the sync lists of their txgs so that they get
  * committed.  With a non-zero "foid" only the itxs of that object are
  * moved; with a "foid" of zero the itxs of all objects are moved.
  */
 void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	itx_async_node_t *ian, ian_search;
 	avl_index_t where;
 	uint64_t otxg, txg;
 
 	/* ziltest support */
 	otxg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 			avl_tree_t *t = &itxg->itxg_itxs->i_async_tree;
 
 			/*
 			 * The async itxs are appended to the end of the
 			 * sync list rather than put at its beginning, to
 			 * ensure the create has happened.
 			 */
 			if (foid == 0) {
 				/* Every object: empty out the whole tree. */
 				void *cookie = NULL;
 
 				while ((ian = avl_destroy_nodes(t, &cookie)) !=
 				    NULL) {
 					list_move_tail(sync_list,
 					    &ian->ia_list);
 					list_destroy(&ian->ia_list);
 					kmem_free(ian,
 					    sizeof (itx_async_node_t));
 				}
 			} else {
 				/* One object: its node stays in the tree. */
 				ian_search.ia_foid = foid;
 				ian = avl_find(t, &ian_search, &where);
 				if (ian != NULL) {
 					list_move_tail(sync_list,
 					    &ian->ia_list);
 				}
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * Prune the commit itxs found at the head of the commit list, stopping
  * at the first itx that is not a commit itx.  Each pruned itx's waiter is
  * either a) linked to the last opened lwb while that lwb is still pending
  * completion, or b) skipped altogether.
  *
  * This is a performance optimization which keeps commit itxs from
  * generating new lwbs when that is unnecessary.
  */
 static void
 zil_prune_commit_list(zilog_t *zilog)
 {
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	for (;;) {
 		itx = list_head(&zilog->zl_itx_commit_list);
 		if (itx == NULL || itx->itx_lr.lrc_txtype != TX_COMMIT)
 			break;
 
 		mutex_enter(&zilog->zl_lock);
 
 		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
 		if (last_lwb != NULL &&
 		    last_lwb->lwb_state != LWB_STATE_FLUSH_DONE) {
 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
 		} else {
 			/*
 			 * Either there were never any itxs for this waiter
 			 * to wait on, or all of them have completed already,
 			 * so the waiter can safely be skipped and marked
 			 * done.
 			 */
 			zil_commit_waiter_skip(itx->itx_private);
 		}
 
 		mutex_exit(&zilog->zl_lock);
 
 		list_remove(&zilog->zl_itx_commit_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 }
 
 /*
  * Stall the ZIL write pipeline: wait, with zl_issuer_lock held, for the
  * pool to sync, after which the zilog's zl_lwb_list is expected to be
  * empty.
  */
 static void
 zil_commit_writer_stall(zilog_t *zilog)
 {
 	/*
 	 * When zio_alloc_zil() fails to allocate the next lwb block on
 	 * disk, we must call txg_wait_synced() to ensure all of the
 	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
 	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
 	 * to zil_process_commit_list()) will have to call zil_create(),
 	 * and start a new ZIL chain.
 	 *
 	 * Since zio_alloc_zil() failed, the lwb that was previously
 	 * issued does not have a pointer to the "next" lwb on disk.
 	 * Thus, if another ZIL writer thread was to allocate the "next"
 	 * on-disk lwb, that block could be leaked in the event of a
 	 * crash (because the previous lwb on-disk would not point to
 	 * it).
 	 *
 	 * We must hold the zilog's zl_issuer_lock while we do this, to
 	 * ensure no new threads enter zil_process_commit_list() until
 	 * all lwb's in the zl_lwb_list have been synced and freed
 	 * (which is achieved via the txg_wait_synced() call).
 	 */
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
 /*
  * Close out the current burst once the commit list is empty and something
  * has been accounted in zl_cur_size: decrement zl_parallel if non-zero,
  * store the zil_lwb_plan() result for the burst in the next slot of the
  * zl_prev_opt/zl_prev_min history, and reset the zl_cur_* counters.
  */
 static void
 zil_burst_done(zilog_t *zilog)
 {
 	if (zilog->zl_cur_size == 0 ||
 	    !list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 
 	if (zilog->zl_parallel != 0)
 		zilog->zl_parallel--;
 
 	/* Advance the rotor; the mask wraps it within ZIL_BURSTS slots. */
 	uint_t slot = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1);
 
 	zilog->zl_prev_rotor = slot;
 	zilog->zl_prev_opt[slot] = zil_lwb_plan(zilog, zilog->zl_cur_size,
 	    &zilog->zl_prev_min[slot]);
 
 	zilog->zl_cur_left = 0;
 	zilog->zl_cur_max = 0;
 	zilog->zl_cur_size = 0;
 }
 
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
  * created lwbs. Additionally, as a new lwb is created, the previous
  * lwb will be issued to the zio layer to be written to disk.
  */
 static void
 zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 {
 	spa_t *spa = zilog->zl_spa;
 	list_t nolwb_itxs;
 	list_t nolwb_waiters;
 	lwb_t *lwb, *plwb;
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/*
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
 	if (list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 
 	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
 		lwb = zil_create(zilog);
 	} else {
 		/*
 		 * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
 		 * have already been created (zl_lwb_list not empty).
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * If the lwb is still opened, it means the workload is really
 		 * multi-threaded and we won the chance of write aggregation.
 		 * If it is not opened yet, but previous lwb is still not
 		 * flushed, it still means the workload is multi-threaded, but
 		 * there was too much time between the commits to aggregate, so
 		 * we try aggregation next times, but without too much hopes.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED) {
 			zilog->zl_parallel = ZIL_BURSTS;
 		} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
 		    != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
 			zilog->zl_parallel = MAX(zilog->zl_parallel,
 			    ZIL_BURSTS / 2);
 		}
 	}
 
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
 
 		ASSERT3U(txg, !=, 0);
 
 		if (lrc->lrc_txtype == TX_COMMIT) {
 			DTRACE_PROBE2(zil__process__commit__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		} else {
 			DTRACE_PROBE2(zil__process__normal__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		}
 
 		boolean_t synced = txg <= spa_last_synced_txg(spa);
 		boolean_t frozen = txg > spa_freeze_txg(spa);
 
 		/*
 		 * If the txg of this itx has already been synced out, then
 		 * we don't need to commit this itx to an lwb. This is
 		 * because the data of this itx will have already been
 		 * written to the main pool. This is inherently racy, and
 		 * it's still ok to commit an itx whose txg has already
 		 * been synced; this will result in a write that's
 		 * unnecessary, but will do no harm.
 		 *
 		 * With that said, we always want to commit TX_COMMIT itxs
 		 * to an lwb, regardless of whether or not that itx's txg
 		 * has been synced out. We do this to ensure any OPENED lwb
 		 * will always have at least one zil_commit_waiter_t linked
 		 * to the lwb.
 		 *
 		 * As a counter-example, if we skipped TX_COMMIT itx's
 		 * whose txg had already been synced, the following
 		 * situation could occur if we happened to be racing with
 		 * spa_sync:
 		 *
 		 * 1. We commit a non-TX_COMMIT itx to an lwb, where the
 		 *    itx's txg is 10 and the last synced txg is 9.
 		 * 2. spa_sync finishes syncing out txg 10.
 		 * 3. We move to the next itx in the list, it's a TX_COMMIT
 		 *    whose txg is 10, so we skip it rather than committing
 		 *    it to the lwb used in (1).
 		 *
 		 * If the itx that is skipped in (3) is the last TX_COMMIT
 		 * itx in the commit list, then it's possible for the lwb
 		 * used in (1) to remain in the OPENED state indefinitely.
 		 *
 		 * To prevent the above scenario from occurring, ensuring
 		 * that once an lwb is OPENED it will transition to ISSUED
 		 * and eventually DONE, we always commit TX_COMMIT itx's to
 		 * an lwb here, even if that itx's txg has already been
 		 * synced.
 		 *
 		 * Finally, if the pool is frozen, we _always_ commit the
 		 * itx.  The point of freezing the pool is to prevent data
 		 * from being written to the main pool via spa_sync, and
 		 * instead rely solely on the ZIL to persistently store the
 		 * data; i.e.  when the pool is frozen, the last synced txg
 		 * value can't be trusted.
 		 */
 		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
 			if (lwb != NULL) {
 				lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
 				if (lwb == NULL) {
 					list_insert_tail(&nolwb_itxs, itx);
 				} else if ((zcw->zcw_lwb != NULL &&
 				    zcw->zcw_lwb != lwb) || zcw->zcw_done) {
 					/*
 					 * Our lwb is done, leave the rest of
 					 * itx list to somebody else who care.
 					 */
 					zilog->zl_parallel = ZIL_BURSTS;
 					zilog->zl_cur_left -=
 					    zil_itx_full_size(itx);
 					break;
 				}
 			} else {
 				if (lrc->lrc_txtype == TX_COMMIT) {
 					zil_commit_waiter_link_nolwb(
 					    itx->itx_private, &nolwb_waiters);
 				}
 				list_insert_tail(&nolwb_itxs, itx);
 			}
 			zilog->zl_cur_left -= zil_itx_full_size(itx);
 		} else {
 			ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
 			zilog->zl_cur_left -= zil_itx_full_size(itx);
 			zil_itx_destroy(itx);
 		}
 	}
 
 	if (lwb == NULL) {
 		/*
 		 * This indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this happens, we must stall
 		 * the ZIL write pipeline; see the comment within
 		 * zil_commit_writer_stall() for more details.
 		 */
 		while ((lwb = list_remove_head(ilwbs)) != NULL)
 			zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 
 		/*
 		 * Additionally, we have to signal and mark the "nolwb"
 		 * waiters as "done" here, since without an lwb, we
 		 * can't do this via zil_lwb_flush_vdevs_done() like
 		 * normal.
 		 */
 		zil_commit_waiter_t *zcw;
 		while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
 			zil_commit_waiter_skip(zcw);
 
 		/*
 		 * And finally, we have to destroy the itx's that
 		 * couldn't be committed to an lwb; this will also call
 		 * the itx's callback if one exists for the itx.
 		 */
 		while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
 			zil_itx_destroy(itx);
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * At this point, the ZIL block pointed at by the "lwb"
 		 * variable is in "new" or "opened" state.
 		 *
 		 * If it's "new", then no itxs have been committed to it, so
 		 * there's no point in issuing its zio (i.e. it's "empty").
 		 *
 		 * If it's "opened", then it contains one or more itxs that
 		 * eventually need to be committed to stable storage. In
 		 * this case we intentionally do not issue the lwb's zio
 		 * to disk yet, and instead rely on one of the following
 		 * two mechanisms for issuing the zio:
 		 *
 		 * 1. Ideally, there will be more ZIL activity occurring on
 		 * the system, such that this function will be immediately
 		 * called again by a different thread and this lwb will be
 		 * closed by zil_lwb_assign().  This way, the lwb will be
 		 * "full" when it is issued to disk, and we'll make use of
 		 * the lwb's size the best we can.
 		 *
 		 * 2. If there isn't sufficient ZIL activity occurring on
 		 * the system, zil_commit_waiter() will close it and issue
 		 * the zio.  If this occurs, the lwb is not guaranteed
 		 * to be "full" by the time its zio is issued, and means
 		 * the size of the lwb was "too large" given the amount
 		 * of ZIL activity occurring on the system at that time.
 		 *
 		 * We do this for a couple of reasons:
 		 *
 		 * 1. To try and reduce the number of IOPs needed to
 		 * write the same number of itxs. If an lwb has space
 		 * available in its buffer for more itxs, and more itxs
 		 * will be committed relatively soon (relative to the
 		 * latency of performing a write), then it's beneficial
 		 * to wait for these "next" itxs. This way, more itxs
 		 * can be committed to stable storage with fewer writes.
 		 *
 		 * 2. To try and use the largest lwb block size that the
 		 * incoming rate of itxs can support. Again, this is to
 		 * try and pack as many itxs into as few lwbs as
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
 			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 			if (lwb == NULL) {
 				while ((lwb = list_remove_head(ilwbs)) != NULL)
 					zil_lwb_write_issue(zilog, lwb);
 				zil_commit_writer_stall(zilog);
 			}
 		}
 	}
 }
 
 /*
  * This function is responsible for ensuring the passed in commit waiter
  * (and associated commit itx) is committed to an lwb. If the waiter is
  * not already committed to an lwb, all itxs in the zilog's queue of
  * itxs will be processed. The assumption is the passed in waiter's
  * commit itx will be found in the queue just like the other non-commit
  * itxs, such that when the entire queue is processed, the waiter will
  * have been committed to an lwb.
  *
  * The lwb associated with the passed in waiter is not guaranteed to
  * have been issued by the time this function completes. If the lwb is
  * not issued, we rely on future calls to zil_commit_writer() to issue
  * the lwb, or the timeout mechanism found in zil_commit_waiter().
  */
 static uint64_t
 zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	uint64_t wtxg = 0;
 	list_t issue_list;
 	lwb_t *pending;
 
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	list_create(&issue_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_issue_node));
 	mutex_enter(&zilog->zl_issuer_lock);
 
 	/*
 	 * Only process the zilog's queue of itxs if this waiter is still
 	 * unprocessed.  While we were blocked on "zl_issuer_lock", another
 	 * thread may have committed the waiter to an lwb (or marked it
 	 * done); in that case there is nothing for us to do and we drop
 	 * the lock right away.
 	 *
 	 * The "zl_issuer_lock" can become highly contended on certain
 	 * workloads and system configurations.  Skipping the queue
 	 * processing here has been measured to reduce the CPU spent
 	 * contending on this lock by up to 5%, on a system with 32 CPUs,
 	 * low latency storage (~50 usec writes), and 1024 threads
 	 * performing sync writes.
 	 */
 	if (zcw->zcw_lwb == NULL && !zcw->zcw_done) {
 		ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
 
 		wtxg = zil_get_commit_list(zilog);
 		zil_prune_commit_list(zilog);
 		zil_process_commit_list(zilog, zcw, &issue_list);
 	}
 
 	mutex_exit(&zilog->zl_issuer_lock);
 
 	/*
 	 * Any lwbs left on the local list are issued only now, after the
 	 * "zl_issuer_lock" has been released.
 	 */
 	for (pending = list_remove_head(&issue_list); pending != NULL;
 	    pending = list_remove_head(&issue_list))
 		zil_lwb_write_issue(zilog, pending);
 	list_destroy(&issue_list);
 
 	return (wtxg);
 }
 
 /*
  * Called by zil_commit_waiter() after its timed wait on the waiter has
  * expired.  If the lwb the waiter is linked to is still OPENED, close it
  * and issue its zio; otherwise return without doing anything.
  *
  * Locking: the caller must hold "zcw_lock" and must not hold
  * "zl_issuer_lock" (both are asserted).  "zcw_lock" may be dropped and
  * re-acquired inside this function, so the caller has to re-check
  * "zcw_done" afterwards; it is held again on every return path.
  * "zl_issuer_lock" is never held on return.
  */
 static void
 zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 
 	lwb_t *lwb = zcw->zcw_lwb;
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 
 	/*
 	 * If the lwb has already been issued by another thread, we can
 	 * immediately return since there's no work to be done (the
 	 * point of this function is to issue the lwb). Additionally, we
 	 * do this prior to acquiring the zl_issuer_lock, to avoid
 	 * acquiring it when it's not necessary to do so.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED)
 		return;
 
 	/*
 	 * In order to call zil_lwb_write_close() we must hold the
 	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
 	 * since we're already holding the commit waiter's "zcw_lock",
 	 * and those two locks are acquired in the opposite order
 	 * elsewhere.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 	mutex_enter(&zilog->zl_issuer_lock);
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * Since we just dropped and re-acquired the commit waiter's
 	 * lock, we have to re-check to see if the waiter was marked
 	 * "done" during that process. If the waiter was marked "done",
 	 * the "lwb" pointer is no longer valid (it can be free'd after
 	 * the waiter is marked "done"), so without this check we could
 	 * wind up with a use-after-free error below.
 	 */
 	if (zcw->zcw_done) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	ASSERT3P(lwb, ==, zcw->zcw_lwb);
 
 	/*
 	 * We've already checked this above, but since we hadn't acquired
 	 * the zilog's zl_issuer_lock, we have to perform this check a
 	 * second time while holding the lock.
 	 *
 	 * We don't need to hold the zl_lock since the lwb cannot transition
 	 * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
 	 * _can_ transition from CLOSED to DONE, but it's OK to race with
 	 * that transition since we treat the lwb the same, whether it's in
 	 * the CLOSED, ISSUED or DONE states.
 	 *
 	 * The important thing, is we treat the lwb differently depending on
 	 * if it's OPENED or CLOSED, and block any other threads that might
 	 * attempt to close/issue this lwb. For that reason we hold the
 	 * zl_issuer_lock when checking the lwb_state; we must not call
 	 * zil_lwb_write_close() if the lwb had already been closed/issued.
 	 *
 	 * See the comment above the lwb_state_t structure definition for
 	 * more details on the lwb states, and locking requirements.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	/*
 	 * We do not need zcw_lock once we hold zl_issuer_lock and know lwb
 	 * is still open.  But we have to drop it to avoid a deadlock in case
 	 * callback of zio issued by zil_lwb_write_issue() try to get it,
 	 * while zil_lwb_write_issue() is blocked on attempt to issue next
 	 * lwb it found in LWB_STATE_READY state.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 
 	/*
 	 * As described in the comments above zil_commit_waiter() and
 	 * zil_process_commit_list(), we need to issue this lwb's zio
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
 	zil_burst_done(zilog);
 	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	if (nlwb == NULL) {
 		/*
 		 * When zil_lwb_write_close() returns NULL, this
 		 * indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
 		 * zil_commit_writer_stall() function for more details.
 		 *
 		 * Note that in this case the lwb is issued, and the
 		 * pipeline stalled, while still holding zl_issuer_lock.
 		 */
 		zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 		mutex_exit(&zilog->zl_issuer_lock);
 	} else {
 		/*
 		 * A next lwb exists, so zl_issuer_lock is released before
 		 * this lwb is issued.
 		 */
 		mutex_exit(&zilog->zl_issuer_lock);
 		zil_lwb_write_issue(zilog, lwb);
 	}
 
 	/* Re-take zcw_lock, which the caller expects to still hold. */
 	mutex_enter(&zcw->zcw_lock);
 }
 
 /*
  * This function is responsible for performing the following two tasks:
  *
  * 1. its primary responsibility is to block until the given "commit
  *    waiter" is considered "done".
  *
  * 2. its secondary responsibility is to issue the zio for the lwb that
  *    the given "commit waiter" is waiting on, if this function has
  *    waited "long enough" and the lwb is still in the "open" state.
  *
  * Given a sufficient amount of itxs being generated and written using
  * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
  * function. If this does not occur, this secondary responsibility will
  * ensure the lwb is issued even if there is not other synchronous
  * activity on the system.
  *
  * For more details, see zil_process_commit_list(); more specifically,
  * the comment at the bottom of that function.
  */
 static void
 zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * The timeout is scaled based on the lwb latency to avoid
 	 * significantly impacting the latency of each individual itx.
 	 * For more details, see the comment at the bottom of the
 	 * zil_process_commit_list() function.
 	 *
 	 * The percentage is clamped to at least 1, and the absolute
 	 * wakeup time is computed once, before the loop, so repeated
 	 * wakeups do not extend the deadline.
 	 */
 	int pct = MAX(zfs_commit_timeout_pct, 1);
 	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
 	hrtime_t wakeup = gethrtime() + sleep;
 	boolean_t timedout = B_FALSE;
 
 	while (!zcw->zcw_done) {
 		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 
 		lwb_t *lwb = zcw->zcw_lwb;
 
 		/*
 		 * Usually, the waiter will have a non-NULL lwb field here,
 		 * but it's possible for it to be NULL as a result of
 		 * zil_commit() racing with spa_sync().
 		 *
 		 * When zil_clean() is called, it's possible for the itxg
 		 * list (which may be cleaned via a taskq) to contain
 		 * commit itxs. When this occurs, the commit waiters linked
 		 * off of these commit itxs will not be committed to an
 		 * lwb.  Additionally, these commit waiters will not be
 		 * marked done until zil_commit_waiter_skip() is called via
 		 * zil_itxg_clean().
 		 *
 		 * Thus, it's possible for this commit waiter (i.e. the
 		 * "zcw" variable) to be found in this "in between" state;
 		 * where its "zcw_lwb" field is NULL, and it hasn't yet
 		 * been skipped, so its "zcw_done" field is still B_FALSE.
 		 */
 		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
 
 		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
 			/*
 			 * Once the timeout path has run, the lwb is
 			 * expected to have left the OPENED state (see the
 			 * assertions after zil_commit_waiter_timeout()
 			 * below), so this branch should not be entered
 			 * again afterwards.
 			 */
 			ASSERT3B(timedout, ==, B_FALSE);
 
 			/*
 			 * If the lwb hasn't been issued yet, then we
 			 * need to wait with a timeout, in case this
 			 * function needs to issue the lwb after the
 			 * timeout is reached; responsibility (2) from
 			 * the comment above this function.
 			 */
 			int rc = cv_timedwait_hires(&zcw->zcw_cv,
 			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
 			    CALLOUT_FLAG_ABSOLUTE);
 
 			/*
 			 * Only a return of -1 with the waiter still not
 			 * done is handled as a timeout; anything else
 			 * goes back to the top to re-evaluate the state.
 			 */
 			if (rc != -1 || zcw->zcw_done)
 				continue;
 
 			timedout = B_TRUE;
 			zil_commit_waiter_timeout(zilog, zcw);
 
 			if (!zcw->zcw_done) {
 				/*
 				 * If the commit waiter has already been
 				 * marked "done", it's possible for the
 				 * waiter's lwb structure to have already
 				 * been freed.  Thus, we can only reliably
 				 * make these assertions if the waiter
 				 * isn't done.
 				 */
 				ASSERT3P(lwb, ==, zcw->zcw_lwb);
 				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 			}
 		} else {
 			/*
 			 * If the lwb isn't open, then it must have already
 			 * been issued. In that case, there's no need to
 			 * use a timeout when waiting for the lwb to
 			 * complete.
 			 *
 			 * Additionally, if the lwb is NULL, the waiter
 			 * will soon be signaled and marked done via
 			 * zil_clean() and zil_itxg_clean(), so no timeout
 			 * is required.
 			 */
 
 			IMPLY(lwb != NULL,
 			    lwb->lwb_state == LWB_STATE_CLOSED ||
 			    lwb->lwb_state == LWB_STATE_READY ||
 			    lwb->lwb_state == LWB_STATE_ISSUED ||
 			    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 		}
 	}
 
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * Allocate a commit waiter from "zil_zcw_cache" and put it into its
  * initial state: not linked to any lwb, not done, no zio error recorded,
  * with its list node, mutex and condition variable initialized.
  */
 static zil_commit_waiter_t *
 zil_alloc_commit_waiter(void)
 {
 	zil_commit_waiter_t *waiter;
 
 	waiter = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
 
 	/* Plain state fields. */
 	waiter->zcw_zio_error = 0;
 	waiter->zcw_done = B_FALSE;
 	waiter->zcw_lwb = NULL;
 
 	/* Linkage and synchronization primitives. */
 	list_link_init(&waiter->zcw_node);
 	mutex_init(&waiter->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&waiter->zcw_cv, NULL, CV_DEFAULT, NULL);
 
 	return (waiter);
 }
 
 /*
  * Tear down a commit waiter and return it to "zil_zcw_cache".  The waiter
  * is expected to be done, unlinked from any list, and no longer pointing
  * at an lwb (all three are asserted).
  */
 static void
 zil_free_commit_waiter(zil_commit_waiter_t *zcw)
 {
 	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	ASSERT(!list_link_active(&zcw->zcw_node));
 
 	mutex_destroy(&zcw->zcw_lock);
 	cv_destroy(&zcw->zcw_cv);
 
 	kmem_cache_free(zil_zcw_cache, zcw);
 }
 
 /*
  * Create a TX_COMMIT itx that carries the given commit waiter in its
  * "itx_private" field, and assign it to the zilog.  This links the itx
  * into the ZIL's list of synchronous itxs, so that it is later committed
  * to an lwb (or skipped) when zil_process_commit_list() is called.
  */
 static void
 zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	itx_t *commit_itx;
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create(zilog->zl_os);
 
 	/*
 	 * This transaction creates no new dirty data (and can even help
 	 * clear existing dirty data), so it should not be subject to the
 	 * dirty data based delays.  TXG_NOTHROTTLE bypasses the delay
 	 * mechanism.
 	 */
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 
 	commit_itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
 	commit_itx->itx_private = zcw;
 	commit_itx->itx_sync = B_TRUE;
 
 	zil_itx_assign(zilog, commit_itx, tx);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Commit ZFS Intent Log transactions (itxs) to stable storage.
  *
  * When writing ZIL transactions to the on-disk representation of the
  * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
  * itxs can be committed to a single lwb. Once a lwb is written and
  * committed to stable storage (i.e. the lwb is written, and vdevs have
  * been flushed), each itx that was committed to that lwb is also
  * considered to be committed to stable storage.
  *
  * When an itx is committed to an lwb, the log record (lr_t) contained
  * by the itx is copied into the lwb's zio buffer, and once this buffer
  * is written to disk, it becomes an on-disk ZIL block.
  *
  * As itxs are generated, they're inserted into the ZIL's queue of
  * uncommitted itxs. The semantics of zil_commit() are such that it will
  * block until all itxs that were in the queue when it was called, are
  * committed to stable storage.
  *
  * If "foid" is zero, this means all "synchronous" and "asynchronous"
  * itxs, for all objects in the dataset, will be committed to stable
  * storage prior to zil_commit() returning. If "foid" is non-zero, all
  * "synchronous" itxs for all objects, but only "asynchronous" itxs
  * that correspond to the foid passed in, will be committed to stable
  * storage prior to zil_commit() returning.
  *
  * Generally speaking, when zil_commit() is called, the consumer doesn't
  * actually care about _all_ of the uncommitted itxs. Instead, they're
  * simply trying to wait for a specific itx to be committed to disk,
  * but the interface(s) for interacting with the ZIL don't allow such
  * fine-grained communication. A better interface would allow a consumer
  * to create and assign an itx, and then pass a reference to this itx to
  * zil_commit(); such that zil_commit() would return as soon as that
  * specific itx was committed to disk (instead of waiting for _all_
  * itxs to be committed).
  *
  * When a thread calls zil_commit() a special "commit itx" will be
  * generated, along with a corresponding "waiter" for this commit itx.
  * zil_commit() will wait on this waiter's CV, such that when the waiter
  * is marked done, and signaled, zil_commit() will return.
  *
  * This commit itx is inserted into the queue of uncommitted itxs. This
  * provides an easy mechanism for determining which itxs were in the
  * queue prior to zil_commit() having been called, and which itxs were
  * added after zil_commit() was called.
  *
  * The commit itx is special; it doesn't have any on-disk representation.
  * When a commit itx is "committed" to an lwb, the waiter associated
  * with it is linked onto the lwb's list of waiters. Then, when that lwb
  * completes, each waiter on the lwb's list is marked done and signaled
  * -- allowing the thread waiting on the waiter to return from zil_commit().
  *
  * It's important to point out a few critical factors that allow us
  * to make use of the commit itxs, commit waiters, per-lwb lists of
  * commit waiters, and zio completion callbacks like we're doing:
  *
  *   1. The list of waiters for each lwb is traversed, and each commit
  *      waiter is marked "done" and signaled, in the zio completion
  *      callback of the lwb's zio[*].
  *
  *      * Actually, the waiters are signaled in the zio completion
- *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
- *        that are sent to the vdevs upon completion of the lwb zio.
+ *        callback of the root zio for the flush commands that are sent to
+ *        the vdevs upon completion of the lwb zio.
  *
  *   2. When the itxs are inserted into the ZIL's queue of uncommitted
  *      itxs, the order in which they are inserted is preserved[*]; as
  *      itxs are added to the queue, they are added to the tail of
  *      in-memory linked lists.
  *
  *      When committing the itxs to lwbs (to be written to disk), they
  *      are committed in the same order in which the itxs were added to
  *      the uncommitted queue's linked list(s); i.e. the linked list of
  *      itxs to commit is traversed from head to tail, and each itx is
  *      committed to an lwb in that order.
  *
  *      * To clarify:
  *
  *        - the order of "sync" itxs is preserved w.r.t. other
  *          "sync" itxs, regardless of the corresponding objects.
  *        - the order of "async" itxs is preserved w.r.t. other
  *          "async" itxs corresponding to the same object.
  *        - the order of "async" itxs is *not* preserved w.r.t. other
  *          "async" itxs corresponding to different objects.
  *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
  *          versa) is *not* preserved, even for itxs that correspond
  *          to the same object.
  *
  *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
  *      zil_get_commit_list(), and zil_process_commit_list().
  *
  *   3. The lwbs represent a linked list of blocks on disk. Thus, any
  *      lwb cannot be considered committed to stable storage, until its
  *      "previous" lwb is also committed to stable storage. This fact,
  *      coupled with the fact described above, means that itxs are
  *      committed in (roughly) the order in which they were generated.
  *      This is essential because itxs are dependent on prior itxs.
  *      Thus, we *must not* deem an itx as being committed to stable
  *      storage, until *all* prior itxs have also been committed to
  *      stable storage.
  *
  *      To enforce this ordering of lwb zio's, while still leveraging as
  *      much of the underlying storage performance as possible, we rely
  *      on two fundamental concepts:
  *
  *          1. The creation and issuance of lwb zio's is protected by
  *             the zilog's "zl_issuer_lock", which ensures only a single
  *             thread is creating and/or issuing lwb's at a time
  *          2. The "previous" lwb is a child of the "current" lwb
  *             (leveraging the zio parent-child dependency graph)
  *
  *      By relying on this parent-child zio relationship, we can have
  *      many lwb zio's concurrently issued to the underlying storage,
  *      but the order in which they complete will be the same order in
  *      which they were created.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
 	/*
 	 * zil_commit() must never be called on a snapshot:
 	 *
 	 * 1. A snapshot may never be modified, thus it cannot have any
 	 *    in-flight itxs that would have modified the dataset.
 	 *
 	 * 2. By design, calling zil_commit() assigns a commit itx to this
 	 *    zilog, which dirties the zilog. The zilog of a snapshot must
 	 *    not be dirtied; there are checks in the code that enforce
 	 *    this invariant, and they will panic if it's not upheld.
 	 */
 	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
 
 	/* Nothing to do when synchronous semantics are disabled. */
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
 	if (!spa_writeable(zilog->zl_spa)) {
 		/*
 		 * A non-writable SPA should never have pending itxs
 		 * waiting to be committed to disk. Otherwise we would
 		 * skip writing them out and break the semantics of
 		 * zil_commit(), so verify that before returning.
 		 */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 		for (int t = 0; t < TXG_SIZE; t++) {
 			ASSERT3P(zilog->zl_itxg[t].itxg_itxs, ==, NULL);
 		}
 		return;
 	}
 
 	if (zilog->zl_suspend > 0) {
 		/*
 		 * The ZIL is suspended: we don't want to dirty it by
 		 * calling zil_commit_itx_assign(), nor can we write out
 		 * lwbs like would be done in zil_commit_writer(). Rely
 		 * on txg_wait_synced() to maintain the necessary
 		 * semantics instead.
 		 */
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	} else {
 		zil_commit_impl(zilog, foid);
 	}
 }
 
 /*
  * Do the actual work of zil_commit(): queue a commit itx with a fresh
  * waiter, process the commit list, and block until the waiter is done.
  */
 void
 zil_commit_impl(zilog_t *zilog, uint64_t foid)
 {
 	zil_commit_waiter_t *waiter;
 	uint64_t wait_txg;
 
 	ZIL_STAT_BUMP(zilog, zil_commit_count);
 
 	/*
 	 * Move the "async" itxs for the specified foid onto the "sync"
 	 * queues, so that they are later committed to an lwb (or skipped)
 	 * by zil_process_commit_list().
 	 *
 	 * These itxs must be committed before this call to zil_commit
 	 * returns, so this has to happen before zil_commit_itx_assign().
 	 */
 	zil_async_to_sync(zilog, foid);
 
 	/*
 	 * Allocate a new "waiter", initially linked to the commit itx
 	 * through the itx's "itx_private" field. The commit itx doesn't
 	 * represent any on-disk state, so when it's committed to an lwb,
 	 * rather than copying its lr_t into the lwb's buffer, its waiter
 	 * is added to the lwb's list of waiters. When the lwb is
 	 * committed to stable storage, each waiter on that list is marked
 	 * "done" and signalled.
 	 *
 	 * The waiter must be created and the commit itx assigned before
 	 * calling zil_commit_writer(), or else our specific commit itx is
 	 * not guaranteed to be committed to an lwb prior to calling
 	 * zil_commit_waiter().
 	 */
 	waiter = zil_alloc_commit_waiter();
 	zil_commit_itx_assign(zilog, waiter);
 
 	wait_txg = zil_commit_writer(zilog, waiter);
 	zil_commit_waiter(zilog, waiter);
 
 	if (waiter->zcw_zio_error != 0) {
 		/*
 		 * Writing out the ZIL blocks this thread is waiting on
 		 * failed, so fall back to relying on spa_sync() to write
 		 * out the data. This has performance implications, but
 		 * is expected to be an exceptional, infrequent case.
 		 */
 		DTRACE_PROBE2(zil__commit__io__error,
 		    zilog_t *, zilog, zil_commit_waiter_t *, waiter);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	} else if (wait_txg != 0) {
 		/* zil_commit_writer() returned a txg to wait on. */
 		txg_wait_synced(zilog->zl_dmu_pool, wait_txg);
 	}
 
 	zil_free_commit_waiter(waiter);
 }
 
 /*
  * Called in syncing context to free committed log blocks and update log header.
  *
  * Only does work on the first sync pass of a txg.  After waiting via
  * zil_lwb_flush_wait_all(), and while holding zl_lock, it:
  *   - folds the per-txg replayed sequence number into the header,
  *   - resets the header if this txg is the zilog's destroy txg, and
  *   - frees lwbs from the head of zl_lwb_list that are FLUSH_DONE and
  *     whose alloc/max txgs are not beyond this txg.
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	/* Slot for this txg in the zilog's per-txg replayed sequence array. */
 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
 	 * We don't zero out zl_destroy_txg, so make sure we don't try
 	 * to destroy it twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	zil_lwb_flush_wait_all(zilog, txg);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	/*
 	 * Publish the replay progress recorded for this txg in the header,
 	 * then clear the slot.
 	 */
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	if (zilog->zl_destroy_txg == txg) {
 		/* Save the log block pointer before the header is zeroed. */
 		blkptr_t blk = zh->zh_log;
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 		memset(zh, 0, sizeof (zil_header_t));
 		memset(zilog->zl_replayed_seq, 0,
 		    sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * If this block was part of log chain that couldn't
 			 * be claimed because a device was missing during
 			 * zil_claim(), but that device later returns,
 			 * then this block could erroneously appear valid.
 			 * To guard against this, assign a new GUID to the new
 			 * log chain so it doesn't matter what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		} else {
 			/*
 			 * A destroyed ZIL chain can't contain any TX_SETSAXATTR
 			 * records. So, deactivate the feature for this dataset.
 			 * We activate it again when we start a new ZIL chain.
 			 */
 			if (dsl_dataset_feature_is_active(ds,
 			    SPA_FEATURE_ZILSAXATTR))
 				dsl_dataset_deactivate_feature(ds,
 				    SPA_FEATURE_ZILSAXATTR, tx);
 		}
 	}
 
 	/*
 	 * Walk the lwb list from its head.  The header is pointed at each
 	 * lwb's block before the lwb is examined, so when the loop stops
 	 * early the header refers to the first lwb that was kept.
 	 */
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		zh->zh_log = lwb->lwb_blk;
 		/*
 		 * Stop at the first lwb that has not reached FLUSH_DONE, or
 		 * whose allocation txg or max txg is later than this txg.
 		 */
 		if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
 		    lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		if (!BP_IS_HOLE(&lwb->lwb_blk))
 			zio_free(spa, txg, &lwb->lwb_blk);
 		zil_free_lwb(zilog, lwb);
 
 		/*
 		 * If we don't have anything left in the lwb list then
 		 * we've had an allocation failure and we need to zero
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
 		if (list_is_empty(&zilog->zl_lwb_list))
 			BP_ZERO(&zh->zh_log);
 	}
 
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Constructor callback registered for "zil_lwb_cache" in zil_init().
  * Sets up the lwb's embedded containers and lock: the itx list, the
  * waiter list, the vdev AVL tree and the vdev lock.  Always returns 0.
  */
 static int
 zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 {
 	lwb_t *new_lwb = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	mutex_init(&new_lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&new_lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
 	list_create(&new_lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	list_create(&new_lwb->lwb_itxs, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
 	return (0);
 }
 
 /*
  * Destructor callback registered for "zil_lwb_cache" in zil_init().
  * Destroys the lock, AVL tree and lists that zil_lwb_cons() created.
  */
 static void
 zil_lwb_dest(void *vbuf, void *unused)
 {
 	lwb_t *old_lwb = vbuf;
 
 	(void) unused;
 
 	mutex_destroy(&old_lwb->lwb_vdev_lock);
 	avl_destroy(&old_lwb->lwb_vdev_tree);
 	list_destroy(&old_lwb->lwb_waiters);
 	list_destroy(&old_lwb->lwb_itxs);
 }
 
 /*
  * Module-level ZIL setup: create the lwb and commit waiter kmem caches,
  * initialize the global ZIL sums, and create and install the global
  * "zil" kstat.  A failed kstat creation is tolerated; the kstat is then
  * simply not installed.
  */
 void
 zil_init(void)
 {
 	/* lwbs get a constructor/destructor pair; waiters do not. */
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
 	    sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
 	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
 	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	zil_sums_init(&zil_sums_global);
 
 	zil_kstats_global = kstat_create("zfs", 0, "zil", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (zil_kstats_global == NULL)
 		return;
 
 	zil_kstats_global->ks_private = NULL;
 	zil_kstats_global->ks_update = zil_kstats_global_update;
 	zil_kstats_global->ks_data = &zil_stats;
 	kstat_install(zil_kstats_global);
 }
 
 /*
  * Module-level ZIL teardown; undoes what zil_init() set up.
  */
 void
 zil_fini(void)
 {
 	kmem_cache_destroy(zil_zcw_cache);
 	kmem_cache_destroy(zil_lwb_cache);
 
 	/* The kstat only exists if its creation in zil_init() succeeded. */
 	if (zil_kstats_global != NULL) {
 		kstat_delete(zil_kstats_global);
 		zil_kstats_global = NULL;
 	}
 
 	zil_sums_fini(&zil_sums_global);
 }
 
 /*
  * Record a new "sync" setting in the zilog.  zil_commit() reads this
  * field and returns immediately when it is ZFS_SYNC_DISABLED.
  */
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 /*
  * Record a new "logbias" setting in the zilog.
  */
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 /*
  * Allocate and initialize the in-memory zilog for an objset.
  *
  * os       - objset the log belongs to
  * zh_phys  - ZIL header to associate with the zilog
  *
  * Returns the new, zero-filled-then-initialized zilog; it is released
  * with zil_free().
  */
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
 
 	/* What this log is attached to. */
 	zilog->zl_os = os;
 	zilog->zl_header = zh_phys;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 
 	/* Settings read from the objset at allocation time. */
 	zilog->zl_sync = dmu_objset_syncprop(os);
 	zilog->zl_logbias = dmu_objset_logbias(os);
 
 	/* Initial bookkeeping state. */
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_dirty_max_txg = 0;
 	zilog->zl_last_lwb_latency = 0;
 	zilog->zl_last_lwb_opened = NULL;
 
 	/*
 	 * zl_max_block_size is zil_maxblocksize passed through
 	 * P2ALIGN_TYPED against ZIL_MIN_BLKSZ, then clamped to be no
 	 * smaller than ZIL_MIN_BLKSZ and no larger than the pool's
 	 * spa_maxblocksize().
 	 */
 	zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize,
 	    ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ),
 	    spa_maxblocksize(dmu_objset_spa(os)));
 
 	/* Locks and condition variables. */
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	for (int t = 0; t < TXG_SIZE; t++) {
 		mutex_init(&zilog->zl_itxg[t].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 	}
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
 
 	/* The lwb list and the commit list of itxs. */
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
 	/*
 	 * Seed every zl_prev_opt slot with the maximum block size less
 	 * the size of a zil_chain_t.
 	 */
 	for (int b = 0; b < ZIL_BURSTS; b++) {
 		zilog->zl_prev_opt[b] = zilog->zl_max_block_size -
 		    sizeof (zil_chain_t);
 	}
 
 	return (zilog);
 }
 
 /*
  * Release a zilog created by zil_alloc().  The zilog must not be
  * suspended or suspending, and its lwb list and itx commit list must be
  * empty (all asserted).
  */
 void
 zil_free(zilog_t *zilog)
 {
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		/*
 		 * An itx can be generated without dirtying a txg (e.g.
 		 * ztest TX_TRUNCATE), in which case no zil_clean()
 		 * callback ever removes the entry, so it is removed here.
 		 *
 		 * This also frees up the ziltest itxs.
 		 */
 		if (zilog->zl_itxg[t].itxg_itxs != NULL)
 			zil_itxg_clean(zilog->zl_itxg[t].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[t].itxg_lock);
 	}
 
 	mutex_destroy(&zilog->zl_issuer_lock);
 	mutex_destroy(&zilog->zl_lock);
 	mutex_destroy(&zilog->zl_lwb_io_lock);
 
 	cv_destroy(&zilog->zl_cv_suspend);
 	cv_destroy(&zilog->zl_lwb_io_cv);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log.
  *
  * Looks up the objset's existing zilog and attaches the caller's get_data
  * callback and kstat sums to it.  The zilog must not already be open
  * (zl_get_data unset) and must have no lwbs outstanding.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
 {
 	zilog_t *zilog;
 
 	zilog = dmu_objset_zil(os);
 
 	ASSERT3P(zilog->zl_get_data, ==, NULL);
 	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_sums = zil_sums;
 	zilog->zl_get_data = get_data;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  *
  * Commits outstanding records (non-snapshot objsets only), waits for the
  * highest txg the log may still depend on to sync, detaches the get_data
  * callback, and frees the single lwb that may remain on zl_lwb_list.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *tail, *lwb;
 	uint64_t txg;
 
 	if (dmu_objset_is_snapshot(zilog->zl_os)) {
 		/* Nothing to commit: expect a clean, empty log. */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT0(zilog->zl_dirty_max_txg);
 		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
 	} else {
 		zil_commit(zilog, 0);
 	}
 
 	/* Highest txg recorded on the zilog or on its last lwb. */
 	mutex_enter(&zilog->zl_lock);
 	txg = zilog->zl_dirty_max_txg;
 	tail = list_tail(&zilog->zl_lwb_list);
 	if (tail != NULL) {
 		txg = MAX(txg, tail->lwb_alloc_txg);
 		txg = MAX(txg, tail->lwb_max_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * zl_lwb_max_issued_txg may exceed lwb_max_txg, depending on when
 	 * the dmu_tx was assigned in zil_lwb_write_issue().
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 	/*
 	 * Wait for that txg to sync; zil_sync() guarantees that all lwbs
 	 * up to it have been written out, flushed, and cleaned.
 	 */
 	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 
 	if (zilog_is_dirty(zilog)) {
 		zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog,
 		    (u_longlong_t)txg);
 	}
 	if (txg < spa_freeze_txg(zilog->zl_spa))
 		VERIFY(!zilog_is_dirty(zilog));
 
 	zilog->zl_get_data = NULL;
 
 	/*
 	 * At most one lwb should be left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_remove_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Hold tag used by zil_suspend() and zil_resume() for the objset hold,
  * the dataset long hold and the matching releases.
  */
 static const char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * If cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  *
  * Returns 0 on success.  On failure nothing is left held and the return
  * value is one of:
  *   - the error from dmu_objset_hold(), if the objset cannot be held;
  *   - EBUSY, if the log header has ZIL_REPLAY_NEEDED set;
  *   - EACCES, if the objset is encrypted and a key mapping cannot be
  *     created.
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	/*
 	 * From here on the dataset is long-held and the pool hold is
 	 * dropped; every exit below either hands the dataset hold to the
 	 * caller via *cookiep or releases it (directly or via zil_resume()).
 	 * NOTE(review): presumably dmu_objset_hold() left both a pool hold
 	 * and a dataset hold under suspend_tag -- confirm.
 	 */
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	/* Count this suspender; undone by zil_resume() or the error path. */
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		/* Woken by the cv_broadcast() at the end of this function. */
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/*
 	 * The ZIL has work to do. Ensure that the associated encryption
 	 * key will remain mapped while we are committing the log by
 	 * grabbing a reference to it. If the key isn't loaded we have no
 	 * choice but to return an error until the wrapping key is loaded.
 	 */
 	if (os->os_encrypted &&
 	    dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
 		/* Undo the suspend count and both dataset holds. */
 		zilog->zl_suspend--;
 		mutex_exit(&zilog->zl_lock);
 		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 		return (SET_ERROR(EACCES));
 	}
 
 	/* Mark the suspend in progress so that later callers wait above. */
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * We need to use zil_commit_impl to ensure we wait for all
 	 * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
 	 * to disk before proceeding. If we used zil_commit instead, it
 	 * would just call txg_wait_synced(), because zl_suspend is set.
 	 * txg_wait_synced() doesn't wait for these lwb's to be
 	 * LWB_STATE_FLUSH_DONE before returning.
 	 */
 	zil_commit_impl(zilog, 0);
 
 	/*
 	 * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
 	 * use txg_wait_synced() to ensure the data from the zilog has
 	 * migrated to the main pool before calling zil_destroy().
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	/* Suspend complete: release anyone waiting in the loop above. */
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	/* Drop the key mapping taken before the commit. */
 	if (os->os_encrypted)
 		dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
 
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 /*
  * Resume an intent log suspended by zil_suspend().
  *
  * 'cookie' is the objset pointer that zil_suspend() handed back.  Drops
  * one suspend count, then releases the dataset long hold and the dataset
  * hold taken under suspend_tag.
  */
 void
 zil_resume(void *cookie)
 {
 	objset_t *objset = cookie;
 	zilog_t *zilog;
 
 	zilog = dmu_objset_zil(objset);
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 
 	dsl_dataset_long_rele(dmu_objset_ds(objset), suspend_tag);
 	dsl_dataset_rele(dmu_objset_ds(objset), suspend_tag);
 }
 
 /*
  * Per-replay state handed to zil_replay_log_record() through zil_parse().
  */
 typedef struct zil_replay_arg {
 	/* replay vector, indexed by transaction type */
 	zil_replay_func_t *const *zr_replay;
 	/* opaque first argument passed to every replay function */
 	void		*zr_arg;
 	/* B_TRUE if the log was written with the opposite byte order */
 	boolean_t	zr_byteswap;
 	/* scratch buffer that receives a copy of the record being replayed */
 	char		*zr_lr;
 } zil_replay_arg_t;
 
 /*
  * Report a failed replay of log record 'lr' and return 'error' unchanged.
  *
  * Steps zl_replaying_seq back by one, since the record was not actually
  * replayed, and logs a warning naming the dataset, sequence number and
  * transaction type.
  */
 static int
 zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
 {
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	const char *ci_tag = (lr->lrc_txtype & TX_CI) ? "CI" : "";
 
 	/* didn't actually replay this one */
 	zilog->zl_replaying_seq--;
 
 	dmu_objset_name(zilog->zl_os, name);
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
 	    (u_longlong_t)lr->lrc_seq,
 	    (u_longlong_t)(lr->lrc_txtype & ~TX_CI), ci_tag);
 
 	return (error);
 }
 
 /*
  * zil_parse() record callback used during replay.
  *
  * Records that were already replayed or already committed are skipped.
  * Otherwise the record is copied into the scratch buffer, any TX_WRITE
  * block data is read in after it, and the matching replay function is
  * invoked.  A failing replay is retried once after a txg sync.  Returns 0
  * on success or skip; otherwise the error, after it has been reported by
  * zil_replay_error().
  */
 static int
 zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
     uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	const uint64_t reclen = lr->lrc_reclen;
 	/* The case-insensitive bit is still present in the log record. */
 	const uint64_t txtype = lr->lrc_txtype & ~TX_CI;
 	int error;
 
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	/* Already replayed, or already committed: nothing to do. */
 	if (lr->lrc_seq <= zh->zh_replay_seq || lr->lrc_txg < claim_txg)
 		return (0);
 
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * Record types that can be logged out of order may refer to an
 	 * object (lr_foid) that no longer exists.  That is legitimate,
 	 * not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Work on a private copy so the record can be revised and extended.
 	 */
 	memcpy(zr->zr_lr, lr, reclen);
 
 	/*
 	 * A TX_WRITE that carries a blkptr needs its data read in; it is
 	 * placed directly after the record copy.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block holding this lr may have been byteswapped so that
 	 * common fields such as lrc_txtype are easy to examine.  Only the
 	 * replay vectors know how to byteswap their own record types, so
 	 * undo the swap on the copy before calling them.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * Replaying the record and updating the log header sequence number
 	 * must happen atomically.  Each replay function updates the
 	 * sequence number at its end when in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error == 0)
 		return (0);
 
 	/*
 	 * The DMU's dnode layer doesn't see removes until the txg commits,
 	 * so a subsequent claim can spuriously fail with EEXIST.  On any
 	 * error, sync out pending removes and retry once.  B_FALSE is
 	 * passed for byteswap this time so the record isn't swapped twice.
 	 */
 	txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 	if (error != 0)
 		return (zil_replay_error(zilog, lr, error));
 
 	return (0);
 }
 
 /*
  * zil_parse() block callback used during replay: counts each log block
  * visited in zl_replay_blks.  Always returns 0.
  */
 static int
 zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	(void) bp;
 	(void) arg;
 	(void) claim_txg;
 
 	zilog->zl_replay_blks += 1;
 	return (0);
 }
 
 /*
  * If this dataset has a non-empty intent log, replay it and destroy it.
  * Return B_TRUE if there were any entries to replay.
  *
  * When the header does not have ZIL_REPLAY_NEEDED set, the log is simply
  * destroyed and the result of zil_destroy() is returned.
  */
 boolean_t
 zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *const replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
 	zil_replay_arg_t zr;
 
 	if (!(zh->zh_flags & ZIL_REPLAY_NEEDED))
 		return (zil_destroy(zilog, B_TRUE));
 
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	/* Scratch space for one record plus the data it may pull in. */
 	zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Let in-progress removes sync before replay begins.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg, B_TRUE);
 	vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	/* The log has been consumed; destroy it and wait for that to sync. */
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 
 	return (B_TRUE);
 }
 
 /*
  * Tell the caller whether it should skip logging for this transaction.
  *
  * Returns B_TRUE when sync is disabled for the zilog, or when a replay is
  * in progress.  In the replay case the dataset is also dirtied in 'tx' and
  * the sequence number being replayed is recorded for that txg.  Returns
  * B_FALSE otherwise.
  */
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (!zilog->zl_replay)
 		return (B_FALSE);
 
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
 	    zilog->zl_replaying_seq;
 
 	return (B_TRUE);
 }
 
 /*
  * Reset the intent log of the named objset with a suspend/resume cycle.
  *
  * 'arg' is unused.  EACCES and EBUSY from zil_suspend() are passed
  * through; any other failure is reported as EEXIST.
  */
 int
 zil_reset(const char *osname, void *arg)
 {
 	(void) arg;
 
 	int error = zil_suspend(osname, NULL);
 
 	switch (error) {
 	case 0:
 		return (0);
 	case EACCES:	/* EACCES means crypto key not loaded */
 	case EBUSY:
 		return (SET_ERROR(error));
 	default:
 		return (SET_ERROR(EEXIST));
 	}
 }
 
 /* ZIL entry points exported for use outside this file. */
 EXPORT_SYMBOL(zil_alloc);
 EXPORT_SYMBOL(zil_free);
 EXPORT_SYMBOL(zil_open);
 EXPORT_SYMBOL(zil_close);
 EXPORT_SYMBOL(zil_replay);
 EXPORT_SYMBOL(zil_replaying);
 EXPORT_SYMBOL(zil_destroy);
 EXPORT_SYMBOL(zil_destroy_sync);
 EXPORT_SYMBOL(zil_itx_create);
 EXPORT_SYMBOL(zil_itx_destroy);
 EXPORT_SYMBOL(zil_itx_assign);
 EXPORT_SYMBOL(zil_commit);
 EXPORT_SYMBOL(zil_claim);
 EXPORT_SYMBOL(zil_check_log_chain);
 EXPORT_SYMBOL(zil_sync);
 EXPORT_SYMBOL(zil_clean);
 EXPORT_SYMBOL(zil_suspend);
 EXPORT_SYMBOL(zil_resume);
 EXPORT_SYMBOL(zil_lwb_add_block);
 EXPORT_SYMBOL(zil_bp_tree_add);
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
 EXPORT_SYMBOL(zil_sums_init);
 EXPORT_SYMBOL(zil_sums_fini);
 EXPORT_SYMBOL(zil_kstat_values_update);
 
 /*
  * Tunables, all declared read-write (ZMOD_RW).  Note that
  * commit_timeout_pct is declared with the "zfs"/"zfs_" scope and prefix,
  * while the rest use "zfs_zil"/"zil_".
  */
 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
 	"ZIL block open timeout percentage");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
 	"Disable intent logging replay");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
 	"Disable ZIL cache flushes");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
 	"Limit in bytes slog sync writes per commit");
 
 /* zil_maxblocksize is the input to zl_max_block_size in zil_alloc(). */
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
 	"Limit in bytes of ZIL log block size");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
 	"Limit in bytes WR_COPIED size");
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 4aa08f3b30f5..031fc3d5135d 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1,5260 +1,5257 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, 2023, 2024, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, Datto, Inc.
  */
 
 #include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/dsl_crypt.h>
 #include <cityhash.h>
 
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
 /*
  * Short per-type names, one entry per I/O type (ZIO_TYPES in total).
  * NOTE(review): presumably indexed by zio_type_t value, so the order must
  * match that enum -- confirm against its definition.
  */
 const char *const zio_type_name[ZIO_TYPES] = {
 	/*
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream OpenZFS.
 	 */
 	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
 };
 
 /*
  * NOTE(review): judging by the name, enables throttling of DVA
  * allocations; not referenced in this part of the file -- confirm against
  * its users.  Non-static, so visible to other files.
  */
 int zio_dva_throttle_enabled = B_TRUE;
 /*
  * NOTE(review): presumably makes the deadman log every outstanding zio
  * rather than a subset -- confirm against its users.
  */
 static int zio_deadman_log_all = B_FALSE;
 
 /*
  * ==========================================================================
  * I/O kmem caches
  * ==========================================================================
  */
 /* Caches for zio_t and zio_link_t structures, created in zio_init(). */
 static kmem_cache_t *zio_cache;
 static kmem_cache_t *zio_link_cache;
 /*
  * Buffer caches behind zio_buf_alloc() and zio_data_buf_alloc().  A request
  * for 'size' bytes uses slot (size - 1) >> SPA_MINBLOCKSHIFT.  Several
  * slots may point at the same cache, and the two arrays may share caches
  * with each other (see zio_init() and zio_fini()).
  */
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 /*
  * Per-slot counts of zio_buf_alloc() and zio_buf_free() calls; zio_fini()
  * prints every slot where the two differ.
  */
 static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif
 
 /* Mark IOs as "slow" if they take longer than 30 seconds */
 static uint_t zio_slow_io_ms = (30 * MILLISEC);
 
 /*
  * Evaluates to 2^(level * (indblkshift - SPA_BLKPTRSHIFT)) as a uint64_t.
  * NOTE(review): presumably the number of level-0 blocks spanned by one
  * block pointer at the given indirection level -- confirm against users.
  */
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 /* NOTE(review): not used in this part of the file; meaning unconfirmed. */
 #define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly affect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
  * Care should be taken when changing these values as they directly impact
  * spa_sync() performance. Tuning these values may introduce subtle performance
  * pathologies and should only be done in the context of performance analysis.
  * These tunables will eventually be removed and replaced with #defines once
  * enough analysis has been done to determine optimal values.
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
  *
  * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
  * compression (including of metadata).  In practice, we don't have this
  * many sync passes, so this has no effect.
  *
  * The original intent was that disabling compression would help the sync
  * passes to converge. However, in practice disabling compression increases
  * the average number of sync passes, because when we turn compression off,
  * many blocks change size and thus we have to re-allocate (not
  * overwrite) them. It also increases the number of 128KB allocations (e.g.
  * for indirect blocks and spacemaps) because these will not be compressed.
  * The 128K allocations are especially detrimental to performance on highly
  * fragmented systems, which may have very few free segments of this size,
  * and may need to load new metaslabs to satisfy 128K allocations.
  */
 
 /* defer frees starting in this pass (non-static: visible to other files) */
 uint_t zfs_sync_pass_deferred_free = 2;
 
 /* don't compress starting in this pass */
 static uint_t zfs_sync_pass_dont_compress = 8;
 
 /* rewrite new bps starting in this pass */
 static uint_t zfs_sync_pass_rewrite = 2;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
 /*
  * Enable smaller cores by excluding metadata
  * allocations as well.  When non-zero, zio_init() creates every
  * zio_buf cache with KMC_NODEBUG, regardless of buffer size.
  */
 int zio_exclude_metadata = 0;
 /*
  * NOTE(review): judging by the name, lets a requeued zio go to the front
  * of its taskq instead of the back; not referenced in this part of the
  * file -- confirm against its users.
  */
 static int zio_requeue_io_start_cut_in_line = 1;
 
 /*
  * Size threshold used by zio_init(): zio_buf caches for buffers larger
  * than this are created with KMC_NODEBUG.  With the non-debug value of 0,
  * that applies to every size.
  */
 #ifdef ZFS_DEBUG
 static const int zio_buf_debug_limit = 16384;
 #else
 static const int zio_buf_debug_limit = 0;
 #endif
 
 static inline void __zio_execute(zio_t *zio);
 
 static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 
 /*
  * Set up the zio subsystem's kmem caches.
  *
  * Creates the zio_t and zio_link_t caches, then the buffer caches behind
  * zio_buf_cache[] and zio_data_buf_cache[].  Caches are only created for
  * selected sizes; the remaining slots are filled in afterwards so that
  * every slot points at a cache.  Finishes by calling zio_inject_init() and
  * lz4_init().
  */
 void
 zio_init(void)
 {
 	size_t c;
 
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		/* Slot c serves buffers of up to 'size' bytes. */
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t align, cflags, data_cflags;
 		char name[32];
 
 		/*
 		 * Create cache for each half-power of 2 size, starting from
 		 * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
 		 * of ~7/8, sufficient for transient allocations mostly using
 		 * these caches.
 		 */
 		/* Reduce p2 to the largest power of two not above size. */
 		size_t p2 = size;
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 		if (!IS_P2ALIGNED(size, p2 / 2))
 			continue;
 
 #ifndef _KERNEL
 		/*
 		 * If we are using watchpoints, put each buffer on its own page,
 		 * to eliminate the performance overhead of trapping to the
 		 * kernel when modifying a non-watched buffer that shares the
 		 * page with a watched buffer.
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
 #endif
 
 		/*
 		 * Page-multiple sizes are page aligned.  Other sizes are
 		 * aligned to the largest power of two that divides them
 		 * (assuming highbit64() returns the 1-based index of the
 		 * highest set bit).
 		 */
 		if (IS_P2ALIGNED(size, PAGESIZE))
 			align = PAGESIZE;
 		else
 			align = 1 << (highbit64(size ^ (size - 1)) - 1);
 
 		/* Data caches are always NODEBUG; metadata only sometimes. */
 		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
 		    KMC_NODEBUG : 0;
 		data_cflags = KMC_NODEBUG;
 		if (cflags == data_cflags) {
 			/*
 			 * Resulting kmem caches would be identical.
 			 * Save memory by creating only one.
 			 */
 			(void) snprintf(name, sizeof (name),
 			    "zio_buf_comb_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size, align,
 			    NULL, NULL, NULL, NULL, NULL, cflags);
 			zio_data_buf_cache[c] = zio_buf_cache[c];
 			continue;
 		}
 		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
 		    (ulong_t)size);
 		zio_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, cflags);
 
 		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
 		    (ulong_t)size);
 		zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}
 
 	/*
 	 * Back-fill the slots skipped above.  c starts one past the last
 	 * slot; walking downwards, each empty slot inherits the cache of
 	 * the slot above it, so a request is always served from a cache
 	 * whose buffers are at least as large as the request.
 	 */
 	while (--c != 0) {
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
 
 		ASSERT(zio_data_buf_cache[c] != NULL);
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
 	zio_inject_init();
 
 	lz4_init();
 }
 
 /*
  * Undo zio_init(): destroy every buffer cache exactly once, then the
  * zio_link and zio caches, and finally call zio_inject_fini() and
  * lz4_fini().  In userland debug builds, first print any cache slot whose
  * alloc and free counts differ.
  */
 void
 zio_fini(void)
 {
 	size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	for (size_t s = 0; s < n; s++) {
 		if (zio_buf_cache_allocs[s] == zio_buf_cache_frees[s])
 			continue;
 		(void) printf("zio_fini: [%d] %llu != %llu\n",
 		    (int)((s + 1) << SPA_MINBLOCKSHIFT),
 		    (long long unsigned)zio_buf_cache_allocs[s],
 		    (long long unsigned)zio_buf_cache_frees[s]);
 	}
 #endif
 
 	/*
 	 * One kmem cache can be referenced from many slots of both
 	 * zio_buf_cache and zio_data_buf_cache.  Clear every later
 	 * reference before destroying it: wasteful, but trivially correct.
 	 */
 	for (size_t s = 0; s < n; s++) {
 		kmem_cache_t *victim = zio_buf_cache[s];
 
 		if (victim != NULL) {
 			for (size_t k = s; k < n; k++) {
 				if (zio_buf_cache[k] == victim)
 					zio_buf_cache[k] = NULL;
 				if (zio_data_buf_cache[k] == victim)
 					zio_data_buf_cache[k] = NULL;
 			}
 			kmem_cache_destroy(victim);
 		}
 	}
 
 	/* What is left are caches used only by zio_data_buf_cache. */
 	for (size_t s = 0; s < n; s++) {
 		kmem_cache_t *victim = zio_data_buf_cache[s];
 
 		if (victim != NULL) {
 			for (size_t k = s; k < n; k++) {
 				if (zio_data_buf_cache[k] == victim)
 					zio_data_buf_cache[k] = NULL;
 			}
 			kmem_cache_destroy(victim);
 		}
 	}
 
 	for (size_t i = 0; i < n; i++) {
 		VERIFY3P(zio_buf_cache[i], ==, NULL);
 		VERIFY3P(zio_data_buf_cache[i], ==, NULL);
 	}
 
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
 
 	lz4_fini();
 }
 
 /*
  * ==========================================================================
  * Allocate and free I/O buffers
  * ==========================================================================
  */
 
 #ifdef ZFS_DEBUG
 /*
  * Pattern written into the unused tail of a buffer by zio_buf_put_canary()
  * and verified by zio_buf_check_canary().  Defined in debug builds only.
  */
 static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
 #endif
 
 /*
  * Use empty space after the buffer to detect overflows.
  *
  * Since zio_init() creates kmem caches only for certain set of buffer sizes,
  * allocations of different sizes may have some unused space after the data.
  * Filling part of that space with a known pattern on allocation and checking
  * it on free should allow us to detect some buffer overflows.
  */
 /*
  * Fill the slack after the first 'size' bytes of buffer 'p' (rounded up
  * to a ulong_t boundary) with the canary pattern.  'cache' is the cache
  * array the buffer came from and 'c' its slot.  The slack runs to the end
  * of slot c's size, or of slot c + 1's size when both slots share a
  * cache.  Compiles to nothing unless ZFS_DEBUG is defined.
  */
 static void
 zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 #ifdef ZFS_DEBUG
 	size_t end = (c + 1) << SPA_MINBLOCKSHIFT;
 
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		end = (c + 2) << SPA_MINBLOCKSHIFT;
 
 	for (size_t off = P2ROUNDUP(size, sizeof (ulong_t)); off < end;
 	    off += sizeof (ulong_t))
 		p[off / sizeof (ulong_t)] = zio_buf_canary;
 #endif
 }
 
 /*
  * Verify the canary words that zio_buf_put_canary() wrote after the first
  * 'size' bytes of buffer 'p', and panic on the first word that no longer
  * holds the pattern.  Compiles to nothing unless ZFS_DEBUG is defined.
  */
 static void
 zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 #ifdef ZFS_DEBUG
 	size_t end = (c + 1) << SPA_MINBLOCKSHIFT;
 
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		end = (c + 2) << SPA_MINBLOCKSHIFT;
 
 	for (size_t off = P2ROUNDUP(size, sizeof (ulong_t)); off < end;
 	    off += sizeof (ulong_t)) {
 		const ulong_t found = p[off / sizeof (ulong_t)];
 
 		if (unlikely(found != zio_buf_canary)) {
 			PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
 			    p, size, off, found, zio_buf_canary);
 		}
 	}
 #endif
 }
 
 /*
  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
  * excess / transient data in-core during a crashdump.
  *
  * 'size' selects the zio_buf_cache slot and must map to a valid one
  * (verified).  The buffer comes back with its slack canary-filled in
  * debug builds.
  */
 void *
 zio_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 	void *buf;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_allocs[c], 1);
 #endif
 
 	buf = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
 	zio_buf_put_canary(buf, size, zio_buf_cache, c);
 
 	return (buf);
 }
 
 /*
  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
  * crashdump if the kernel panics.  This exists so that we will limit the
  * amount of ZFS data that shows up in a kernel crashdump (thus reducing
  * the amount of kernel heap dumped to disk when the kernel panics).
  *
  * 'size' selects the zio_data_buf_cache slot and must map to a valid one
  * (verified).
  */
 void *
 zio_data_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 	void *buf;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	buf = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
 	zio_buf_put_canary(buf, size, zio_data_buf_cache, c);
 
 	return (buf);
 }
 
 /*
  * Free a buffer obtained from zio_buf_alloc().  'size' must be the size
  * that was passed at allocation time: it selects the cache the buffer is
  * returned to and, in debug builds, where the canary is checked.
  */
 void
 zio_buf_free(void *buf, size_t size)
 {
 	const size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_frees[c], 1);
 #endif
 
 	/* Catch overflows past 'size' before the memory is recycled. */
 	zio_buf_check_canary(buf, size, zio_buf_cache, c);
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
 /*
  * Free a buffer obtained from zio_data_buf_alloc().  'size' must be the
  * size that was passed at allocation time: it selects the cache the buffer
  * is returned to and, in debug builds, where the canary is checked.
  */
 void
 zio_data_buf_free(void *buf, size_t size)
 {
 	const size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	/* Catch overflows past 'size' before the memory is recycled. */
 	zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
 	kmem_cache_free(zio_data_buf_cache[c], buf);
 }
 
 /*
  * Free an ABD through a (void *, size_t) callback signature.  The size
  * argument is ignored.
  */
 static void
 zio_abd_free(void *abd, size_t size)
 {
 	abd_t *victim = abd;
 
 	(void) size;
 	abd_free(victim);
 }
 
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
 /*
  * Push a transform onto the zio's transform stack and switch the zio to
  * the new buffer 'data' of 'size' bytes.
  *
  * The zio's current abd and size are saved so zio_pop_transforms() can
  * restore them.  A non-zero 'bufsize' makes the pop free 'data';
  * 'transform', if not NULL, is invoked at pop time.
  */
 void
 zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *frame;
 
 	frame = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
 	/* What to restore, and how, when this entry is popped. */
 	frame->zt_transform = transform;
 	frame->zt_bufsize = bufsize;
 	frame->zt_orig_abd = zio->io_abd;
 	frame->zt_orig_size = zio->io_size;
 
 	/* Link it at the head of the stack. */
 	frame->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = frame;
 
 	/* From now on the zio operates on the new buffer. */
 	zio->io_abd = data;
 	zio->io_size = size;
 }
 
 /*
  * Unwind the zio's whole transform stack, newest entry first.
  *
  * For each entry: run its transform callback (if any) with the saved
  * abd and size, free the current abd if the entry was pushed with a
  * non-zero bufsize, then restore the saved abd and size and free the
  * entry.
  */
 void
 zio_pop_transforms(zio_t *zio)
 {
 	zio_transform_t *zt;
 
 	for (zt = zio->io_transform_stack; zt != NULL;
 	    zt = zio->io_transform_stack) {
 		if (zt->zt_transform != NULL) {
 			zt->zt_transform(zio,
 			    zt->zt_orig_abd, zt->zt_orig_size);
 		}
 
 		if (zt->zt_bufsize != 0)
 			abd_free(zio->io_abd);
 
 		zio->io_abd = zt->zt_orig_abd;
 		zio->io_size = zt->zt_orig_size;
 		zio->io_transform_stack = zt->zt_next;
 
 		kmem_free(zt, sizeof (zio_transform_t));
 	}
 }
 
 /*
  * ==========================================================================
  * I/O transform callbacks for subblocks, decompression, and decryption
  * ==========================================================================
  */
 /*
  * Transform callback for sub-block I/O: for a read, copy the first 'size'
  * bytes of the zio's (larger) current buffer into 'data'.  Other I/O types
  * need no copy.
  */
 static void
 zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type != ZIO_TYPE_READ)
 		return;
 
 	abd_copy(data, zio->io_abd, size);
 }
 
 /*
  * Transform callback that decompresses the zio's current buffer into
  * 'data' ('size' bytes of output).  Does nothing if the zio has already
  * failed.  Any decompression failure, real or injected, is recorded on
  * the zio as EIO.
  */
 static void
 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
 	void *tmp;
 	int ret;
 
 	if (zio->io_error != 0)
 		return;
 
 	tmp = abd_borrow_buf(data, size);
 	ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 	    zio->io_abd, tmp, zio->io_size, size,
 	    &zio->io_prop.zp_complevel);
 	abd_return_buf_copy(data, tmp, size);
 
 	if (zio_injection_enabled && ret == 0)
 		ret = zio_handle_fault_injection(zio, EINVAL);
 
 	if (ret != 0)
 		zio->io_error = SET_ERROR(EIO);
 }
 
 /*
  * Transform callback that authenticates and/or decrypts the zio's current
  * buffer into 'data' ('size' bytes).  Does nothing if the zio has already
  * failed.
  *
  * Three kinds of block are handled:
  *   - blocks whose bp carries a checksum of child MACs: the checksum is
  *     verified (no key needed) and the data copied through unchanged;
  *   - authenticated blocks: the MAC is verified and the data copied
  *     through unchanged;
  *   - everything else: the data is decrypted into 'data'.
  *
  * On failure the error is stored in zio->io_error.  ECKSUM is reported as
  * EIO and, for non-speculative zios, also logged as a pool error and
  * posted as an authentication ereport.
  */
 static void
 zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int ret;
 	void *tmp;
 	blkptr_t *bp = zio->io_bp;
 	spa_t *spa = zio->io_spa;
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	uint64_t lsize = BP_GET_LSIZE(bp);
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(size, !=, 0);
 
 	if (zio->io_error != 0)
 		return;
 
 	/*
 	 * Verify the cksum of MACs stored in an indirect bp. It will always
 	 * be possible to verify this since it does not require an encryption
 	 * key.
 	 */
 	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 			/*
 			 * We haven't decompressed the data yet, but
 			 * zio_crypt_do_indirect_mac_checksum() requires
 			 * decompressed data to be able to parse out the MACs
 			 * from the indirect block. We decompress it now and
 			 * throw away the result after we are finished.
 			 */
 			tmp = zio_buf_alloc(lsize);
 			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
 			    zio->io_abd, tmp, zio->io_size, lsize,
 			    &zio->io_prop.zp_complevel);
 			if (ret != 0) {
 				/* Don't leak the scratch buffer on failure. */
 				zio_buf_free(tmp, lsize);
 				ret = SET_ERROR(EIO);
 				goto error;
 			}
 			ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
 			    tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
 			zio_buf_free(tmp, lsize);
 		} else {
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
 		}
 		/* The payload itself is not encrypted; pass it through. */
 		abd_copy(data, zio->io_abd, size);
 
 		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
 			ret = zio_handle_decrypt_injection(spa,
 			    &zio->io_bookmark, ot, ECKSUM);
 		}
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	/*
 	 * If this is an authenticated block, just check the MAC. It would be
 	 * nice to separate this out into its own flag, but when this was done,
 	 * we had run out of bits in what is now zio_flag_t. Future cleanup
 	 * could make this a flag bit.
 	 */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		if (ot == DMU_OT_OBJSET) {
 			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
 			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
 		} else {
 			zio_crypt_decode_mac_bp(bp, mac);
 			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
 			    zio->io_abd, size, mac);
 			if (zio_injection_enabled && ret == 0) {
 				ret = zio_handle_decrypt_injection(spa,
 				    &zio->io_bookmark, ot, ECKSUM);
 			}
 		}
 		/* The payload itself is not encrypted; pass it through. */
 		abd_copy(data, zio->io_abd, size);
 
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	zio_crypt_decode_params_bp(bp, salt, iv);
 
 	/* For ZIL blocks the MAC is read from the block's zil_chain_t. */
 	if (ot == DMU_OT_INTENT_LOG) {
 		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
 		zio_crypt_decode_mac_zil(tmp, mac);
 		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
 	} else {
 		zio_crypt_decode_mac_bp(bp, mac);
 	}
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
 	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
 	    zio->io_abd, &no_crypt);
 	/* Nothing was decrypted, so copy the data through as-is. */
 	if (no_crypt)
 		abd_copy(data, zio->io_abd, size);
 
 	if (ret != 0)
 		goto error;
 
 	return;
 
 error:
 	/* assert that the key was found unless this was speculative */
 	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	/*
 	 * If there was a decryption / authentication error return EIO as
 	 * the io_error. If this was not a speculative zio, create an ereport.
 	 */
 	if (ret == ECKSUM) {
 		zio->io_error = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			spa_log_error(spa, &zio->io_bookmark,
 			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	} else {
 		zio->io_error = ret;
 	}
 }
 
 /*
  * ==========================================================================
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
 /*
  * Step the cursor *zl through cio's parent links and return the parent
  * it now refers to.  A NULL cursor starts at the head of the list; NULL
  * is returned (and left in *zl) once the list is exhausted.
  */
 zio_t *
 zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
 	list_t *parents = &cio->io_parent_list;
 
 	if (*zl == NULL)
 		*zl = list_head(parents);
 	else
 		*zl = list_next(parents, *zl);
 
 	if (*zl != NULL) {
 		ASSERT((*zl)->zl_child == cio);
 		return ((*zl)->zl_parent);
 	}
 
 	return (NULL);
 }
 
 /*
  * Step the cursor *zl through pio's child links and return the child it
  * now refers to.  A NULL cursor starts at the head of the list; NULL is
  * returned (and left in *zl) once the list is exhausted.  The caller is
  * expected to hold pio->io_lock.
  */
 zio_t *
 zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
 	list_t *children = &pio->io_child_list;
 
 	ASSERT(MUTEX_HELD(&pio->io_lock));
 
 	if (*zl == NULL)
 		*zl = list_head(children);
 	else
 		*zl = list_next(children, *zl);
 
 	if (*zl != NULL) {
 		ASSERT((*zl)->zl_parent == pio);
 		return ((*zl)->zl_child);
 	}
 
 	return (NULL);
 }
 
 /*
  * Return the first parent of cio (NULL if it has none) and verify that
  * there is no second one.
  */
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
 	zio_link_t *zl = NULL;
 	zio_t *sole;
 
 	sole = zio_walk_parents(cio, &zl);
 
 	/* Walking one step further must run off the end of the list. */
 	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 
 	return (sole);
 }
 
 /*
  * Link cio under pio.  A zio_link_t is allocated and, with both io_locks
  * held (parent first), inserted into pio's child list and cio's parent
  * list; pio's per-wait-type child counts are bumped for every wait type
  * cio has not reached yet.
  */
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
 	zio_link_t *zl;
 	uint64_t *pending;
 	int w;
 
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* Count the child once for each wait type it has yet to pass. */
 	pending = pio->io_children[cio->io_child_type];
 	for (w = 0; w < ZIO_WAIT_TYPES; w++) {
 		if (!cio->io_state[w])
 			pending[w]++;
 	}
 
 	list_insert_head(&pio->io_child_list, zl);
 	list_insert_head(&cio->io_parent_list, zl);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Variant of zio_add_child() for a cio that has no parents yet: the
  * link is put on cio's (asserted empty) parent list without taking
  * cio->io_lock, and only pio->io_lock is held while pio's child counts
  * and child list are updated.
  */
 void
 zio_add_child_first(zio_t *pio, zio_t *cio)
 {
 	zio_link_t *zl;
 	uint64_t *pending;
 	int w;
 
 	/*
 	 * Logical I/Os can have logical, gang, or vdev children.
 	 * Gang I/Os can have gang or vdev children.
 	 * Vdev I/Os can only have vdev children.
 	 * The following ASSERT captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	zl->zl_parent = pio;
 	zl->zl_child = cio;
 
 	ASSERT(list_is_empty(&cio->io_parent_list));
 	list_insert_head(&cio->io_parent_list, zl);
 
 	mutex_enter(&pio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* Count the child once for each wait type it has yet to pass. */
 	pending = pio->io_children[cio->io_child_type];
 	for (w = 0; w < ZIO_WAIT_TYPES; w++) {
 		if (!cio->io_state[w])
 			pending[w]++;
 	}
 
 	list_insert_head(&pio->io_child_list, zl);
 
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Undo a parent/child link: with both io_locks held (parent first) take
  * zl off pio's child list and cio's parent list, then free the link.
  */
 static void
 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
 	list_t *children = &pio->io_child_list;
 	list_t *parents = &cio->io_parent_list;
 
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	list_remove(children, zl);
 	list_remove(parents, zl);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 
 	kmem_cache_free(zio_link_cache, zl);
 }
 
 /*
  * Check, under zio->io_lock, whether any child type selected by
  * 'childbits' still has children outstanding for the given wait type.
  * On the first such type the zio's stage is shifted back by one, io_stall
  * is pointed at that counter and B_TRUE is returned; otherwise B_FALSE.
  */
 static boolean_t
 zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
 {
 	boolean_t stalled = B_FALSE;
 	int c;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
 
 	for (c = 0; c < ZIO_CHILD_TYPES && !stalled; c++) {
 		uint64_t *pending = &zio->io_children[c][wait];
 
 		if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)) || *pending == 0)
 			continue;
 
 		zio->io_stage >>= 1;
 		ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 		zio->io_stall = pending;
 		stalled = B_TRUE;
 	}
 
 	mutex_exit(&zio->io_lock);
 
 	return (stalled);
 }
 
 /*
  * Tell parent 'pio' that child 'zio' has passed wait point 'wait'.
  *
  * Under pio->io_lock this folds the child's error (unless the child has
  * ZIO_FLAG_DONT_PROPAGATE set) and its reexecute bits into the parent and
  * decrements the parent's outstanding-child counter for this child type
  * and wait type.  If the counter drops to zero and it is the one the
  * parent is stalled on, the stall is cleared and the parent is resumed:
  * either handed back to the caller through *next_to_executep or
  * dispatched to a taskq.
  */
 __attribute__((always_inline))
 static inline void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
     zio_t **next_to_executep)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 	int *errorp = &pio->io_child_error[zio->io_child_type];
 
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		/*
 		 * Choose the taskq type from the parent's current stage
 		 * while io_lock is still held.
 		 */
 		zio_taskq_type_t type =
 		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
 		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
 
 		/*
 		 * If we can tell the caller to execute this parent next, do
 		 * so. We only do this if the parent's zio type matches the
 		 * child's type. Otherwise dispatch the parent zio in its
 		 * own taskq.
 		 *
 		 * Having the caller execute the parent when possible reduces
 		 * locking on the zio taskq's, reduces context switch
 		 * overhead, and has no recursion penalty.  Note that one
 		 * read from disk typically causes at least 3 zio's: a
 		 * zio_null(), the logical zio_read(), and then a physical
 		 * zio.  When the physical ZIO completes, we are able to call
 		 * zio_done() on all 3 of these zio's from one invocation of
 		 * zio_execute() by returning the parent back to
 		 * zio_execute().  Since the parent isn't executed until this
 		 * thread returns back to zio_execute(), the caller should do
 		 * so promptly.
 		 *
 		 * In other cases, dispatching the parent prevents
 		 * overflowing the stack when we have deeply nested
 		 * parent-child relationships, as we do with the "mega zio"
 		 * of writes for spa_sync(), and the chain of ZIL blocks.
 		 */
 		if (next_to_executep != NULL && *next_to_executep == NULL &&
 		    pio->io_type == zio->io_type) {
 			*next_to_executep = pio;
 		} else {
 			zio_taskq_dispatch(pio, type, B_FALSE);
 		}
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
 /*
  * Adopt the recorded error of child type 'c' as the zio's own error,
  * but only when the zio has no error of its own yet.
  */
 static void
 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 {
 	int child_error = zio->io_child_error[c];
 
 	if (zio->io_error != 0 || child_error == 0)
 		return;
 
 	zio->io_error = child_error;
 }
 
 int
 zio_bookmark_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
 	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
 		return (-1);
 	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
 		return (1);
 
 	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
 		return (-1);
 	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
 		return (1);
 
 	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
 		return (-1);
 	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
 		return (1);
 
 	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
 		return (-1);
 	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
 		return (1);
 
 	if (z1 < z2)
 		return (-1);
 	if (z1 > z2)
 		return (1);
 
 	return (0);
 }
 
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
  * ==========================================================================
  */
 /*
  * Common constructor behind the zio_*() creation functions in this file.
  *
  * Allocates a zeroed zio from zio_cache, initializes its lock, condition
  * variable, parent/child lists and allocation trace, derives the child
  * type from 'vd' and 'flags', records the block pointer (if any) and the
  * remaining arguments, and finally links the new zio under 'pio' when a
  * parent is given.  Returns the new zio.
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
     void *private, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, vdev_t *vd, uint64_t offset,
     const zbookmark_phys_t *zb, enum zio_stage stage,
     enum zio_stage pipeline)
 {
 	zio_t *zio;
 
 	IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
 	IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
 
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	memset(zio, 0, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zio->io_parent_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
 	metaslab_trace_init(&zio->io_alloc_list);
 
 	/* A vdev takes precedence over the GANG/DDT child flags. */
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
 		/*
 		 * Non-write zios and DDT children work on a private copy of
 		 * the bp; other writes operate on the caller's bp directly.
 		 */
 		if (type != ZIO_TYPE_WRITE ||
 		    zio->io_child_type == ZIO_CHILD_DDT) {
 			zio->io_bp_copy = *bp;
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		} else {
 			zio->io_bp = (blkptr_t *)bp;
 		}
 		zio->io_bp_orig = *bp;
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
 	}
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
 	zio->io_orig_abd = zio->io_abd = data;
 	zio->io_orig_size = zio->io_size = psize;
 	zio->io_lsize = lsize;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 	zio->io_allocator = ZIO_ALLOCATOR_NONE;
 
 	/*
 	 * The zio counts as READY from the start if it is created at or
 	 * past the READY stage, or if its pipeline has no READY stage at
 	 * all; it counts as DONE only if created at or past the DONE stage.
 	 */
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
 	    (pipeline & ZIO_STAGE_READY) == 0;
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
 		/* Inherit context from the parent before linking to it. */
 		zio->io_metaslab_class = pio->io_metaslab_class;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child_first(pio, zio);
 	}
 
 	taskq_init_ent(&zio->io_tqent);
 
 	return (zio);
 }
 
 /*
  * Release a zio: tear down the allocation trace, the parent and child
  * lists, the lock and the condition variable, then return the zio to
  * zio_cache.
  */
 void
 zio_destroy(zio_t *zio)
 {
 	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
 }
 
 /*
  * Create a ZIO_TYPE_NULL zio that runs ZIO_INTERLOCK_PIPELINE.  It is
  * meant to sit between other zios: it provides synchronization at the
  * READY and DONE pipeline stages and calls the respective callbacks.
  */
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
     void *private, zio_flag_t flags)
 {
 	return (zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE));
 }
 
 /*
  * Create a parentless ZIO_TYPE_NULL zio that runs ZIO_ROOT_PIPELINE, to
  * serve as the root of a zio tree.  In contrast to zio_null() it has no
  * READY pipeline stage (it is ready on creation), so it should not be
  * used as a child of any ZIO that may need to wait for its grandchildren's
  * READY stage (any other ZIO type).
  */
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	return (zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE));
 }
 
 /*
  * Report one problem found in a block pointer.  The raw bp contents are
  * always written to the debug log; the formatted message is then passed
  * to zfs_panic_recover() (BLK_VERIFY_HALT), written to the debug log
  * (BLK_VERIFY_LOG) or dropped (BLK_VERIFY_ONLY).  Always returns 1 so
  * callers can add the result to an error count.
  */
 static int
 zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
     enum blk_verify_flag blk_verify, const char *fmt, ...)
 {
 	char msg[256];
 	va_list args;
 
 	va_start(args, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, args);
 	va_end(args);
 
 	zfs_dbgmsg("bad blkptr at %px: "
 	    "DVA[0]=%#llx/%#llx "
 	    "DVA[1]=%#llx/%#llx "
 	    "DVA[2]=%#llx/%#llx "
 	    "prop=%#llx "
 	    "pad=%#llx,%#llx "
 	    "phys_birth=%#llx "
 	    "birth=%#llx "
 	    "fill=%#llx "
 	    "cksum=%#llx/%#llx/%#llx/%#llx",
 	    bp,
 	    (long long)bp->blk_dva[0].dva_word[0],
 	    (long long)bp->blk_dva[0].dva_word[1],
 	    (long long)bp->blk_dva[1].dva_word[0],
 	    (long long)bp->blk_dva[1].dva_word[1],
 	    (long long)bp->blk_dva[2].dva_word[0],
 	    (long long)bp->blk_dva[2].dva_word[1],
 	    (long long)bp->blk_prop,
 	    (long long)bp->blk_pad[0],
 	    (long long)bp->blk_pad[1],
 	    (long long)BP_GET_PHYSICAL_BIRTH(bp),
 	    (long long)BP_GET_LOGICAL_BIRTH(bp),
 	    (long long)bp->blk_fill,
 	    (long long)bp->blk_cksum.zc_word[0],
 	    (long long)bp->blk_cksum.zc_word[1],
 	    (long long)bp->blk_cksum.zc_word[2],
 	    (long long)bp->blk_cksum.zc_word[3]);
 
 	/* BLK_VERIFY_ONLY (and anything else) reports nothing further. */
 	if (blk_verify == BLK_VERIFY_HALT)
 		zfs_panic_recover("%s: %s", spa_name(spa), msg);
 	else if (blk_verify == BLK_VERIFY_LOG)
 		zfs_dbgmsg("%s: %s", spa_name(spa), msg);
 
 	return (1);
 }
 
 /*
  * Verify the block pointer fields contain reasonable values.  This means
  * it only contains known object types, checksum/compression identifiers,
  * block sizes within the maximum allowed limits, valid DVAs, etc.
  *
  * If everything checks out B_TRUE is returned.  The blk_verify
  * argument controls the behavior when an invalid field is detected.
  *
  * Values for blk_verify_flag:
  *   BLK_VERIFY_ONLY: evaluate the block
  *   BLK_VERIFY_LOG: evaluate the block and log problems
  *   BLK_VERIFY_HALT: call zfs_panic_recover on error
  *
  * Values for blk_config_flag:
  *   BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
  *   BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
  *   obtained for reader
  *   BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
  *   performance
  */
 boolean_t
 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
 {
 	int errors = 0;
 
 	/* Checks that need no pool configuration. */
 	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid TYPE %llu",
 		    bp, (longlong_t)BP_GET_TYPE(bp));
 	}
 	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid CHECKSUM %llu",
 		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
 	}
 	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid COMPRESS %llu",
 		    bp, (longlong_t)BP_GET_COMPRESS(bp));
 	}
 	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid LSIZE %llu",
 		    bp, (longlong_t)BP_GET_LSIZE(bp));
 	}
 	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid PSIZE %llu",
 		    bp, (longlong_t)BP_GET_PSIZE(bp));
 	}
 
 	if (BP_IS_EMBEDDED(bp)) {
 		if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px has invalid ETYPE %llu",
 			    bp, (longlong_t)BPE_GET_ETYPE(bp));
 		}
 	}
 
 	/*
 	 * Do not verify individual DVAs if the config is not trusted. This
 	 * will be done once the zio is executed in vdev_mirror_map_alloc.
 	 */
 	if (!spa->spa_trust_config)
 		return (errors == 0);
 
 	/*
 	 * Only BLK_CONFIG_NEEDED takes the config lock here; it is dropped
 	 * again after the DVA loop below.
 	 */
 	switch (blk_config) {
 	case BLK_CONFIG_HELD:
 		ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
 		break;
 	case BLK_CONFIG_NEEDED:
 		spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
 		break;
 	case BLK_CONFIG_SKIP:
 		return (errors == 0);
 	default:
 		panic("invalid blk_config %u", blk_config);
 	}
 
 	/*
 	 * Pool-specific checks.
 	 *
 	 * Note: it would be nice to verify that the logical birth
 	 * and physical birth are not too large.  However,
 	 * spa_freeze() allows the birth time of log blocks (and
 	 * dmu_sync()-ed blocks that are in the log) to be arbitrarily
 	 * large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 		uint64_t vdevid = DVA_GET_VDEV(dva);
 
 		if (vdevid >= spa->spa_root_vdev->vdev_children) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 		if (vd == NULL) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_hole_ops) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has hole VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_missing_ops) {
 			/*
 			 * "missing" vdevs are valid during import, but we
 			 * don't have their detailed info (e.g. asize), so
 			 * we can't perform any more checks on them.
 			 */
 			continue;
 		}
 		uint64_t offset = DVA_GET_OFFSET(dva);
 		uint64_t asize = DVA_GET_ASIZE(dva);
 		/* Gang DVAs are checked against the gang header size. */
 		if (DVA_GET_GANG(dva))
 			asize = vdev_gang_header_asize(vd);
 		if (offset + asize > vd->vdev_asize) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid OFFSET %llu",
 			    bp, i, (longlong_t)offset);
 		}
 	}
 	if (blk_config == BLK_CONFIG_NEEDED)
 		spa_config_exit(spa, SCL_VDEV, bp);
 
 	return (errors == 0);
 }
 
 /*
  * Return B_TRUE if 'dva' names an existing top-level vdev that is
  * neither a hole nor missing, and the DVA's extent (the gang header size
  * for gang DVAs) ends within that vdev's asize.  'bp' is unused.
  */
 boolean_t
 zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
 {
 	(void) bp;
 
 	uint64_t id = DVA_GET_VDEV(dva);
 	if (id >= spa->spa_root_vdev->vdev_children)
 		return (B_FALSE);
 
 	vdev_t *vd = spa->spa_root_vdev->vdev_child[id];
 	if (vd == NULL || vd->vdev_ops == &vdev_hole_ops ||
 	    vd->vdev_ops == &vdev_missing_ops)
 		return (B_FALSE);
 
 	uint64_t asize = DVA_GET_GANG(dva) ?
 	    vdev_gang_header_asize(vd) : DVA_GET_ASIZE(dva);
 
 	if (DVA_GET_OFFSET(dva) + asize > vd->vdev_asize)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Create a logical read zio for 'bp'.  The same 'size' is used as both
  * logical and physical size; DDT children get
  * ZIO_DDT_CHILD_READ_PIPELINE, everything else ZIO_READ_PIPELINE.
  */
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	enum zio_stage pipeline = (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE;
 
 	return (zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, pipeline));
 }
 
 /*
  * Create a logical write zio.  The write properties in 'zp' are copied
  * into the zio, and the 'ready' / 'children_ready' callbacks are
  * recorded next to the 'done' callback.  DDT children get
  * ZIO_DDT_CHILD_WRITE_PIPELINE, everything else ZIO_WRITE_PIPELINE.
  */
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	enum zio_stage pipeline = (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE;
 	zio_prop_t *props;
 	zio_t *zio;
 
 	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
 	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, pipeline);
 
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
 	zio->io_prop = *zp;
 	props = &zio->io_prop;
 
 	/*
 	 * A NULL data buffer is allowed when zio_write_override() will
 	 * later supply an already-allocated BP.  Verifying a dedup hit,
 	 * however, needs the data, and so do encrypted dedup blocks; in
 	 * either of those cases turn dedup off and take the BP verbatim.
 	 */
 	if (data == NULL && (props->zp_dedup_verify || props->zp_encrypt)) {
 		props->zp_dedup_verify = B_FALSE;
 		props->zp_dedup = B_FALSE;
 	}
 
 	return (zio);
 }
 
 /*
  * Create a write zio that runs ZIO_REWRITE_PIPELINE with
  * ZIO_FLAG_IO_REWRITE added to the caller's flags.  'size' is used as
  * both the logical and the physical size.
  */
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb)
 {
 	return (zio_create(pio, spa, txg, bp, data, size, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE));
 }
 
 /*
  * Attach an already-allocated block pointer to a not yet started logical
  * write in the syncing txg, and adjust the write properties (copies,
  * nopwrite, brtwrite, dedup) to go with it.
  */
 void
 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
     boolean_t brtwrite)
 {
 	zio_prop_t *props = &zio->io_prop;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 	ASSERT(!brtwrite || !nopwrite);
 
 	/*
 	 * The io_prop must be put back to the values in effect when
 	 * dmu_sync() first wrote the bp.  nopwrite and dedup are mutually
 	 * exclusive, so a nopwrite override switches dedup off.
 	 */
 	if (nopwrite)
 		props->zp_dedup = B_FALSE;
 	props->zp_nopwrite = nopwrite;
 	props->zp_brtwrite = brtwrite;
 	props->zp_copies = copies;
 
 	zio->io_bp_override = bp;
 }
 
 /*
  * Free the block described by 'bp' in 'txg'.  The bp is verified first
  * (BLK_VERIFY_HALT).  Embedded bps are ignored; all others are either
  * freed on the spot through zio_free_sync() or appended to the pool's
  * per-txg free bplist for later processing.
  */
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 	boolean_t defer;
 
 	(void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	/*
 	 * Handling EMBEDDED bps here (by ignoring them) is a performance
 	 * optimization: it saves putting them on the list only to have
 	 * zio_free_sync() skip them later.
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * A free can be processed immediately only if it is for the
 	 * currently-syncing txg, is not going to be deferred, and will not
 	 * need a read (i.e. not GANG or DEDUP).  Everything else goes on
 	 * the in-memory list for later processing.
 	 *
 	 * Note that we only defer frees after zfs_sync_pass_deferred_free
 	 * when the log space map feature is disabled. [see relevant comment
 	 * in spa_sync_iterate_to_convergence()]
 	 */
 	defer = BP_IS_GANG(bp) ||
 	    BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
 	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
 	    brt_maybe_exists(spa, bp);
 
 	if (!defer) {
 		VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
 		return;
 	}
 
 	metaslab_check_free(spa, bp);
 	bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 }
 
 /*
  * Free 'bp' in the currently syncing txg.  For performance this returns
  * NULL whenever the free could be completed right here, which avoids
  * creating a zio (and linking it to the parent, etc).  Otherwise a
  * ZIO_TYPE_FREE zio is created under 'pio' and returned.
  */
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_flag_t flags)
 {
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (NULL);
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 	dsl_scan_freed(spa, bp);
 
 	/* The common case: nothing to read, so free immediately. */
 	if (!BP_IS_GANG(bp) &&
 	    !BP_GET_DEDUP(bp) &&
 	    !brt_maybe_exists(spa, bp)) {
 		metaslab_free(spa, bp, txg, B_FALSE);
 		return (NULL);
 	}
 
 	/*
 	 * GANG, DEDUP and BRT blocks can induce a read (for the gang
 	 * block header, the DDT or the BRT), so issue them
 	 * asynchronously so that this thread is not tied up.
 	 */
 	enum zio_stage pipeline = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
 
 	return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    BP_GET_PSIZE(bp), NULL, NULL,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
 	    flags, NULL, 0, NULL, ZIO_STAGE_OPEN, pipeline));
 }
 
 /*
  * Create a ZIO_TYPE_CLAIM zio for 'bp'.  The bp is verified first
  * (BLK_VERIFY_HALT), using the caller's config lock when
  * ZIO_FLAG_CONFIG_WRITER is set.  An embedded bp yields a plain
  * zio_null() child of 'pio' instead.
  */
 zio_t *
 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	enum blk_config_flag config = (flags & ZIO_FLAG_CONFIG_WRITER) ?
 	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED;
 	zio_t *zio;
 
 	(void) zfs_blkptr_verify(spa, bp, config, BLK_VERIFY_HALT);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	/*
 	 * A claim is an allocation of a specific block.  Claims are needed
 	 * to support immediate writes in the intent log.  The issue is that
 	 * immediate writes contain committed data, but in a txg that was
 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
 	 * the intent log claims all blocks that contain immediate write data
 	 * so that the SPA knows they're in use.
 	 *
 	 * All claims *must* be resolved in the first txg -- before the SPA
 	 * starts allocating blocks -- so that nothing is allocated twice.
 	 * If txg == 0 we just verify that the block is claimable.
 	 */
 	ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
 	    spa_min_claim_txg(spa));
 	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(8) */
 
 	zio = zio_create(pio, spa, txg, bp, NULL,
 	    BP_GET_PSIZE(bp), BP_GET_PSIZE(bp),
 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
 
 /*
  * Create a physical ZIO_TYPE_TRIM zio for the range [offset, offset +
  * size) of a childless vdev.  Offset and size are asserted to be
  * aligned to the vdev's ashift and the size to be non-zero.
  */
 zio_t *
 zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags)
 {
 	zio_flag_t phys_flags = flags | ZIO_FLAG_PHYSICAL;
 	zio_t *trim;
 
 	ASSERT0(vd->vdev_children);
 	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	ASSERT3U(size, !=, 0);
 
 	trim = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size,
 	    done, private, ZIO_TYPE_TRIM, priority, phys_flags,
 	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
 	trim->io_trim_flags = trim_flags;
 
 	return (trim);
 }
 
 /*
  * Create a physical read zio against a childless vdev, recording
  * 'checksum' in the zio's properties.  With 'labels' set, the range is
  * asserted to lie in the label area at the start or end of the vdev.
  */
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	zio_flag_t phys_flags = flags | ZIO_FLAG_PHYSICAL;
 	zio_t *rio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	rio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, ZIO_TYPE_READ, priority, phys_flags,
 	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 	rio->io_prop.zp_checksum = checksum;
 
 	return (rio);
 }
 
 /*
  * Create a physical write zio against a childless vdev, recording
  * 'checksum' in the zio's properties.  With 'labels' set, the range is
  * asserted to lie in the label area at the start or end of the vdev.
  * For checksums flagged ZCHECKSUM_FLAG_EMBEDDED the data is written from
  * a private copy pushed as a transform.
  */
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	zio_flag_t phys_flags = flags | ZIO_FLAG_PHYSICAL;
 	zio_t *wio;
 
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	wio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, ZIO_TYPE_WRITE, priority, phys_flags,
 	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 	wio->io_prop.zp_checksum = checksum;
 
 	if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED))
 		return (wio);
 
 	/*
 	 * zec checksums are necessarily destructive: the verifier/checksum
 	 * is stored in the tail of the write buffer.  The same data may be
 	 * on its way to several places in parallel, so work on a local
 	 * copy instead of the caller's buffer.
 	 */
 	abd_t *copy = abd_alloc_sametype(data, size);
 	abd_copy(copy, data, size);
 	zio_push_transform(wio, copy, size, size, NULL);
 
 	return (wio);
 }
 
 /*
  * Create a child I/O to do some work for us.
  *
  * The child is a vdev zio on 'vd' under parent 'pio'; it inherits the
  * parent's spa, txg and bookmark and starts one stage before
  * ZIO_STAGE_VDEV_IO_START.  Note that this may modify the parent: for a
  * read with a bp, checksum verification is moved from pio's pipeline to
  * the child's.
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
     abd_t *data, uint64_t size, int type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
 
 	/*
 	 * vdev child I/Os do not propagate their error to the parent.
 	 * Therefore, for correct operation the caller *must* check for
 	 * and handle the error in the child i/o's done callback.
 	 * The only exceptions are i/os that we don't care about
 	 * (OPTIONAL or REPAIR).
 	 */
 	ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
 	    done != NULL);
 
 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
 		 * If we have the bp, then the child should perform the
 		 * checksum and the parent need not.  This pushes error
 		 * detection as close to the leaves as possible and
 		 * eliminates redundant checksums in the interior nodes.
 		 */
 		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
 	/* On leaf vdevs, shift the offset by VDEV_LABEL_START_SIZE. */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		ASSERT0(vd->vdev_children);
 		offset += VDEV_LABEL_START_SIZE;
 	}
 
 	/* Add the flags the child takes over from its parent. */
 	flags |= ZIO_VDEV_CHILD_FLAGS(pio);
 
 	/*
 	 * If we've decided to do a repair, the write is not speculative --
 	 * even if the original read was.
 	 */
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
 	/*
 	 * If we're creating a child I/O that is not associated with a
 	 * top-level vdev, then the child zio is not an allocating I/O.
 	 * If this is a retried I/O then we ignore it since we will
 	 * have already processed the original allocating I/O.
 	 */
 	if (flags & ZIO_FLAG_IO_ALLOCATING &&
 	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
 		ASSERT(pio->io_metaslab_class != NULL);
 		ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
 		ASSERT(type == ZIO_TYPE_WRITE);
 		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
 		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
 		    pio->io_child_type == ZIO_CHILD_GANG);
 
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 
 	return (zio);
 }
 
 /*
  * Create a parentless zio aimed at the leaf vdev 'vd'.  The caller's flags
  * are extended with CANFAIL, DONT_RETRY and DELEGATED, and the zio is
  * created with its stage set to ZIO_STAGE_VDEV_IO_START >> 1 and the vdev
  * child pipeline.
  */
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
     zio_type_t type, zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *private)
 {
 	const zio_flag_t delegated_flags = flags | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	return (zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, type, priority, delegated_flags, vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE));
 }
 
 
 /*
  * Send a flush command to the given vdev. Unlike most zio creation functions,
  * the flush zios are issued immediately. You can wait on pio to pause until
  * the flushes complete.
  *
  * Nothing is issued for a vdev that has vdev_nowritecache set.  A vdev with
  * children is not flushed itself; the call recurses into each child, so
  * zios are only created for vdevs that have no children.  Each one is a
  * ZIO_TYPE_IOCTL child of pio created with the CANFAIL, DONT_PROPAGATE and
  * DONT_RETRY flags.
  */
 void
 zio_flush(zio_t *pio, vdev_t *vd)
 {
 	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
 	    ZIO_FLAG_DONT_RETRY;
 
 	if (vd->vdev_nowritecache)
 		return;
 
 	if (vd->vdev_children == 0) {
-		zio_t *zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
+		zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
 		    NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0,
-		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-		zio->io_cmd = DKIOCFLUSHWRITECACHE;
-		zio_nowait(zio);
+		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
 	}
 }
 
 /*
  * Reduce the size of a zio that has not started executing to 'size' bytes.
  * The zio must still have its original size and its bp must be
  * uncompressed.  Raidz blocks are left unchanged.
  */
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
 	ASSERT3P(zio->io_executor, ==, NULL);
 	ASSERT3U(zio->io_orig_size, ==, zio->io_size);
 	ASSERT3U(size, <=, zio->io_size);
 
 	/* Note, BP_IS_RAIDZ() assumes no compression. */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 
 	/*
 	 * We don't shrink for raidz because of problems with the
 	 * reconstruction when reading back less than the block size.
 	 */
 	if (BP_IS_RAIDZ(zio->io_bp))
 		return;
 
 	/* we are not doing a raw write */
 	ASSERT3U(zio->io_size, ==, zio->io_lsize);
 	zio->io_lsize = size;
 	zio->io_size = size;
 	zio->io_orig_size = size;
 }
 
 /*
  * Round the provided allocation size up to a value that can be allocated
  * by at least some vdev(s) in the pool with minimum or no additional
  * padding and without extra space usage on others.
  *
  * Sizes at or below spa_min_alloc become spa_min_alloc; larger sizes are
  * rounded up to a multiple of spa_gcd_alloc.
  */
 static uint64_t
 zio_roundup_alloc_size(spa_t *spa, uint64_t size)
 {
 	if (size <= spa->spa_min_alloc)
 		return (spa->spa_min_alloc);
 
 	return (roundup(size, spa->spa_gcd_alloc));
 }
 
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
  * ==========================================================================
  */
 
 /*
  * Pipeline stage that prepares a read of a logical block.
  *
  * Pushes decompress and/or decrypt transforms for logical zios that need
  * them, satisfies reads of embedded-data bps directly from the bp, and
  * switches deduplicated logical reads to the DDT read pipeline.
  * Always returns zio.
  */
 static zio_t *
 zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize =
 	    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	/*
 	 * Compressed logical reads go through a psize-sized buffer that is
 	 * decompressed on completion, unless raw compressed data was
 	 * requested.
 	 */
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decompress);
 	}
 
 	/*
 	 * Protected blocks (unless raw encrypted data was requested) and
 	 * blocks with an indirect MAC checksum get a decrypt transform.
 	 */
 	if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
 	    BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decrypt);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		/*
 		 * The payload lives in the bp itself, so decode it straight
 		 * into the caller's buffer and skip the I/O stages.  psize
 		 * already holds BPE_GET_PSIZE(bp) for an embedded bp, so no
 		 * shadowing local is needed.
 		 */
 		void *data = abd_borrow_buf(zio->io_abd, psize);
 
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		decode_embedded_bp_compressed(bp, data);
 		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
 
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Pipeline stage: apply a block pointer override to an allocating write.
  *
  * Non-allocating zios and zios without io_bp_override pass through
  * untouched.  Otherwise the override is copied into the bp and the pipeline
  * is cut down to ZIO_INTERLOCK_PIPELINE.  The override is then either
  * accepted as-is (BRT write, embedded bp, nopwrite, hole, or dedup off),
  * accepted with the DDT write stage added, or rejected, in which case the
  * original bp and pipeline are restored.  Always returns zio.
  */
 static zio_t *
 zio_write_bp_init(zio_t *zio)
 {
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
 		blkptr_t *bp = zio->io_bp;
 		zio_prop_t *zp = &zio->io_prop;
 
 		ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
 
 		/* Adopt the override and skip the normal write stages. */
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zp->zp_brtwrite)
 			return (zio);
 
 		ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
 
 		if (BP_IS_EMBEDDED(bp))
 			return (zio);
 
 		/*
 		 * If we've been overridden and nopwrite is set then
 		 * set the flag accordingly to indicate that a nopwrite
 		 * has already occurred.
 		 */
 		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 			ASSERT(!zp->zp_dedup);
 			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 			zio->io_flags |= ZIO_FLAG_NOPWRITE;
 			return (zio);
 		}
 
 		ASSERT(!zp->zp_nopwrite);
 
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (zio);
 
 		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		/*
 		 * Dedup is on: the override can be used only if it was
 		 * checksummed with the requested algorithm and the write
 		 * is not encrypted.
 		 */
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
 		    !zp->zp_encrypt) {
 			BP_SET_DEDUP(bp, 1);
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (zio);
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 	}
 
 	return (zio);
 }
 
 /*
  * Pipeline stage: decide the physical size and on-disk form of a write.
  *
  * Returns NULL if the zio has to wait for its logical or gang children to
  * become ready (the stage is then repeated), otherwise returns zio.  For
  * allocating writes it optionally compresses the data, may store a small
  * compressed result as an embedded bp, chooses between the rewrite and
  * write pipelines, and fills in the bp properties.
  */
 static zio_t *
 zio_write_compress(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = zio->io_lsize;
 	uint64_t psize = zio->io_size;
 	uint32_t pass = 1;
 
 	/*
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	if (zio->io_children_ready != NULL) {
 		/*
 		 * Now that all our children are ready, run the callback
 		 * associated with this zio in case it wants to modify the
 		 * data to be written.
 		 */
 		ASSERT3U(zp->zp_level, >, 0);
 		zio->io_children_ready(zio);
 	}
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
 		 * have to allocate new blocks.  But compression changes
 		 * the blocksize, which forces a reallocate, and makes
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(spa);
 
 		ASSERT(zio->io_txg == spa_syncing_txg(spa));
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) ||
 		    MIN(zp->zp_copies, spa_max_replication(spa))
 		    == BP_GET_NDVAS(bp));
 	}
 
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		void *cbuf = NULL;
 		psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize,
 		    zp->zp_complevel);
 		if (psize == 0) {
 			/* psize stays 0; handled by the psize == 0 case below */
 			compress = ZIO_COMPRESS_OFF;
 		} else if (psize >= lsize) {
 			/* Compression did not make the data any smaller. */
 			compress = ZIO_COMPRESS_OFF;
 			if (cbuf != NULL)
 				zio_buf_free(cbuf, lsize);
 		} else if (!zp->zp_dedup && !zp->zp_encrypt &&
 		    psize <= BPE_PAYLOAD_SIZE &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
 			/*
 			 * The compressed payload fits in the bp itself:
 			 * store it there and skip allocation and I/O.
 			 */
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			zio_buf_free(cbuf, lsize);
 			BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
 			return (zio);
 		} else {
 			/*
 			 * Round compressed size up to the minimum allocation
 			 * size of the smallest-ashift device, and zero the
 			 * tail. This ensures that the compressed size of the
 			 * BP (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
 			size_t rounded = (size_t)zio_roundup_alloc_size(spa,
 			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				zio_buf_free(cbuf, lsize);
 				psize = lsize;
 			} else {
 				abd_t *cdata = abd_get_from_buf(cbuf, lsize);
 				abd_take_ownership_of_buf(cdata, B_TRUE);
 				abd_zero_off(cdata, psize, rounded - psize);
 				psize = rounded;
 				zio_push_transform(zio, cdata,
 				    psize, lsize, NULL);
 			}
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		/*
 		 * NOTE(review): io_bp_override was asserted to be NULL
 		 * earlier in this function, and this comment plus the three
 		 * statements below match the tail of zio_write_bp_init().
 		 * Confirm that resetting the bp and pipeline here is
 		 * intentional.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 
 	} else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
 	    zp->zp_type == DMU_OT_DNODE) {
 		/*
 		 * The DMU actually relies on the zio layer's compression
 		 * to free metadnode blocks that have had all contained
 		 * dnodes freed. As a result, even when doing a raw
 		 * receive, we must check whether the block can be compressed
 		 * to a hole.
 		 */
 		psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
 		    zio->io_abd, NULL, lsize, zp->zp_complevel);
 		if (psize == 0 || psize >= lsize)
 			compress = ZIO_COMPRESS_OFF;
 	} else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) {
 		/*
 		 * If we are raw receiving an encrypted dataset we should not
 		 * take this codepath because it will change the on-disk block
 		 * and decryption will fail.
 		 */
 		size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
 		    lsize);
 
 		if (rounded != psize) {
 			/* Pad the raw data with zeroes up to 'rounded'. */
 			abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
 			abd_zero_off(cdata, psize, rounded - psize);
 			abd_copy_off(cdata, zio->io_abd, 0, 0, psize);
 			psize = rounded;
 			zio_push_transform(zio, cdata,
 			    psize, rounded, NULL);
 		}
 	} else {
 		ASSERT3U(psize, !=, 0);
 	}
 
 	/*
 	 * The final pass of spa_sync() must be all rewrites, but the first
 	 * few passes offer a trade-off: allocating blocks defers convergence,
 	 * but newly allocated blocks are sequential, so they can be written
 	 * to disk faster.  Therefore, we allow the first few passes of
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
 	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		VERIFY3U(psize, !=, 0);
 		/* Keep any gang stages already present in the pipeline. */
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
 		BP_ZERO(bp);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
 	if (psize == 0) {
 		/*
 		 * Nothing to write.  If the original bp had a nonzero
 		 * logical birth and the hole_birth feature is active,
 		 * record size, type, level and birth txg in the bp.
 		 */
 		if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
 			BP_SET_LEVEL(bp, zp->zp_level);
 			BP_SET_BIRTH(bp, zio->io_txg, 0);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			ASSERT(!zp->zp_encrypt ||
 			    DMU_OT_IS_ENCRYPTED(zp->zp_type));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
 		if (zp->zp_nopwrite) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
 	return (zio);
 }
 
 /*
  * Pipeline stage for frees: a logical free of a deduplicated block is
  * switched to the DDT free pipeline.  Always returns zio.
  */
 static zio_t *
 zio_free_bp_init(zio_t *zio)
 {
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    BP_GET_DEDUP(zio->io_bp)) {
 		zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 	}
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Execute the I/O pipeline
  * ==========================================================================
  */
 
 /*
  * Hand 'zio' to one of the spa's zio taskqs, where zio_execute() will run
  * it.  'q' selects the taskq class (it may be bumped to the next class for
  * high priority I/O) and 'cutinline' selects TQ_FRONT for the dispatch.
  */
 static void
 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t type = zio->io_type;
 	int tq_flags = 0;
 
 	if (cutinline)
 		tq_flags = TQ_FRONT;
 
 	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) {
 		/*
 		 * If we're a config writer or a probe, the normal issue and
 		 * interrupt threads may all be blocked waiting for the
 		 * config lock.  In this case, select the otherwise-unused
 		 * taskq for ZIO_TYPE_NULL.
 		 */
 		type = ZIO_TYPE_NULL;
 	} else if (type == ZIO_TYPE_WRITE && zio->io_vd &&
 	    zio->io_vd->vdev_aux) {
 		/*
 		 * A similar issue exists for the L2ARC write thread until
 		 * L2ARC 2.0.
 		 */
 		type = ZIO_TYPE_NULL;
 	}
 
 	/*
 	 * If this is a high priority I/O, then use the high priority taskq
 	 * if available.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_NOW ||
 	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
 		if (spa->spa_zio_taskq[type][q + 1].stqs_count != 0)
 			q++;
 	}
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
 	/*
 	 * NB: We are assuming that the zio can only be dispatched
 	 * to a single taskq at a time.  It would be a grievous error
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 	spa_taskq_dispatch_ent(spa, type, q, zio_execute, zio, tq_flags,
 	    &zio->io_tqent, zio);
 }
 
 /*
  * Return B_TRUE if the current thread's taskq is one of the spa's zio
  * taskqs of class 'q', for any zio type; B_FALSE otherwise.
  */
 static boolean_t
 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	spa_t *spa = zio->io_spa;
 	taskq_t *self = taskq_of_curthread();
 
 	for (zio_type_t type = 0; type < ZIO_TYPES; type++) {
 		spa_taskqs_t *set = &spa->spa_zio_taskq[type][q];
 
 		for (uint_t idx = 0; idx < set->stqs_count; idx++) {
 			if (set->stqs_taskq[idx] == self)
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Pipeline stage: dispatch the zio to the issue taskq and return NULL so
  * that the current thread stops executing it.  A write zio is expected to
  * have an allocator selected by now.
  */
 static zio_t *
 zio_issue_async(zio_t *zio)
 {
 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || ZIO_HAS_ALLOCATOR(zio));
 
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (NULL);
 }
 
 /*
  * Dispatch 'zio' to the interrupt taskq class without cutting in line.
  * The argument is a void pointer so that this function can also be used
  * directly as a taskq callback (see zio_delay_interrupt()).
  */
 void
 zio_interrupt(void *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 /*
  * Complete 'zio' via zio_interrupt(), first honoring any delay registered
  * in io_target_timestamp (kernel builds only).
  */
 void
 zio_delay_interrupt(zio_t *zio)
 {
 	/*
 	 * The timeout_generic() function isn't defined in userspace, so
 	 * rather than trying to implement the function, the zio delay
 	 * functionality has been disabled for userspace builds.
 	 */
 
 #ifdef _KERNEL
 	/*
 	 * A zero io_target_timestamp means no delay has been registered
 	 * for this IO, so fall through to the "skip" path at the end of
 	 * the function and issue it directly to the zio layer.
 	 */
 	if (zio->io_target_timestamp != 0) {
 		hrtime_t now = gethrtime();
 
 		if (now >= zio->io_target_timestamp) {
 			/*
 			 * This IO has already taken longer than the target
 			 * delay to complete, so we don't want to delay it
 			 * any longer; we "miss" the delay and issue it
 			 * directly to the zio layer. This is likely due to
 			 * the target latency being set to a value less than
 			 * the underlying hardware can satisfy (e.g. delay
 			 * set to 1ms, but the disks take 10ms to complete an
 			 * IO request).
 			 */
 			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
 			    hrtime_t, now);
 
 			zio_interrupt(zio);
 			return;
 		}
 
 		hrtime_t remaining = zio->io_target_timestamp - now;
 		clock_t deadline = ddi_get_lbolt() + NSEC_TO_TICK(remaining);
 
 		DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
 		    hrtime_t, now, hrtime_t, remaining);
 
 		if (NSEC_TO_TICK(remaining) == 0) {
 			/* Our delay is less than a jiffy - just spin */
 			zfs_sleep_until(zio->io_target_timestamp);
 			zio_interrupt(zio);
 			return;
 		}
 
 		/*
 		 * Use taskq_dispatch_delay() in the place of OpenZFS's
 		 * timeout_generic().  If a task couldn't be allocated,
 		 * just finish the zio without a delay.
 		 */
 		if (taskq_dispatch_delay(system_taskq, zio_interrupt, zio,
 		    TQ_NOSLEEP, deadline) == TASKQID_INVALID) {
 			zio_interrupt(zio);
 		}
 		return;
 	}
 #endif
 	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Recursive worker for zio_deadman().  Logs 'pio' through zfs_dbgmsg() and
  * posts a deadman ereport when zio_deadman_log_all is set or pio targets a
  * leaf vdev, then descends into every child of pio with ziodepth + 1.
  */
 static void
 zio_deadman_impl(zio_t *pio, int ziodepth)
 {
 	zio_t *cio, *cio_next;
 	zio_link_t *zl = NULL;
 	vdev_t *vd = pio->io_vd;
 
 	if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
 		/* vd may be NULL here when zio_deadman_log_all is set. */
 		vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
 		zbookmark_phys_t *zb = &pio->io_bookmark;
 		uint64_t delta = gethrtime() - pio->io_timestamp;
 		uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
 
 		zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
 		    "delta=%llu queued=%llu io=%llu "
 		    "path=%s "
 		    "last=%llu type=%d "
 		    "priority=%d flags=0x%llx stage=0x%x "
 		    "pipeline=0x%x pipeline-trace=0x%x "
 		    "objset=%llu object=%llu "
 		    "level=%llu blkid=%llu "
 		    "offset=%llu size=%llu "
 		    "error=%d",
 		    ziodepth, pio, pio->io_timestamp,
 		    (u_longlong_t)delta, pio->io_delta, pio->io_delay,
 		    vd ? vd->vdev_path : "NULL",
 		    vq ? vq->vq_io_complete_ts : 0, pio->io_type,
 		    pio->io_priority, (u_longlong_t)pio->io_flags,
 		    pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
 		    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
 		    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid,
 		    (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size,
 		    pio->io_error);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
 		    pio->io_spa, vd, zb, pio, 0);
 
 		/*
 		 * In "continue" failmode, push the zio back through
 		 * zio_interrupt(), but only when its taskq entry is empty.
 		 */
 		if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
 		    taskq_empty_ent(&pio->io_tqent)) {
 			zio_interrupt(pio);
 		}
 	}
 
 	/*
 	 * pio->io_lock is held for the whole walk, including the recursive
 	 * calls.  The next child is fetched before recursing into the
 	 * current one.
 	 */
 	mutex_enter(&pio->io_lock);
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		zio_deadman_impl(cio, ziodepth + 1);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Log the critical information describing this zio and all of its children
  * using the zfs_dbgmsg() interface then post deadman event for the ZED.
  * Does nothing when the deadman is disabled or the pool is suspended.
  * Afterwards the pool's deadman failmode decides whether we just log a
  * message (wait, continue) or panic.
  */
 void
 zio_deadman(zio_t *pio, const char *tag)
 {
 	spa_t *spa = pio->io_spa;
 	char *pool = spa_name(spa);
 	uint64_t failmode;
 
 	if (!zfs_deadman_enabled || spa_suspended(spa))
 		return;
 
 	zio_deadman_impl(pio, 0);
 
 	failmode = spa_get_deadman_failmode(spa);
 	if (failmode == ZIO_FAILURE_MODE_WAIT) {
 		zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, pool);
 	} else if (failmode == ZIO_FAILURE_MODE_CONTINUE) {
 		zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, pool);
 	} else if (failmode == ZIO_FAILURE_MODE_PANIC) {
 		fm_panic("%s determined I/O to pool '%s' is hung.", tag, pool);
 	}
 }
 
 /*
  * Execute the I/O pipeline until one of the following occurs:
  * (1) the I/O completes; (2) the pipeline stalls waiting for
  * dependent child I/Os; (3) the I/O issues, so we're waiting
  * for an I/O completion interrupt; (4) the I/O is delegated by
  * vdev-level caching or aggregation; (5) the I/O is deferred
  * due to vdev-level queueing; (6) the I/O is handed off to
  * another thread.  In all cases, the pipeline stops whenever
  * there's no CPU work; it never burns a thread in cv_wait_io().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
 /*
  * Externally visible entry point for running a zio's pipeline.  The real
  * work is done by the static __zio_execute(), which is forced inline to
  * reduce stack overhead: it is called recursively in several zio code
  * paths.  zio_execute() itself cannot be inlined because it is externally
  * visible.  The call is bracketed by spl_fstrans_mark() and
  * spl_fstrans_unmark().
  */
 void
 zio_execute(void *zio)
 {
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	__zio_execute(zio);
 
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Used to determine if in the current context the stack is sized large
  * enough to allow zio_execute() to be called recursively.  A minimum
  * stack size of 16K is required to avoid needing to re-dispatch the zio.
  *
  * Returns B_TRUE when the caller should re-dispatch the zio to a taskq
  * instead of continuing inline; always B_FALSE with HAVE_LARGE_STACKS.
  */
 static boolean_t
 zio_execute_stack_check(zio_t *zio)
 {
 #if defined(HAVE_LARGE_STACKS)
 	(void) zio;
 #else
 	dsl_pool_t *pool = spa_get_dsl(zio->io_spa);
 
 	if (pool != NULL) {
 		/* Executing in txg_sync_thread() context. */
 		if (curthread == pool->dp_tx.tx_sync_thread)
 			return (B_TRUE);
 
 		/* Pool initialization outside of zio_taskq context. */
 		if (spa_is_initializing(pool->dp_spa) &&
 		    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
 		    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
 			return (B_TRUE);
 	}
 #endif /* HAVE_LARGE_STACKS */
 
 	return (B_FALSE);
 }
 
 /*
  * Run 'zio' through successive pipeline stages.  The loop ends when the
  * zio reaches ZIO_STAGE_DONE, when a stage function returns NULL, or when
  * the zio is handed to the issue taskq to continue there.
  */
 __attribute__((always_inline))
 static inline void
 __zio_execute(zio_t *zio)
 {
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
 
 		zio->io_executor = curthread;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
 		ASSERT(ISP2(stage));
 		ASSERT(zio->io_stall == NULL);
 
 		/* Advance to the next stage bit that is set in the pipeline. */
 		do {
 			stage <<= 1;
 		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * If the current context doesn't have large enough stacks
 		 * the zio must be issued asynchronously to prevent overflow.
 		 */
 		if (zio_execute_stack_check(zio)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * io_stage is only updated once we know the stage runs in
 		 * this thread; on the dispatch paths above it is left
 		 * unchanged.
 		 */
 		zio->io_stage = stage;
 		zio->io_pipeline_trace |= zio->io_stage;
 
 		/*
 		 * The zio pipeline stage returns the next zio to execute
 		 * (typically the same as this one), or NULL if we should
 		 * stop.
 		 */
 		zio = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (zio == NULL)
 			return;
 	}
 }
 
 
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
  * ==========================================================================
  */
 /*
  * Execute 'zio' synchronously.  The pipeline is started in the calling
  * thread, which then blocks on io_cv until io_executor is cleared.  The
  * zio is destroyed before returning; the return value is its io_error
  * (0 for a NULL zio).
  */
 int
 zio_wait(zio_t *zio)
 {
 	/*
 	 * Some routines, like zio_free_sync(), may return a NULL zio
 	 * to avoid the performance overhead of creating and then destroying
 	 * an unneeded zio.  For the callers' simplicity, we accept a NULL
 	 * zio and ignore it.
 	 */
 	if (zio == NULL)
 		return (0);
 
 	long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
 	int error;
 
 	ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	zio->io_waiter = curthread;
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		spa_select_allocator(zio);
 	}
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL) {
 		error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
 		    ddi_get_lbolt() + timeout);
 
 		/*
 		 * error == -1 presumably indicates that the wait timed out
 		 * (TODO confirm).  If the zio has then been queued longer
 		 * than the pool's deadman zio time, report it; io_lock is
 		 * dropped around zio_deadman(), and later waits use the
 		 * shorter deadman check interval.
 		 */
 		if (zfs_deadman_enabled && error == -1 &&
 		    gethrtime() - zio->io_queued_timestamp >
 		    spa_deadman_ziotime(zio->io_spa)) {
 			mutex_exit(&zio->io_lock);
 			timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
 			zio_deadman(zio, FTAG);
 			mutex_enter(&zio->io_lock);
 		}
 	}
 	mutex_exit(&zio->io_lock);
 
 	/* Read the result before the zio is destroyed. */
 	error = zio->io_error;
 	zio_destroy(zio);
 
 	return (error);
 }
 
 /*
  * Start executing 'zio' without waiting for it to complete.  A NULL zio is
  * accepted and ignored.
  */
 void
 zio_nowait(zio_t *zio)
 {
 	/*
 	 * See comment in zio_wait().
 	 */
 	if (zio == NULL)
 		return;
 
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    list_is_empty(&zio->io_parent_list)) {
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
 		 * We add it to the spa_async_root_zio "Godfather" I/O which
 		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
 		zio_t *godfather = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
 
 		zio_add_child(godfather, zio);
 	}
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	if (zio->io_type == ZIO_TYPE_WRITE)
 		spa_select_allocator(zio);
 
 	__zio_execute(zio);
 }
 
 /*
  * ==========================================================================
  * Reexecute, cancel, or suspend/resume failed I/O
  * ==========================================================================
  */
 
 /*
  * Reset 'arg' (a logical zio) to its original flags, stage and pipeline,
  * clear its error state, recursively do the same for its existing
  * children, and then execute it again unless it is a godfather zio.
  * Takes a void pointer; 'arg' is used as a zio_t *.
  */
 static void
 zio_reexecute(void *arg)
 {
 	zio_t *pio = arg;
 	zio_t *cio, *cio_next, *gio;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 	ASSERT(pio->io_gang_leader == NULL);
 	ASSERT(pio->io_gang_tree == NULL);
 
 	mutex_enter(&pio->io_lock);
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	/* Recompute the ready/done wait state from the restored stage. */
 	pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) ||
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0;
 	pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE);
 	/*
 	 * For every parent, add one to its outstanding-children count for
 	 * each wait type that pio has not yet satisfied.
 	 */
 	zio_link_t *zl = NULL;
 	while ((gio = zio_walk_parents(pio, &zl)) != NULL) {
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
 			gio->io_children[pio->io_child_type][w] +=
 			    !pio->io_state[w];
 		}
 	}
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
 	if (IO_IS_ALLOCATING(pio))
 		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
 	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	zl = NULL;
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		/* io_lock is dropped while the child is reexecuted. */
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
 		mutex_enter(&pio->io_lock);
 	}
 	mutex_exit(&pio->io_lock);
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on it.
 	 */
 	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
 		pio->io_queued_timestamp = gethrtime();
 		__zio_execute(pio);
 	}
 }
 
 /*
  * Mark the pool as suspended for 'reason' after an uncorrectable I/O
  * failure.  Panics instead if the pool's failmode is "panic".  If 'zio' is
  * non-NULL it is attached to the pool's suspend root zio (created on
  * demand).
  */
 void
 zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
 {
 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 	}
 
 	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
 	    "failure and has been suspended.\n", spa_name(spa));
 
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
 	    NULL, NULL, 0);
 
 	mutex_enter(&spa->spa_suspend_lock);
 
 	/* Create the root that collects suspended zios, if not yet present. */
 	if (spa->spa_suspend_zio_root == NULL) {
 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	spa->spa_suspended = reason;
 
 	if (zio == NULL) {
 		mutex_exit(&spa->spa_suspend_lock);
 		return;
 	}
 
 	ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 	ASSERT(zio != spa->spa_suspend_zio_root);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio_unique_parent(zio) == NULL);
 	ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 	zio_add_child(spa->spa_suspend_zio_root, zio);
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * Clear the pool's suspended state, wake any threads waiting on
  * spa_suspend_cv, and reexecute all previously suspended i/o.  Returns 0
  * if nothing was suspended, otherwise the result of waiting on the
  * suspend root zio.
  */
 int
 zio_resume(spa_t *spa)
 {
 	zio_t *suspended_root;
 
 	/* Detach the suspend root under the lock. */
 	mutex_enter(&spa->spa_suspend_lock);
 	spa->spa_suspended = ZIO_SUSPEND_NONE;
 	cv_broadcast(&spa->spa_suspend_cv);
 	suspended_root = spa->spa_suspend_zio_root;
 	spa->spa_suspend_zio_root = NULL;
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (suspended_root != NULL) {
 		zio_reexecute(suspended_root);
 		return (zio_wait(suspended_root));
 	}
 
 	return (0);
 }
 
 /*
  * Block the calling thread on spa_suspend_cv for as long as the pool is
  * suspended.
  */
 void
 zio_resume_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_suspend_lock);
 
 	while (spa_suspended(spa)) {
 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
 	}
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * ==========================================================================
  * Gang blocks.
  *
  * A gang block is a collection of small blocks that looks to the DMU
  * like one large block.  When zio_dva_allocate() cannot find a block
  * of the requested size, due to either severe fragmentation or the pool
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
  * an indirect block: it's an array of block pointers.  It consumes
  * only one sector and hence is allocatable regardless of fragmentation.
  * The gang header's bps point to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
  * Critically, the gang block bp's blk_cksum is the checksum of the data,
  * not the gang header.  This ensures that data block signatures (needed for
  * deduplication) are independent of how the block is physically stored.
  *
  * Gang blocks can be nested: a gang member may itself be a gang block.
  * Thus every gang block is a tree in which root and all interior nodes are
  * gang headers, and the leaves are normal blocks that contain user data.
  * The root of the gang tree is called the gang leader.
  *
  * To perform any operation (read, rewrite, free, claim) on a gang block,
  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
  * in the io_gang_tree field of the original logical i/o by recursively
  * reading the gang leader and all gang headers below it.  This yields
  * an in-core tree containing the contents of every gang header and the
  * bps for every constituent of the gang block.
  *
  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
  * of the gang header plus zio_checksum_compute() of the data to update the
  * gang header's blk_cksum as described above.
  *
  * The two-phase assemble/issue model solves the problem of partial failure --
  * what if you'd freed part of a gang block but then couldn't read the
  * gang header for another part?  Assembling the entire gang tree first
  * ensures that all the necessary gang header I/O has succeeded before
  * starting the actual work of free, claim, or write.  Once the gang tree
  * is assembled, free and claim are in-memory operations that cannot fail.
  *
  * In the event that a gang write fails, zio_dva_unallocate() walks the
  * gang tree to immediately free (i.e. insert back into the space map)
  * everything we've allocated.  This ensures that we don't get ENOSPC
  * errors during repeated suspend/resume cycles due to a flaky device.
  *
  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
  * the gang tree, we won't modify the block, so we can safely defer the free
  * (knowing that the block is still intact).  If we *can* assemble the gang
  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
  * each constituent bp and we can allocate a new block on the next sync pass.
  *
  * In all cases, the gang tree allows complete recovery from partial failure.
  * ==========================================================================
  */
 
 /*
  * Done callback shared by the gang read and rewrite children created
  * below: releases the ABD that was attached to the child zio.
  */
 static void
 zio_gang_issue_func_done(zio_t *zio)
 {
 	abd_t *child_abd = zio->io_abd;
 
 	abd_free(child_abd);
 }
 
 /*
  * Gang issue callback for reads.  A gang header (gn != NULL) needs no I/O
  * because its contents are already held in the gang tree, so the parent is
  * handed back untouched.  For a gang member, issue a child read of
  * BP_GET_PSIZE(bp) bytes into the window of 'data' that starts at 'offset'.
  */
 static zio_t *
 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	if (gn == NULL) {
 		abd_t *member_abd = abd_get_offset(data, offset);
 
 		return (zio_read(pio, pio->io_spa, bp, member_abd,
 		    BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL,
 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark));
 	}
 
 	return (pio);
 }
 
 /*
  * Gang issue callback for rewrites.  A gang member (gn == NULL) is
  * rewritten from its slice of 'data'; a gang header is rewritten from the
  * in-core copy held by its gang node.  Returns the child zio that was
  * created; the caller is responsible for issuing it.
  */
 static zio_t *
 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	/* Gang member: rewrite the matching window of the caller's data. */
 	if (gn == NULL) {
 		abd_t *member_abd = abd_get_offset(data, offset);
 
 		return (zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    member_abd, BP_GET_PSIZE(bp), zio_gang_issue_func_done,
 		    NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark));
 	}
 
 	/* Gang header: wrap the node's header buffer and rewrite it. */
 	abd_t *hdr_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 	zio_t *hio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, hdr_abd,
 	    SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
 	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 	zio_t *leader = pio->io_gang_leader;
 
 	/*
 	 * The pipeline recomputes the gang block header checksum for every
 	 * header we rewrite, but nothing recomputes the data checksum, so
 	 * it is done here.  The gang leader is exempt: its data checksum
 	 * was already produced by the pipeline, since that stage runs
 	 * before gang assembly.  (Nothing currently consumes interior data
 	 * checksums; this is purely for good hygiene.)
 	 */
 	if (gn != leader->io_gang_tree) {
 		abd_t *slice = abd_get_offset(data, offset);
 
 		zio_checksum_compute(hio, BP_GET_CHECKSUM(bp), slice,
 		    BP_GET_PSIZE(bp));
 		abd_free(slice);
 	}
 
 	/*
 	 * When the leader asks for induced damage (testing), skip the vdev
 	 * I/O stages for the header so that the damage stays detectable.
 	 */
 	if (leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
 		hio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 
 	return (hio);
 }
 
 /*
  * Gang issue callback for frees: a thin wrapper around zio_free_sync().
  * The gang node, data and offset arguments are unused.
  */
 static zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn, (void) data, (void) offset;
 
 	zio_t *fio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    ZIO_GANG_CHILD_FLAGS(pio));
 	if (fio != NULL)
 		return (fio);
 
 	/*
 	 * zio_free_sync() handed back no zio; substitute a null child of
 	 * pio so that the caller always receives a zio.
 	 */
 	return (zio_null(pio, pio->io_spa, NULL, NULL, NULL,
 	    ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 /*
  * Gang issue callback for claims: a thin wrapper around zio_claim().
  * The gang node, data and offset arguments are unused.
  */
 static zio_t *
 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn, (void) data, (void) offset;
 
 	zio_t *cio = zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL,
 	    ZIO_GANG_CHILD_FLAGS(pio));
 
 	return (cio);
 }
 
 /*
  * Dispatch table of gang callbacks.  zio_gang_tree_issue() indexes it with
  * the gang leader's io_type; NULL slots are types for which no gang
  * callback is provided.
  * NOTE(review): the slot order presumably follows the zio_type_t enum
  * (null, read, write, free, claim, ...) -- confirm against its definition
  * before adding or reordering entries.
  */
 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 	NULL,
 	zio_read_gang,
 	zio_rewrite_gang,
 	zio_free_gang,
 	zio_claim_gang,
 	NULL
 };
 
 /*
  * Forward declaration: zio_gang_tree_assemble() installs this function as
  * a done callback before its definition appears.
  */
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 /*
  * Allocate a zeroed gang node together with its gang header buffer and
  * publish it through *gnpp, which must be NULL on entry.  Returns the
  * new node.
  */
 static zio_gang_node_t *
 zio_gang_node_alloc(zio_gang_node_t **gnpp)
 {
 	ASSERT(*gnpp == NULL);
 
 	zio_gang_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
 
 	node->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 	*gnpp = node;
 
 	return (node);
 }
 
 /*
  * Release a single gang node and its header buffer, then clear the
  * caller's pointer.  Every child slot must already be empty.
  */
 static void
 zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = *gnpp;
 
 	for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 		ASSERT(node->gn_child[slot] == NULL);
 	}
 
 	zio_buf_free(node->gn_gbh, SPA_GANGBLOCKSIZE);
 	kmem_free(node, sizeof (*node));
 	*gnpp = NULL;
 }
 
 /*
  * Recursively free the gang tree rooted at *gnpp (children first, then
  * the node itself).  An empty tree is a no-op.
  */
 static void
 zio_gang_tree_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = *gnpp;
 
 	if (node != NULL) {
 		for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++)
 			zio_gang_tree_free(&node->gn_child[slot]);
 
 		zio_gang_node_free(gnpp);
 	}
 }
 
 /*
  * Allocate a gang node at *gnpp and issue an asynchronous read of the gang
  * header described by 'bp' into it, as a child of the gang leader 'gio'.
  * zio_gang_tree_assemble_done() runs when the read completes and receives
  * the node as its private data.
  */
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *node = zio_gang_node_alloc(gnpp);
 	abd_t *hdr_abd = abd_get_from_buf(node->gn_gbh, SPA_GANGBLOCKSIZE);
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
 	zio_t *rio = zio_read(gio, gio->io_spa, bp, hdr_abd,
 	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, node,
 	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark);
 
 	zio_nowait(rio);
 }
 
 /*
  * Completion callback for a gang header read issued by
  * zio_gang_tree_assemble().  On success the header is byteswapped if the
  * bp calls for it, the temporary ABD wrapper is released, and a further
  * header read is started for every block pointer in the header that is
  * itself a gang block.  On error nothing is done here.
  */
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
 	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
 	ASSERT(list_is_empty(&zio->io_child_list));
 
 	if (zio->io_error != 0)
 		return;
 
 	/* The ABD wraps the linear gn_gbh buffer, so abd_to_buf() is valid. */
 	if (BP_SHOULD_BYTESWAP(bp))
 		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
 	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	abd_free(zio->io_abd);
 
 	/* Descend into every header entry that is a nested gang block. */
 	for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 		blkptr_t *child_bp = &gn->gn_gbh->zg_blkptr[slot];
 
 		if (BP_IS_GANG(child_bp)) {
 			zio_gang_tree_assemble(gio, child_bp,
 			    &gn->gn_child[slot]);
 		}
 	}
 }
 
 /*
  * Walk the assembled gang tree and invoke the gang callback selected by
  * the leader's io_type on 'bp', then recurse into every non-hole block
  * pointer of a gang header.  'offset' tracks the position within 'data'
  * and advances by the physical size of each constituent visited.
  */
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
     uint64_t offset)
 {
 	zio_t *gio = pio->io_gang_leader;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * A gang header carries its payload in gn->gn_gbh; a gang member
 	 * (gn == NULL) takes its payload from 'data'.
 	 */
 	zio_t *cio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data,
 	    offset);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 			blkptr_t *child_bp = &gn->gn_gbh->zg_blkptr[slot];
 
 			if (!BP_IS_HOLE(child_bp)) {
 				zio_gang_tree_issue(cio, gn->gn_child[slot],
 				    child_bp, data, offset);
 				offset += BP_GET_PSIZE(child_bp);
 			}
 		}
 	}
 
 	/* At the root, the walk must have consumed the whole payload. */
 	if (gn == gio->io_gang_tree) {
 		ASSERT3U(gio->io_size, ==, offset);
 	}
 
 	/* The callback may hand back pio itself; only issue new children. */
 	if (cio != pio)
 		zio_nowait(cio);
 }
 
 /*
  * Pipeline stage: make this zio the gang leader and start reading the gang
  * tree rooted at its block pointer into io_gang_tree.
  */
 static zio_t *
 zio_gang_assemble(zio_t *zio)
 {
 	ASSERT(BP_IS_GANG(zio->io_bp) && zio->io_gang_leader == NULL);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	zio->io_gang_leader = zio;
 	zio_gang_tree_assemble(zio, zio->io_bp, &zio->io_gang_tree);
 
 	return (zio);
 }
 
 /*
  * Pipeline stage: once all gang children are done, either issue the
  * operation across the assembled gang tree or, if assembly failed, free
  * the tree.  Either way the remaining pipeline becomes the interlock
  * pipeline.  Returns NULL while gang children are still outstanding.
  */
 static zio_t *
 zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE))
 		return (NULL);
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] != 0) {
 		zio_gang_tree_free(&zio->io_gang_tree);
 	} else {
 		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
 		    0);
 	}
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Copy the allocator index and write-issue taskq selection from the
  * parent zio 'pio' to the child zio 'cio'.
  */
 static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_wr_iss_tq = pio->io_wr_iss_tq;
 	cio->io_allocator = pio->io_allocator;
 }
 
 /*
  * Ready callback for a gang member write.  Adds the allocated size of each
  * of the member's DVAs to the corresponding DVA of its parent's block
  * pointer, under the parent's io_lock.  Nothing is done when the member's
  * bp is a hole.
  */
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	dva_t *child_dva = zio->io_bp->blk_dva;
 	dva_t *parent_dva = pio->io_bp->blk_dva;
 	zio_t *gio __maybe_unused = zio->io_gang_leader;
 
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
 	VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
 		ASSERT(DVA_GET_GANG(&parent_dva[d]));
 
 		uint64_t total = DVA_GET_ASIZE(&parent_dva[d]);
 
 		total += DVA_GET_ASIZE(&child_dva[d]);
 		DVA_SET_ASIZE(&parent_dva[d], total);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Done callback for gang header and gang member writes: frees the zio's
  * ABD when it has one.
  */
 static void
 zio_write_gang_done(zio_t *zio)
 {
 	/*
 	 * A zio that carries no data has a NULL io_abd.  Such a zio starts
 	 * out with ZIO_FLAG_NODATA set, but that flag cannot be tested
 	 * here because zio_ready clears it.
 	 */
 	if (zio->io_abd == NULL)
 		return;
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Write pio's payload as a gang block.
  *
  * A gang header of SPA_GANGBLOCKSIZE bytes is allocated from metaslab
  * class 'mc' into pio's bp, a gang node is created to hold the in-core
  * header, and the payload is split into child writes whose block pointers
  * are the entries of that header.  The header itself is written by a
  * rewrite zio that is the parent of all the member writes.
  *
  * Returns pio in every case.  If the header allocation fails, the error is
  * stored in pio->io_error and no children are created; otherwise pio's
  * pipeline is replaced with ZIO_INTERLOCK_PIPELINE.
  */
 static zio_t *
 zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 {
 	spa_t *spa = pio->io_spa;
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
 	uint64_t lsize;
 	int copies = gio->io_prop.zp_copies;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
 
 	/*
 	 * If one copy was requested, store 2 copies of the GBH, so that we
 	 * can still traverse all the data (e.g. to free or scrub) even if a
 	 * block is damaged.  Note that we can't store 3 copies of the GBH in
 	 * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
 	 */
 	int gbh_copies = copies;
 	if (gbh_copies == 1) {
 		gbh_copies = MIN(2, spa_max_replication(spa));
 	}
 
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(has_data);
 
 		flags |= METASLAB_ASYNC_ALLOC;
 		VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
 		    mca_alloc_slots, pio));
 
 		/*
 		 * The logical zio has already placed a reservation for
 		 * 'copies' allocation slots but gang blocks may require
 		 * additional copies. These additional copies
 		 * (i.e. gbh_copies - copies) are guaranteed to succeed
 		 * since metaslab_class_throttle_reserve() always allows
 		 * additional reservations for gang blocks.
 		 */
 		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
 		    pio->io_allocator, pio, flags));
 	}
 
 	/*
 	 * Allocate the gang header.  A nested gang block (pio != gio) passes
 	 * the leader's bp as the hint argument; the leader itself passes NULL.
 	 */
 	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
 	    &pio->io_alloc_list, pio, pio->io_allocator);
 	if (error) {
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
 
 			/*
 			 * If we failed to allocate the gang block header then
 			 * we remove any additional allocation reservations that
 			 * we placed here. The original reservation will
 			 * be removed when the logical I/O goes to the ready
 			 * stage.
 			 */
 			metaslab_class_throttle_unreserve(mc,
 			    gbh_copies - copies, pio->io_allocator, pio);
 		}
 
 		pio->io_error = error;
 		return (pio);
 	}
 
 	/*
 	 * Pick the slot that will own the new gang node: the leader uses its
 	 * own io_gang_tree, while a nested gang member uses the slot it was
 	 * given as io_private (the &gn->gn_child[g] passed to zio_write()
 	 * in the loop below when its parent created it).
 	 */
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	memset(gbh, 0, SPA_GANGBLOCKSIZE);
 	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
 	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	zio_gang_inherit_allocator(pio, zio);
 
 	/*
 	 * Create and nowait the gang children.
 	 *
 	 * Each pass divides the bytes still to be written evenly among the
 	 * header slots not yet used, rounded up to SPA_MINBLOCKSIZE, and
 	 * writes that many bytes starting at offset (io_size - resid).
 	 */
 	for (int g = 0; resid != 0; resid -= lsize, g++) {
 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
 		    SPA_MINBLOCKSIZE);
 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
 
 		/*
 		 * Members inherit checksum, copies, encryption and byte order
 		 * from the gang leader; compression, dedup and nopwrite are
 		 * turned off for them.
 		 */
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
 		zp.zp_dedup = B_FALSE;
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 		zp.zp_encrypt = gio->io_prop.zp_encrypt;
 		zp.zp_byteorder = gio->io_prop.zp_byteorder;
 		memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
 		memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
 
 		/* A NODATA parent has no ABD, so its members get none either */
 		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
 		    resid) : NULL, lsize, lsize, &zp,
 		    zio_write_gang_member_ready, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 		zio_gang_inherit_allocator(zio, cio);
 
 		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(has_data);
 
 			/*
 			 * Gang children won't throttle but we should
 			 * account for their work, so reserve an allocation
 			 * slot for them here.
 			 */
 			VERIFY(metaslab_class_throttle_reserve(mc,
 			    zp.zp_copies, cio->io_allocator, cio, flags));
 		}
 		zio_nowait(cio);
 	}
 
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	zio_nowait(zio);
 
 	return (pio);
 }
 
 /*
  * The zio_nop_write stage in the pipeline determines if allocating a
  * new bp is necessary.  The nopwrite feature can handle writes in
  * either syncing or open context (i.e. zil writes) and as a result is
  * mutually exclusive with dedup.
  *
  * By leveraging a cryptographically secure checksum, such as SHA256, we
  * can compare the checksums of the new data and the old to determine if
  * allocating a new block is required.  Note that our requirements for
  * cryptographic strength are fairly weak: there can't be any accidental
  * hash collisions, but we don't need to be secure against intentional
  * (malicious) collisions.  To trigger a nopwrite, you have to be able
  * to write the file to begin with, and triggering an incorrect (hash
  * collision) nopwrite is no worse than simply writing to the file.
  * That said, there are no known attacks against the checksum algorithms
  * used for nopwrite, assuming that the salt and the checksums
  * themselves remain secret.
  */
 static zio_t *
 zio_nop_write(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(zp->zp_nopwrite);
 	ASSERT(!zp->zp_dedup);
 	ASSERT(zio->io_bp_override == NULL);
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * The old and new bps must share the same characteristics (checksum
 	 * and compression algorithms, dedup setting, copy count, no
 	 * encryption, nopwrite-capable checksum).  If any differ, carry on
 	 * with the pipeline, which will allocate a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
 	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
 	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
 		return (zio);
 
 	/* Different contents: this is a real write. */
 	if (!ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum))
 		return (zio);
 
 	ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE);
 	ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 	ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 	ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
 	ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop);
 
 	/*
 	 * A block that currently lives on an indirect vdev is not kept:
 	 * the nopwrite request is ignored so that a new block gets
 	 * allocated on a concrete vdev.
 	 */
 	boolean_t on_indirect = B_FALSE;
 
 	spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
 	for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) {
 		vdev_t *tvd = vdev_lookup_top(zio->io_spa,
 		    DVA_GET_VDEV(&bp_orig->blk_dva[d]));
 
 		if (tvd->vdev_ops == &vdev_indirect_ops) {
 			on_indirect = B_TRUE;
 			break;
 		}
 	}
 	spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
 
 	if (on_indirect)
 		return (zio);
 
 	/*
 	 * Checksums match: reuse the original bp and shrink the pipeline so
 	 * that no new block is allocated and no I/O is issued.
 	 */
 	*bp = *bp_orig;
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	zio->io_flags |= ZIO_FLAG_NOPWRITE;
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Block Reference Table
  * ==========================================================================
  */
 /*
  * Pipeline stage for frees of blocks that may be tracked by the Block
  * Reference Table.  Indirect blocks, metadata, and blocks the BRT reports
  * as not possibly present pass through unchanged.  Otherwise a reference
  * is dropped, and unless that was the last one the rest of the free is
  * skipped.
  */
 static zio_t *
 zio_brt_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (BP_GET_LEVEL(bp) > 0)
 		return (zio);
 	if (BP_IS_METADATA(bp))
 		return (zio);
 	if (!brt_maybe_exists(zio->io_spa, bp))
 		return (zio);
 
 	if (brt_entry_decref(zio->io_spa, bp))
 		return (zio);
 
 	/* Other references remain, so the data cannot be freed yet. */
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Dedup
  * ==========================================================================
  */
 /*
  * Done callback for the repair reads issued by zio_ddt_read_start().
  * Under the parent's io_lock: a successful read clears the matching
  * ddt_phys_t (it needs no repair) and, if no repair buffer has been kept
  * yet, donates its ABD as dde_repair_abd.  In every other case the ABD is
  * freed.
  */
 static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_entry_t *dde = zio->io_private;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
 
 	ddt_phys_t *ddp = ddt_phys_select(dde, bp);
 
 	if (zio->io_error == 0) {
 		/* this copy is good and doesn't need repair */
 		ddt_phys_clear(ddp);
 
 		if (dde->dde_repair_abd == NULL)
 			dde->dde_repair_abd = zio->io_abd;
 		else
 			abd_free(zio->io_abd);
 	} else {
 		abd_free(zio->io_abd);
 	}
 
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Pipeline stage that starts a read of a dedup'd block.  Normally a single
  * DDT child read of 'bp' is issued.  When a DDT child error has already
  * been recorded, a repair is started instead: the DDT entry is attached to
  * io_vsd and a read is issued for every other physical copy recorded in
  * the entry.
  */
 static zio_t *
 zio_ddt_read_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	/* Common case: no earlier failure, just read the block itself. */
 	if (!zio->io_child_error[ZIO_CHILD_DDT]) {
 		zio_nowait(zio_read(zio, zio->io_spa, bp,
 		    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 
 		return (zio);
 	}
 
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde = ddt_repair_start(ddt, bp);
 	ddt_phys_t *own = ddt_phys_select(dde, bp);
 	blkptr_t blk;
 
 	ASSERT(zio->io_vsd == NULL);
 	zio->io_vsd = dde;
 
 	if (own == NULL)
 		return (zio);
 
 	/* Try every populated copy other than the one that just failed. */
 	for (int p = 0; p < DDT_PHYS_TYPES; p++) {
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 
 		if (ddp->ddp_phys_birth == 0 || ddp == own)
 			continue;
 
 		ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk);
 		zio_nowait(zio_read(zio, zio->io_spa, &blk,
 		    abd_alloc_for_io(zio->io_size, B_TRUE),
 		    zio->io_size, zio_ddt_child_read_done, dde,
 		    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
 		    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
 	}
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that completes a dedup read once all DDT children are
  * done.  Returns NULL while children are outstanding or after the zio has
  * been re-dispatched; otherwise returns the zio.
  *
  * When a DDT child reported an error:
  *  - with no DDT available the zio is returned as-is (only expected while
  *    the pool is loading, per the assertion);
  *  - with no repair entry attached yet (io_vsd == NULL) the stage is moved
  *    back to just before ZIO_STAGE_DDT_READ_START and the zio is handed to
  *    the issue taskq, presumably so that zio_ddt_read_start() runs again
  *    and starts the repair -- confirm against the pipeline executor;
  *  - with a repair entry, any repair data found is copied into io_abd and
  *    the child error is cleared, then the repair is finished.
  */
 static zio_t *
 zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	if (zio->io_child_error[ZIO_CHILD_DDT]) {
 		ddt_t *ddt = ddt_select(zio->io_spa, bp);
 		ddt_entry_t *dde = zio->io_vsd;
 		if (ddt == NULL) {
 			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 			return (zio);
 		}
 		if (dde == NULL) {
 			/* rewind to the stage preceding DDT_READ_START */
 			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 			return (NULL);
 		}
 		if (dde->dde_repair_abd != NULL) {
 			/* a good copy was read; use it and forget the error */
 			abd_copy(zio->io_abd, dde->dde_repair_abd,
 			    zio->io_size);
 			zio->io_child_error[ZIO_CHILD_DDT] = 0;
 		}
 		ddt_repair_done(ddt, dde);
 		zio->io_vsd = NULL;
 	}
 
 	ASSERT(zio->io_vsd == NULL);
 
 	return (zio);
 }
 
 /*
  * Dedup verification: decide whether the data carried by 'zio' differs
  * from the data already associated with DDT entry 'dde'.
  *
  * Returns B_TRUE when a difference is detected or when the existing copy
  * could not be read for comparison, and B_FALSE when the data matches or
  * the entry has nothing to compare against.
  *
  * The caller holds the ddt lock (see zio_ddt_write()).  The lock is
  * dropped around the blocking reads below and re-acquired before
  * returning, so it is held again on every return path.
  */
 static boolean_t
 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 {
 	spa_t *spa = zio->io_spa;
 	boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
 
 	ASSERT(!(zio->io_bp_override && do_raw));
 
 	/*
 	 * Note: we compare the original data, not the transformed data,
 	 * because when zio->io_bp is an override bp, we will not have
 	 * pushed the I/O transforms.  That's an important optimization
 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
 	 * However, we should never get a raw, override zio so in these
 	 * cases we can compare the io_abd directly. This is useful because
 	 * it allows us to do dedup verification even if we don't have access
 	 * to the original data (for instance, if the encryption keys aren't
 	 * loaded).
 	 */
 
 	/*
 	 * First pass: if a write for this entry is still in flight, compare
 	 * against that lead zio's in-memory data.  Only the first lead zio
 	 * found is consulted.
 	 */
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		zio_t *lio = dde->dde_lead_zio[p];
 
 		if (lio != NULL && do_raw) {
 			return (lio->io_size != zio->io_size ||
 			    abd_cmp(zio->io_abd, lio->io_abd) != 0);
 		} else if (lio != NULL) {
 			return (lio->io_orig_size != zio->io_orig_size ||
 			    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
 		}
 	}
 
 	/*
 	 * Second pass: compare against the first copy that has already been
 	 * written (non-zero ddp_phys_birth) by reading it back.
 	 */
 	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
 		ddt_phys_t *ddp = &dde->dde_phys[p];
 
 		if (ddp->ddp_phys_birth != 0 && do_raw) {
 			blkptr_t blk = *zio->io_bp;
 			uint64_t psize;
 			abd_t *tmpabd;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 			psize = BP_GET_PSIZE(&blk);
 
 			if (psize != zio->io_size)
 				return (B_TRUE);
 
 			/* don't hold the ddt lock across a blocking read */
 			ddt_exit(ddt);
 
 			tmpabd = abd_alloc_for_io(psize, B_TRUE);
 
 			error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
 			    psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_RAW, &zio->io_bookmark));
 
 			if (error == 0) {
 				if (abd_cmp(tmpabd, zio->io_abd) != 0)
 					error = SET_ERROR(ENOENT);
 			}
 
 			abd_free(tmpabd);
 			ddt_enter(ddt);
 			return (error != 0);
 		} else if (ddp->ddp_phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
 			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
 
 			if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
 				return (B_TRUE);
 
 			/* don't hold the ddt lock across a blocking read */
 			ddt_exit(ddt);
 
 			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
 
 			if (error == 0) {
 				if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(ENOENT);
 				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
 			return (error != 0);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Ready callback for the lead dedup write.  On success, and under the ddt
  * lock, the physical entry selected by the copy count is filled in from
  * the written bp, and that entry is then propagated into the bp of every
  * parent zio waiting on this write.
  */
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
 	int copies = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[copies];
 
 	if (zio->io_error != 0)
 		return;
 
 	ddt_enter(ddt);
 
 	ASSERT(dde->dde_lead_zio[copies] == zio);
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
 	zio_link_t *zl = NULL;
 	for (zio_t *pio = zio_walk_parents(zio, &zl); pio != NULL;
 	    pio = zio_walk_parents(zio, &zl)) {
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 	}
 
 	ddt_exit(ddt);
 }
 
 /*
  * Done callback for the lead dedup write.  Under the ddt lock, the entry
  * stops advertising this zio as its lead write; on failure the physical
  * entry is cleared, on success it gains one reference per parent zio.
  */
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	int copies = zio->io_prop.zp_copies;
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 	ddt_phys_t *ddp = &dde->dde_phys[copies];
 
 	ddt_enter(ddt);
 
 	ASSERT(ddp->ddp_refcnt == 0);
 	ASSERT(dde->dde_lead_zio[copies] == zio);
 	dde->dde_lead_zio[copies] = NULL;
 
 	if (zio->io_error != 0) {
 		ddt_phys_clear(ddp);
 	} else {
 		zio_link_t *zl = NULL;
 
 		for (zio_t *pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = zio_walk_parents(zio, &zl)) {
 			ddt_phys_addref(ddp);
 		}
 	}
 
 	ddt_exit(ddt);
 }
 
 /*
  * Pipeline stage for dedup writes.  Under the ddt lock it looks up the DDT
  * entry for 'bp' and then takes one of these paths:
  *
  *  - Verification was requested and zio_ddt_collision() reports a
  *    mismatch: either retry from ZIO_STAGE_OPEN with the pool's dedup
  *    checksum (if the current checksum lacks ZCHECKSUM_FLAG_DEDUP) or turn
  *    dedup off for this write; in both cases the zio continues as an
  *    ordinary write via ZIO_WRITE_PIPELINE.
  *  - A physical copy already exists and/or a lead write is in flight: fill
  *    bp from the existing copy, and either become a parent of the lead
  *    write or take a reference on the existing copy.
  *  - The zio carries an override bp: record it in the entry and take a
  *    reference.
  *  - Otherwise: issue a child write and register it as the entry's lead
  *    write.
  *
  * Always returns the zio.
  */
 static zio_t *
 zio_ddt_write(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
 	int p = zp->zp_copies;
 	zio_t *cio = NULL;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
 	ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
 
 	ddt_enter(ddt);
 	dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = &dde->dde_phys[p];
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
 		 * and try again.  If we're already using a strong checksum,
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
 		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
 			BP_SET_DEDUP(bp, B_FALSE);
 		}
 		ASSERT(!BP_GET_DEDUP(bp));
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
 		if (ddp->ddp_phys_birth != 0)
 			ddt_bp_fill(ddp, bp, txg);
 		/*
 		 * With a lead write in flight, wait on it as a parent;
 		 * otherwise take a reference on the existing copy directly.
 		 */
 		if (dde->dde_lead_zio[p] != NULL)
 			zio_add_child(zio, dde->dde_lead_zio[p]);
 		else
 			ddt_phys_addref(ddp);
 	} else if (zio->io_bp_override) {
 		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
 		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 		ddt_phys_fill(ddp, bp);
 		ddt_phys_addref(ddp);
 	} else {
 		/* first writer of this data: issue the lead write */
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 		    zio->io_orig_size, zio->io_orig_size, zp,
 		    zio_ddt_child_write_ready, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
 		dde->dde_lead_zio[p] = cio;
 	}
 
 	ddt_exit(ddt);
 
 	/*
 	 * cio is still NULL unless the lead-write branch above ran.
 	 * NOTE(review): this relies on zio_nowait() accepting NULL -- confirm.
 	 */
 	zio_nowait(cio);
 
 	return (zio);
 }
 
 /*
  * Debugging aid: remembers the entry most recently looked up by
  * zio_ddt_free().  It is only ever written in the code visible here.
  */
 static ddt_entry_t *freedde; /* for debugging */
 
 /*
  * Pipeline stage for frees of dedup'd blocks: under the ddt lock, look up
  * the entry for 'bp' and, if both the entry and a matching physical copy
  * are found, drop one reference on that copy.  Always returns the zio.
  */
 static zio_t *
 zio_ddt_free(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
 
 	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
 
 	freedde = dde;
 	if (dde != NULL) {
 		ddt_phys_t *ddp = ddt_phys_select(dde, bp);
 
 		if (ddp != NULL)
 			ddt_phys_decref(ddp);
 	}
 
 	ddt_exit(ddt);
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
 
 /*
  * Pick the next queued zio for 'allocator' that may proceed to allocation.
  * The caller must hold that allocator's spaa_lock.  Returns the first zio
  * in the allocator's tree after reserving throttle slots for it and
  * removing it from the tree, or NULL if the tree is empty or the
  * reservation was refused.
  */
 static zio_t *
 zio_io_to_allocate(spa_t *spa, int allocator)
 {
 	ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
 
 	zio_t *next = avl_first(&spa->spa_allocs[allocator].spaa_tree);
 
 	if (next == NULL)
 		return (NULL);
 
 	ASSERT(IO_IS_ALLOCATING(next));
 	ASSERT(ZIO_HAS_ALLOCATOR(next));
 	ASSERT3U(next->io_allocator, ==, allocator);
 
 	/* Without a reservation the zio stays queued, i.e. is throttled. */
 	if (metaslab_class_throttle_reserve(next->io_metaslab_class,
 	    next->io_prop.zp_copies, allocator, next, 0)) {
 		avl_remove(&spa->spa_allocs[allocator].spaa_tree, next);
 		ASSERT3U(next->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 		return (next);
 	}
 
 	return (NULL);
 }
 
 /*
  * Pipeline stage that applies the allocation throttle.  Sync writes, gang
  * children, NODATA zios, and zios whose preferred class has throttling
  * disabled pass straight through.  Any other zio is queued on its
  * allocator's tree, and whichever zio is next eligible to allocate (maybe
  * a different one, maybe none) is returned.
  */
 static zio_t *
 zio_dva_throttle(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	/* locate an appropriate allocation class */
 	metaslab_class_t *mc = spa_preferred_class(spa, zio->io_size,
 	    zio->io_prop.zp_type, zio->io_prop.zp_level,
 	    zio->io_prop.zp_zpl_smallblk);
 
 	boolean_t unthrottled =
 	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
 	    !mc->mc_alloc_throttle_enabled ||
 	    zio->io_child_type == ZIO_CHILD_GANG ||
 	    (zio->io_flags & ZIO_FLAG_NODATA) != 0;
 
 	if (unthrottled)
 		return (zio);
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
 	int allocator = zio->io_allocator;
 
 	zio->io_metaslab_class = mc;
 
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
 	zio_t *ready = zio_io_to_allocate(spa, allocator);
 	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 
 	return (ready);
 }
 
 /*
  * If a queued zio for 'allocator' can now reserve its throttle slots,
  * dequeue it and dispatch it to the issue taskq.
  */
 static void
 zio_allocate_dispatch(spa_t *spa, int allocator)
 {
 	mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
 	zio_t *ready = zio_io_to_allocate(spa, allocator);
 	mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
 
 	if (ready != NULL) {
 		ASSERT3U(ready->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
 		ASSERT0(ready->io_error);
 		zio_taskq_dispatch(ready, ZIO_TASKQ_ISSUE, B_TRUE);
 	}
 }
 
 /*
  * Pipeline stage that allocates the DVAs for a write.
  *
  * The allocation is attempted in the zio's metaslab class (chosen here if
  * not already set).  On ENOSPC from a class other than the normal class,
  * it is retried in the normal class, moving any throttle reservation
  * across.  If that still yields ENOSPC and the zio is larger than
  * SPA_MINBLOCKSIZE, the write is handed to zio_write_gang_block().  Any
  * remaining error is stored in zio->io_error.
  *
  * Returns the zio (or whatever zio_write_gang_block() returns).
  */
 static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
 
 	/* a zio without a gang leader becomes its own */
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	/* translate zio properties into metaslab allocation flags */
 	if (zio->io_flags & ZIO_FLAG_NODATA)
 		flags |= METASLAB_DONT_THROTTLE;
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
 		flags |= METASLAB_GANG_CHILD;
 	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
 		flags |= METASLAB_ASYNC_ALLOC;
 
 	/*
 	 * if not already chosen, locate an appropriate allocation class
 	 */
 	mc = zio->io_metaslab_class;
 	if (mc == NULL) {
 		mc = spa_preferred_class(spa, zio->io_size,
 		    zio->io_prop.zp_type, zio->io_prop.zp_level,
 		    zio->io_prop.zp_zpl_smallblk);
 		zio->io_metaslab_class = mc;
 	}
 
 	/*
 	 * Try allocating the block in the usual metaslab class.
 	 * If that's full, allocate it in the normal class.
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
 	 * Note that we do not fall back on embedded slog (ZIL) space, to
 	 * preserve unfragmented slog space, which is critical for decent
 	 * sync write performance.  If a log allocation fails, we will fall
 	 * back to spa_sync() which is abysmal for performance.
 	 */
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio, zio->io_allocator);
 
 	/*
 	 * Fallback to normal class when an alloc class is full
 	 */
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		/*
 		 * If throttling, transfer reservation over to normal class.
 		 * The io_allocator slot can remain the same even though we
 		 * are switching classes.
 		 */
 		if (mc->mc_alloc_throttle_enabled &&
 		    (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
 			metaslab_class_throttle_unreserve(mc,
 			    zio->io_prop.zp_copies, zio->io_allocator, zio);
 			zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
 
 			VERIFY(metaslab_class_throttle_reserve(
 			    spa_normal_class(spa),
 			    zio->io_prop.zp_copies, zio->io_allocator, zio,
 			    flags | METASLAB_MUST_RESERVE));
 		}
 		zio->io_metaslab_class = mc = spa_normal_class(spa);
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying normal class: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 
 		error = metaslab_alloc(spa, mc, zio->io_size, bp,
 		    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 		    &zio->io_alloc_list, zio, zio->io_allocator);
 	}
 
 	/* still no space: split the write into a gang block if it can shrink */
 	if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying ganging: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		return (zio_write_gang_block(zio, mc));
 	}
 	if (error != 0) {
 		/* ENOSPC is only logged when allocation debugging is enabled */
 		if (error != ENOSPC ||
 		    (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
 			zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
 			    "size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		zio->io_error = error;
 	}
 
 	return (zio);
 }
 
 /*
  * Hand the DVAs of this zio's block pointer back via metaslab_free(),
  * using the zio's own txg, and return the same zio to the caller.
  */
 static zio_t *
 zio_dva_free(zio_t *zio)
 {
 	uint64_t free_txg = zio->io_txg;
 
 	metaslab_free(zio->io_spa, zio->io_bp, free_txg, B_FALSE);
 	return (zio);
 }
 
 /*
  * Claim the DVAs of this zio's block pointer via metaslab_claim().
  * A non-zero result is recorded in zio->io_error; the zio itself is
  * always returned.
  */
 static zio_t *
 zio_dva_claim(zio_t *zio)
 {
 	const int rc = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
 	if (rc != 0) {
 		zio->io_error = rc;
 	}
 	return (zio);
 }
 
 /*
  * Give back a block that was just allocated.  zio_done() relies on this
  * when an I/O fails.  Both normal blocks and gang blocks are handled: the
  * block itself is freed (unless it is a hole) and, when a gang node is
  * supplied, each of its child slots is unallocated recursively.
  */
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
 	ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp)) {
 		metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
 		    B_TRUE);
 	}
 
 	/* Nothing more to do for a block without a gang node. */
 	if (gn == NULL)
 		return;
 
 	for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 		zio_gang_node_t *child_gn = gn->gn_child[slot];
 		blkptr_t *child_bp = &gn->gn_gbh->zg_blkptr[slot];
 
 		zio_dva_unallocate(zio, child_gn, child_bp);
 	}
 }
 
 /*
  * Try to allocate an intent log block.
  *
  * The log class is tried first, then the embedded log class, and finally
  * the normal class.  *slog reports whether the first attempt (the log
  * class) succeeded.  On success new_bp is filled in with the ZIL block
  * properties and, for an encrypted objset, a salt and a new IV.
  *
  * Returns 0 on success, or the errno of the last failed attempt.
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
     uint64_t size, boolean_t *slog)
 {
 	zio_alloc_list_t trace_list;
 	int err;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&trace_list);
 
 	/*
 	 * The metaslab code uses block pointer fields for stats and
 	 * debugging, so set the obvious ones before metaslab_alloc().
 	 */
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_LEVEL(new_bp, 0);
 
 	/*
 	 * All we know about the block's final destination is the objset it
 	 * belongs to, so hash the objset ID to choose an allocator and get
 	 * some parallelism.
 	 */
 	const int alloc_flags = METASLAB_ZIL;
 	const int slot = (uint_t)cityhash4(0, 0, 0,
 	    os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
 
 	err = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, alloc_flags, &trace_list, NULL, slot);
 	*slog = (err == 0);
 	if (err != 0) {
 		/* First fallback: the embedded log class. */
 		err = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
 		    new_bp, 1, txg, NULL, alloc_flags,
 		    &trace_list, NULL, slot);
 		if (err != 0) {
 			/* Last resort: the normal class. */
 			err = metaslab_alloc(spa, spa_normal_class(spa), size,
 			    new_bp, 1, txg, NULL, alloc_flags,
 			    &trace_list, NULL, slot);
 		}
 	}
 	metaslab_trace_fini(&trace_list);
 
 	if (err != 0) {
 		zfs_dbgmsg("%s: zil block allocation failure: "
 		    "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
 		    err);
 		return (err);
 	}
 
 	BP_SET_LSIZE(new_bp, size);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(new_bp,
 	    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
 	    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_LEVEL(new_bp, 0);
 	BP_SET_DEDUP(new_bp, 0);
 	BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 
 	/*
 	 * Encrypted blocks need an IV and salt.  They are produced here
 	 * because the bp will not be rewritten at rewrite time.
 	 */
 	if (os->os_encrypted) {
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 
 		BP_SET_CRYPT(new_bp, B_TRUE);
 		VERIFY0(spa_crypt_get_salt(spa,
 		    dmu_objset_id(os), salt));
 		VERIFY0(zio_crypt_generate_iv(iv));
 
 		zio_crypt_encode_params_bp(new_bp, salt, iv);
 	}
 
 	return (err);
 }
 
 /*
  * ==========================================================================
  * Read and write to physical devices
  * ==========================================================================
  */
 
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
  * However, there are instances where the vdev layer may need to
  * continue the pipeline when an I/O was not issued. Since the I/O
  * that was sent to the vdev layer might be different than the one
  * currently active in the pipeline (see vdev_queue_io()), we explicitly
  * force the underlying vdev layers to call either zio_execute() or
  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
  */
 static zio_t *
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
 
 	/*
 	 * Start with no delay recorded; a start timestamp is stored below
 	 * only for I/Os that go through the leaf vdev queue.
 	 */
 	zio->io_delay = 0;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
 	if (vd == NULL) {
 		/*
 		 * No vdev is attached to this zio.  Unless the caller is
 		 * flagged as a config writer, take SCL_ZIO as reader; the
 		 * matching spa_config_exit() is in zio_vdev_io_assess().
 		 */
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
 		vdev_mirror_ops.vdev_op_io_start(zio);
 		return (NULL);
 	}
 
 	ASSERT3P(zio->io_logical, !=, zio);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		ASSERT(spa->spa_trust_config);
 
 		/*
 		 * Note: the code can handle other kinds of writes,
 		 * but we don't expect them.
 		 */
 		if (zio->io_vd->vdev_noalloc) {
 			ASSERT(zio->io_flags &
 			    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
 			    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
 		}
 	}
 
 	/* Alignment is taken from the top-level vdev's ashift. */
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
 		ASSERT(vd == vd->vdev_top);
 		/*
 		 * Only writes need the payload copied into the padded
 		 * buffer, with the tail zero-filled; for other I/O types
 		 * the buffer is pushed as allocated.
 		 */
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			abd_copy(abuf, zio->io_abd, zio->io_size);
 			abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
 	}
 
 	/*
 	 * If this is not a physical io, make sure that it is properly aligned
 	 * before proceeding.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
 		ASSERT0(P2PHASE(zio->io_offset, align));
 		ASSERT0(P2PHASE(zio->io_size, align));
 	} else {
 		/*
 		 * For physical writes, we allow 512b aligned writes and assume
 		 * the device will perform a read-modify-write as necessary.
 		 */
 		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
 		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
 	}
 
 	/* Writes are only legal while the pool is writeable. */
 	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
 	 * that is, we're just resilvering what we expect to resilver --
 	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
 	 * This prevents spurious resilvering.
 	 *
 	 * There are a few ways that we can end up creating these spurious
 	 * resilver i/os:
 	 *
 	 * 1. A resilver i/o will be issued if any DVA in the BP has a
 	 * dirty DTL.  The mirror code will issue resilver writes to
 	 * each DVA, including the one(s) that are not on vdevs with dirty
 	 * DTLs.
 	 *
 	 * 2. With nested replication, which happens when we have a
 	 * "replacing" or "spare" vdev that's a child of a mirror or raidz.
 	 * For example, given mirror(replacing(A+B), C), it's likely that
 	 * only A is out of date (it's the new device). In this case, we'll
 	 * read from C, then use the data to resilver A+B -- but we don't
 	 * actually want to resilver B, just A. The top-level mirror has no
 	 * way to know this, so instead we just discard unnecessary repairs
 	 * as we work our way down the vdev tree.
 	 *
 	 * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
 	 * The same logic applies to any form of nested replication: ditto
 	 * + mirror, RAID-Z + replacing, etc.
 	 *
 	 * However, indirect vdevs point off to other vdevs which may have
 	 * DTL's, so we never bypass them.  The child i/os on concrete vdevs
 	 * will be properly bypassed instead.
 	 *
 	 * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
 	 * a dRAID spare vdev. For example, when a dRAID spare is first
 	 * used, its spare blocks need to be written to but the leaf vdev's
 	 * of such blocks can have empty DTL_PARTIAL.
 	 *
 	 * There seemed no clean way to allow such writes while bypassing
 	 * spurious ones. At this point, just avoid all bypassing for dRAID
 	 * for correctness.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    vd->vdev_ops != &vdev_indirect_ops &&
 	    vd->vdev_top->vdev_ops != &vdev_draid_ops &&
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		/*
 		 * Skip the device I/O.  Returning the zio (not NULL) lets
 		 * the caller carry on with it from the stage that
 		 * zio_vdev_io_bypass() selected.
 		 */
 		zio_vdev_io_bypass(zio);
 		return (zio);
 	}
 
 	/*
 	 * Select the next best leaf I/O to process.  Distributed spares are
 	 * excluded since they dispatch the I/O directly to a leaf vdev after
 	 * applying the dRAID mapping.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops &&
 	    (zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {
 
 		/*
 		 * From here on 'zio' is whatever vdev_queue_io() handed
 		 * back, which may differ from the zio we were called with;
 		 * NULL means there is nothing to issue right now.
 		 */
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);
 
 		if (!vdev_accessible(vd, zio)) {
 			/* Fail with ENXIO via zio_interrupt(), no device I/O. */
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return (NULL);
 		}
 		/* Start timestamp; zio_vdev_io_done() turns it into a delta. */
 		zio->io_delay = gethrtime();
 	}
 
 	vd->vdev_ops->vdev_op_io_start(zio);
 	return (NULL);
 }
 
 /*
  * Completion side of the vdev I/O.  Once all vdev children are done,
  * finish the io_delay measurement, do the leaf-vdev bookkeeping (queue
  * completion, fault injection, error classification), call the vdev's
  * io_done op, and probe the vdev if an unexpected error was seen.
  *
  * Returns NULL while vdev children are still outstanding, otherwise the
  * zio.
  */
 static zio_t *
 zio_vdev_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	/* A zio without a vdev is handled by the mirror ops. */
 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t unexpected_error = B_FALSE;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_IOCTL ||
 	    zio->io_type == ZIO_TYPE_TRIM);
 
 	/*
 	 * A non-zero io_delay holds the start timestamp recorded by
 	 * zio_vdev_io_start(); convert it into the elapsed time.
 	 */
 	if (zio->io_delay)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
 	/* Leaf vdevs only, excluding distributed (dRAID) spares. */
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		if (zio->io_type != ZIO_TYPE_IOCTL)
 			vdev_queue_io_done(zio);
 
 		/* Injected faults never replace an error already present. */
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_device_injections(vd, zio,
 			    EIO, EILSEQ);
 
 		if (zio_injection_enabled && zio->io_error == 0)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		/*
 		 * For failures other than TRIM: an inaccessible device is
 		 * reported as ENXIO; any other failure counts as unexpected
 		 * and triggers the probe below.
 		 */
 		if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
 			if (!vdev_accessible(vd, zio)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				unexpected_error = B_TRUE;
 			}
 		}
 	}
 
 	ops->vdev_op_io_done(zio);
 
 	/*
 	 * unexpected_error is only ever set inside the vd != NULL branch
 	 * above, so dereferencing vd here is safe.
 	 */
 	if (unexpected_error && vd->vdev_remove_wanted == B_FALSE)
 		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (zio);
 }
 
 /*
  * Change the priority of a zio that is already in flight, together with
  * all of its children.  The arc uses this to upgrade priority when a
  * demand read arrives for a block that is currently queued as a scrub or
  * async read IO; otherwise the high priority request would be stuck
  * waiting behind the lower priority IO.
  */
 void
 zio_change_priority(zio_t *pio, zio_priority_t priority)
 {
 	zio_link_t *zl = NULL;
 	zio_t *child;
 
 	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/* Only a zio on a leaf vdev goes through the vdev queue code. */
 	if (pio->io_vd == NULL || !pio->io_vd->vdev_ops->vdev_op_leaf) {
 		pio->io_priority = priority;
 	} else {
 		vdev_queue_change_io_priority(pio, priority);
 	}
 
 	mutex_enter(&pio->io_lock);
 	child = zio_walk_children(pio, &zl);
 	while (child != NULL) {
 		/* Fetch the successor before recursing into this child. */
 		zio_t *next = zio_walk_children(pio, &zl);
 
 		zio_change_priority(child, priority);
 		child = next;
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Default checksum-report finisher for non-raidz ZIOs.  The bad data that
  * was read from disk has been copied aside into zcr->zcr_cbdata, so the
  * checksum ereport can be finished from that copy and the good buffer
  * without further processing.
  */
 static void
 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
     const abd_t *good_buf)
 {
 	zfs_ereport_finish_checksum(zcr, good_buf,
 	    zcr->zcr_cbdata, B_FALSE);
 }
 
 /*
  * Default checksum-report setup: copy the zio's current data into a newly
  * allocated abd of the same type and attach it to the report, together
  * with its size and the default finish and free callbacks.
  */
 void
 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
 {
 	void *bad_copy;
 
 	bad_copy = abd_alloc_sametype(zio->io_abd, zio->io_size);
 	abd_copy(bad_copy, zio->io_abd, zio->io_size);
 
 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 	zcr->zcr_free = zio_abd_free;
 	zcr->zcr_cbdata = bad_copy;
 	zcr->zcr_cbinfo = zio->io_size;
 }
 
 /*
  * Assess the outcome of the vdev I/O once all vdev children are done:
  * drop the config lock taken in zio_vdev_io_start() for vdev-less zios,
  * free vdev-specific data, apply fault injection, retry a failed
  * vdev-less zio once, and translate or record selected errors.
  *
  * Returns NULL when waiting for children or when the zio was re-dispatched
  * for a retry; otherwise returns the zio.
  */
 static zio_t *
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	/* Same condition as the spa_config_enter() in zio_vdev_io_start(). */
 	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
 	if (zio->io_vsd != NULL) {
 		zio->io_vsd_ops->vsd_free(zio);
 		zio->io_vsd = NULL;
 	}
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
 	 *
 	 * On retry, we cut in line in the issue queue, since we don't want
 	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
 	 */
 	if (zio->io_error && vd == NULL &&
 	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
 		/* ZIO_FLAG_IO_RETRY also stops this branch running twice. */
 		zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
 		/*
 		 * Rewind to the stage just before VDEV_IO_START, presumably
 		 * so that VDEV_IO_START is the next stage executed.
 		 */
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
 		return (NULL);
 	}
 
 	/*
 	 * If we got an error on a leaf device, convert it to ENXIO
 	 * if the device is not accessible at all.
 	 */
 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    !vdev_accessible(vd, zio))
 		zio->io_error = SET_ERROR(ENXIO);
 
 	/*
 	 * If we can't write to an interior vdev (mirror or RAID-Z),
 	 * set vdev_cant_write so that we stop trying to allocate from it.
 	 */
 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
 	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
 		vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting "
 		    "cant_write=TRUE due to write failure with ENXIO",
 		    zio);
 		vd->vdev_cant_write = B_TRUE;
 	}
 
 	/*
 	 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
 	 * attempts will ever succeed. In this case we set a persistent
 	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
-	    zio->io_type == ZIO_TYPE_IOCTL &&
-	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+	    zio->io_type == ZIO_TYPE_IOCTL && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;
 
 	/* On error, switch the zio to the interlock pipeline. */
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Step a zio that is currently in VDEV_IO_START (and has no error) back
  * by one stage bit.
  */
 void
 zio_vdev_io_reissue(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Step a zio that is currently in VDEV_IO_DONE back by one stage bit.
  */
 void
 zio_vdev_io_redone(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Flag a zio in VDEV_IO_START (with no error) as bypassed and move its
  * stage to the bit just below VDEV_IO_ASSESS.
  */
 void
 zio_vdev_io_bypass(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 	zio->io_flags = zio->io_flags | ZIO_FLAG_IO_BYPASS;
 }
 
 /*
  * ==========================================================================
  * Encrypt and store encryption parameters
  * ==========================================================================
  */
 
 
 /*
  * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
  * managing the storage of encryption parameters and passing them to the
  * lower-level encryption functions.
  */
 static zio_t *
 zio_encrypt(zio_t *zio)
 {
 	zio_prop_t *zp = &zio->io_prop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_GET_PSIZE(bp);
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	void *enc_buf = NULL;
 	abd_t *eabd = NULL;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/*
 	 * The checks below are ordered: each early return handles one
 	 * class of block, and only blocks that need real encryption reach
 	 * the code at the bottom.
 	 */
 
 	/* the root zio already encrypted the data */
 	if (zio->io_child_type == ZIO_CHILD_GANG)
 		return (zio);
 
 	/* only ZIL blocks are re-encrypted on rewrite */
 	if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
 		return (zio);
 
 	/*
 	 * Neither the write properties nor the existing bp ask for
 	 * encryption: mark the bp as not encrypted and stop.
 	 */
 	if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
 		BP_SET_CRYPT(bp, B_FALSE);
 		return (zio);
 	}
 
 	/* if we are doing raw encryption set the provided encryption params */
 	if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
 		ASSERT0(BP_GET_LEVEL(bp));
 		BP_SET_CRYPT(bp, B_TRUE);
 		BP_SET_BYTEORDER(bp, zp->zp_byteorder);
 		if (ot != DMU_OT_OBJSET)
 			zio_crypt_encode_mac_bp(bp, zp->zp_mac);
 
 		/* dnode blocks must be written out in the provided byteorder */
 		if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
 		    ot == DMU_OT_DNODE) {
 			void *bswap_buf = zio_buf_alloc(psize);
 			abd_t *babd = abd_get_from_buf(bswap_buf, psize);
 
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
 			dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
 			    psize);
 
 			/*
 			 * Hand bswap_buf over to babd, then push the
 			 * byteswapped copy as the data to write.
 			 */
 			abd_take_ownership_of_buf(babd, B_TRUE);
 			zio_push_transform(zio, babd, psize, psize, NULL);
 		}
 
 		if (DMU_OT_IS_ENCRYPTED(ot))
 			zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
 		return (zio);
 	}
 
 	/* indirect blocks only maintain a cksum of the lower level MACs */
 	if (BP_GET_LEVEL(bp) > 0) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
 		    zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
 		    mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Objset blocks are a special case since they have 2 256-bit MACs
 	 * embedded within them.
 	 */
 	if (ot == DMU_OT_OBJSET) {
 		ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
 		ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
 		return (zio);
 	}
 
 	/* unencrypted object types are only authenticated with a MAC */
 	if (!DMU_OT_IS_ENCRYPTED(ot)) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Later passes of sync-to-convergence may decide to rewrite data
 	 * in place to avoid more disk reallocations. This presents a problem
 	 * for encryption because this constitutes rewriting the new data with
 	 * the same encryption key and IV. However, this only applies to blocks
 	 * in the MOS (particularly the spacemaps) and we do not encrypt the
 	 * MOS. We assert that the zio is allocating or an intent log write
 	 * to enforce this.
 	 */
 	ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
 	ASSERT3U(psize, !=, 0);
 
 	/* Destination buffer for the ciphertext; eabd takes ownership. */
 	enc_buf = zio_buf_alloc(psize);
 	eabd = abd_get_from_buf(enc_buf, psize);
 	abd_take_ownership_of_buf(eabd, B_TRUE);
 
 	/*
 	 * For an explanation of what encryption parameters are stored
 	 * where, see the block comment in zio_crypt.c.
 	 */
 	if (ot == DMU_OT_INTENT_LOG) {
 		/*
 		 * ZIL blocks already have salt and IV encoded in the bp
 		 * (zio_alloc_zil() does this for encrypted objsets), so
 		 * read them back instead of producing new ones.
 		 */
 		zio_crypt_decode_params_bp(bp, salt, iv);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 	}
 
 	/*
 	 * NOTE(review): for non-ZIL blocks salt and iv are not initialized
 	 * before this call yet are encoded into the bp afterwards, so
 	 * spa_do_crypt_abd() presumably fills them in -- confirm there.
 	 */
 	/* Perform the encryption. This should not fail */
 	VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
 	    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 	    salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
 
 	/* encode encryption metadata into the bp */
 	if (ot == DMU_OT_INTENT_LOG) {
 		/*
 		 * ZIL blocks store the MAC in the embedded checksum, so the
 		 * transform must always be applied.
 		 */
 		zio_crypt_encode_mac_zil(enc_buf, mac);
 		zio_push_transform(zio, eabd, psize, psize, NULL);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 		zio_crypt_encode_params_bp(bp, salt, iv);
 		zio_crypt_encode_mac_bp(bp, mac);
 
 		if (no_crypt) {
 			/*
 			 * no_crypt was set by spa_do_crypt_abd(); the
 			 * encrypted copy is not used, so release it.
 			 */
 			ASSERT3U(ot, ==, DMU_OT_DNODE);
 			abd_free(eabd);
 		} else {
 			zio_push_transform(zio, eabd, psize, psize, NULL);
 		}
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Generate and verify checksums
  * ==========================================================================
  */
 /*
  * Choose the checksum algorithm for this zio and compute the checksum
  * over its data.  Without a block pointer the algorithm comes from the
  * zio's properties and may be "off", in which case nothing is computed.
  */
 static zio_t *
 zio_checksum_generate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 
 	if (bp != NULL) {
 		if (!BP_IS_GANG(bp) ||
 		    zio->io_child_type != ZIO_CHILD_GANG) {
 			checksum = BP_GET_CHECKSUM(bp);
 		} else {
 			/* Gang child zio writing to a gang block pointer. */
 			ASSERT(!IO_IS_ALLOCATING(zio));
 			checksum = ZIO_CHECKSUM_GANG_HEADER;
 		}
 	} else {
 		/*
 		 * No block pointer means this is zio_write_phys(): the
 		 * checksum is either a label checksum or absent.
 		 */
 		checksum = zio->io_prop.zp_checksum;
 
 		if (checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
 	}
 
 	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
 
 	return (zio);
 }
 
 /*
  * Verify the checksum of the data that was read.  Any failure is stored
  * in zio->io_error.  An ECKSUM on a non-speculative zio additionally
  * bumps the vdev's checksum error counter and starts a checksum ereport.
  */
 static zio_t *
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t bad_info;
 	int err;
 
 	ASSERT(zio->io_vd != NULL);
 
 	if (zio->io_bp == NULL) {
 		/*
 		 * No block pointer means this is zio_read_phys(): there is
 		 * either a label checksum to verify or nothing at all.
 		 */
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
 	err = zio_checksum_error(zio, &bad_info);
 	if (err == 0)
 		return (zio);
 
 	zio->io_error = err;
 
 	/* Only non-speculative checksum mismatches are counted/reported. */
 	if (err != ECKSUM || (zio->io_flags & ZIO_FLAG_SPECULATIVE) != 0)
 		return (zio);
 
 	mutex_enter(&zio->io_vd->vdev_stat_lock);
 	zio->io_vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&zio->io_vd->vdev_stat_lock);
 	(void) zfs_ereport_start_checksum(zio->io_spa,
 	    zio->io_vd, &zio->io_bookmark, zio,
 	    zio->io_offset, zio->io_size, &bad_info);
 
 	return (zio);
 }
 
 /*
  * RAID-Z calls this so that the checksum is not computed a second time:
  * the CHECKSUM_VERIFY stage is cleared from this zio's pipeline.
  */
 void
 zio_checksum_verified(zio_t *zio)
 {
 	zio->io_pipeline = zio->io_pipeline & ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
  * ==========================================================================
  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
  *
  * Returns whichever of e1 and e2 ranks worse.  When both have the same
  * rank (including two different unlisted errors), e2 is returned.
  * ==========================================================================
  */
 int
 zio_worst_error(int e1, int e2)
 {
 	static const int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
 	const size_t nranks =
 	    sizeof (zio_error_rank) / sizeof (zio_error_rank[0]);
 	size_t r1, r2;
 
 	/* An error missing from the table ends up with rank == nranks. */
 	for (r1 = 0; r1 < nranks; r1++) {
 		if (e1 == zio_error_rank[r1])
 			break;
 	}
 
 	for (r2 = 0; r2 < nranks; r2++) {
 		if (e2 == zio_error_rank[r2])
 			break;
 	}
 
 	return (r1 > r2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
  * I/O completion
  * ==========================================================================
  */
 /*
  * Runs once the logical, gang and DDT children have reached the ready
  * state.  Calls the io_ready callback, handles an error seen so far
  * (including releasing a throttle reservation), marks the zio ready and
  * notifies its parents.
  *
  * Returns NULL while still waiting on children, otherwise the zio.
  */
 static zio_t *
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
 		    BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
 	}
 
 	/*
 	 * Debug builds snapshot the bp here; zio_done() later compares the
 	 * bp against io_bp_copy.
 	 */
 #ifdef ZFS_DEBUG
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 #endif
 
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
 			ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 			/*
 			 * We were unable to allocate anything, unreserve and
 			 * issue the next I/O to allocate.
 			 */
 			metaslab_class_throttle_unreserve(
 			    zio->io_metaslab_class, zio->io_prop.zp_copies,
 			    zio->io_allocator, zio);
 			zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
 		}
 	}
 
 	/*
 	 * Set the ready state and fetch the first parent under io_lock; the
 	 * rest of the parent walk below runs without the lock (see the
 	 * comment that follows for why that is safe).
 	 */
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
 	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * As we notify zio's parents, new parents could be added.
 	 * New parents go to the head of zio's io_parent_list, however,
 	 * so we will (correctly) not notify them.  The remainder of zio's
 	 * io_parent_list, from 'pio_next' onward, cannot change because
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
 	}
 
 	/*
 	 * A NODATA zio whose bp turned out to be a gang block drops the
 	 * flag; otherwise the vdev I/O stages are removed from its pipeline.
 	 */
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		if (bp != NULL && BP_IS_GANG(bp)) {
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
 			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
 
 	if (zio_injection_enabled &&
 	    zio->io_spa->spa_syncing_txg == zio->io_txg)
 		zio_handle_ignored_writes(zio);
 
 	return (zio);
 }
 
 /*
  * Update the allocation throttle accounting.
  */
 static void
 zio_dva_throttle_done(zio_t *zio)
 {
 	/* lio is only referenced from ASSERTs, hence __maybe_unused. */
 	zio_t *lio __maybe_unused = zio->io_logical;
 	zio_t *pio = zio_unique_parent(zio);
 	vdev_t *vd = zio->io_vd;
 	int flags = METASLAB_ASYNC_ALLOC;
 
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT(vd != NULL);
 	ASSERT3P(vd, ==, vd->vdev_top);
 	ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
 	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
 
 	/*
 	 * Parents of gang children can have two flavors -- ones that
 	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
 	 * and ones that allocated the constituent blocks. The allocation
 	 * throttle needs to know the allocating parent zio so we must find
 	 * it here.
 	 */
 	if (pio->io_child_type == ZIO_CHILD_GANG) {
 		/*
 		 * If our parent is a rewrite gang child then our grandparent
 		 * would have been the one that performed the allocation.
 		 */
 		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
 			pio = zio_unique_parent(pio);
 		flags |= METASLAB_GANG_CHILD;
 	}
 
 	/* From here on pio is the zio that performed the allocation. */
 	ASSERT(IO_IS_ALLOCATING(pio));
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
 	ASSERT(zio->io_metaslab_class != NULL);
 
 	/* The group decrement runs with the allocating zio's io_lock held. */
 	mutex_enter(&pio->io_lock);
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
 	    pio->io_allocator, B_TRUE);
 	mutex_exit(&pio->io_lock);
 
 	/*
 	 * Release one reservation, accounted against pio, for this child
 	 * write.
 	 */
 	metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
 	    pio->io_allocator, pio);
 
 	/*
 	 * Call into the pipeline to see if there is more work that
 	 * needs to be done. If there is work to be done it will be
 	 * dispatched to another taskq thread.
 	 */
 	zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
 }
 
 static zio_t *
 zio_done(zio_t *zio)
 {
 	/*
 	 * Always attempt to keep stack usage minimal here since
 	 * we can be called recursively up to 19 levels deep.
 	 */
 	const uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, then update the accounting.
 	 * We only track child I/Os that are part of an allocating async
 	 * write. We must do this since the allocation is performed
 	 * by the logical I/O but the actual write is done by child I/Os.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
 	    zio->io_child_type == ZIO_CHILD_VDEV) {
 		ASSERT(zio->io_metaslab_class != NULL);
 		ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
 		zio_dva_throttle_done(zio);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, verify that
 	 * we have decremented the refcounts for every I/O that was throttled.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(zio->io_bp != NULL);
 		ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 		metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
 		    zio->io_allocator);
 		VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
 		    mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
 	}
 
 
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
 		ASSERT(zio->io_bp->blk_pad[0] == 0);
 		ASSERT(zio->io_bp->blk_pad[1] == 0);
 		ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
 		    sizeof (blkptr_t)) == 0 ||
 		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT3U(zio->io_prop.zp_copies, <=,
 			    BP_GET_NDVAS(zio->io_bp));
 			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
 			    (BP_COUNT_GANG(zio->io_bp) ==
 			    BP_GET_NDVAS(zio->io_bp)));
 		}
 		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
 			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	}
 
 	/*
 	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
 
 	/*
 	 * If the I/O on the transformed data was successful, generate any
 	 * checksum reports now while we still have the transformed data.
 	 */
 	if (zio->io_error == 0) {
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
 			abd_t *adata = zio->io_abd;
 
 			if (adata != NULL && asize != psize) {
 				adata = abd_alloc(asize, B_TRUE);
 				abd_copy(adata, zio->io_abd, psize);
 				abd_zero_off(adata, psize, asize - psize);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, adata);
 			zfs_ereport_free_checksum(zcr);
 
 			if (adata != NULL && asize != psize)
 				abd_free(adata);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
 	vdev_stat_update(zio, psize);
 
 	/*
 	 * If this I/O is attached to a particular vdev and is slow, exceeding
 	 * 30 seconds to complete, post an error describing the I/O delay.
 	 * We ignore these errors if the device is currently unavailable.
 	 */
 	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
 		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
 			/*
 			 * We want to only increment our slow IO counters if
 			 * the IO is valid (i.e. not if the drive is removed).
 			 *
 			 * zfs_ereport_post() will also do these checks, but
 			 * it can also ratelimit and have other failures, so we
 			 * need to increment the slow_io counters independent
 			 * of it.
 			 */
 			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
 			    zio->io_spa, zio->io_vd, zio)) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				zio->io_vd->vdev_stat.vs_slow_ios++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 
 				(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
 				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
 				    zio, 0);
 			}
 		}
 	}
 
 	if (zio->io_error) {
 		/*
 		 * If this I/O is attached to a particular vdev,
 		 * generate an error message describing the I/O failure
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
 		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
 		    !vdev_is_dead(zio->io_vd)) {
 			int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
 			    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
 			if (ret != EALREADY) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				if (zio->io_type == ZIO_TYPE_READ)
 					zio->io_vd->vdev_stat.vs_read_errors++;
 				else if (zio->io_type == ZIO_TYPE_WRITE)
 					zio->io_vd->vdev_stat.vs_write_errors++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 			}
 		}
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
 		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark,
 			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	}
 
 	if (zio->io_error && zio == zio->io_logical) {
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
 		 * either combinatorial reconstruction or error correction
 		 * based on checksums.  It also might be a good place
 		 * to send out preliminary ereports before we suspend
 		 * processing.
 		 */
 	}
 
 	/*
 	 * If there were logical child errors, they apply to us now.
 	 * We defer this until now to avoid conflating logical child
 	 * errors with errors that happened to the zio itself when
 	 * updating vdev stats and reporting FMA events above.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
 	/*
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
 		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
 
 	if (zio->io_reexecute) {
 		/*
 		 * This is a logical I/O that wants to reexecute.
 		 *
 		 * Reexecute is top-down.  When an i/o fails, if it's not
 		 * the root, it simply notifies its parent and sticks around.
 		 * The parent, seeing that it still has children in zio_done(),
 		 * does the same.  This percolates all the way up to the root.
 		 * The root i/o will reexecute or suspend the entire tree.
 		 *
 		 * This approach ensures that zio_reexecute() honors
 		 * all the original i/o dependency relationships, e.g.
 		 * parents not executing until children are ready.
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		zio->io_gang_leader = NULL;
 
 		mutex_enter(&zio->io_lock);
 		zio->io_state[ZIO_WAIT_DONE] = 1;
 		mutex_exit(&zio->io_lock);
 
 		/*
 		 * "The Godfather" I/O monitors its children but is
 		 * not a true parent to them. It will track them through
 		 * the pipeline but severs its ties whenever they get into
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
 		zl = NULL;
 		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = pio_next) {
 			zio_link_t *remove_zl = zl;
 			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
 				zio_remove_child(pio, zio, remove_zl);
 				/*
 				 * This is a rare code path, so we don't
 				 * bother with "next_to_execute".
 				 */
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
 				    NULL);
 			}
 		}
 
 		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			/*
 			 * This is a rare code path, so we don't bother with
 			 * "next_to_execute".
 			 */
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
 		} else {
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
 			spa_taskq_dispatch_ent(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
 			    zio_reexecute, zio, 0, &zio->io_tqent, NULL);
 		}
 		return (NULL);
 	}
 
 	ASSERT(list_is_empty(&zio->io_child_list));
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*
 	 * Report any checksum errors, since the I/O is complete.
 	 */
 	while (zio->io_cksum_report != NULL) {
 		zio_cksum_report_t *zcr = zio->io_cksum_report;
 		zio->io_cksum_report = zcr->zcr_next;
 		zcr->zcr_next = NULL;
 		zcr->zcr_finish(zcr, NULL);
 		zfs_ereport_free_checksum(zcr);
 	}
 
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
 	 * such, cannot acquire any new parents.
 	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * We are done executing this zio.  We may want to execute a parent
 	 * next.  See the comment in zio_notify_parent().
 	 */
 	zio_t *next_to_execute = NULL;
 	zl = NULL;
 	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
 		zio_link_t *remove_zl = zl;
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
 	}
 
 	if (zio->io_waiter != NULL) {
 		mutex_enter(&zio->io_lock);
 		zio->io_executor = NULL;
 		cv_broadcast(&zio->io_cv);
 		mutex_exit(&zio->io_lock);
 	} else {
 		zio_destroy(zio);
 	}
 
 	return (next_to_execute);
 }
 
 /*
  * ==========================================================================
  * I/O pipeline definition
  * ==========================================================================
  */
 
 /*
  * Table of pipeline stage handlers.  Slot 0 is deliberately NULL (no handler
  * is registered for it) and the final slot is zio_done().
  *
  * NOTE(review): the entries presumably correspond, in order, to the stage
  * bits of enum zio_stage, with the table indexed by the bit position of the
  * stage being executed -- confirm against the enum definition before adding,
  * removing or reordering entries, since the positions are significant.
  */
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
 	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
 	zio_write_compress,
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_brt_free,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
 	zio_ready,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
 	zio_checksum_verify,
 	zio_done
 };
 
 
 
 
 /*
  * Compare two zbookmark_phys_t's to see which we would reach first in a
  * pre-order traversal of the object tree.
  *
  * This is simple in every case aside from the meta-dnode object. For all other
  * objects, we traverse them in order (object 1 before object 2, and so on).
  * However, all of these objects are traversed while traversing object 0, since
  * the data it points to is the list of objects.  Thus, we need to convert to a
  * canonical representation so we can compare meta-dnode bookmarks to
  * non-meta-dnode bookmarks.
  *
  * We do this by calculating "equivalents" for each field of the zbookmark.
  * zbookmarks outside of the meta-dnode use their own object and level, and
  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
  * blocks this bookmark refers to) by multiplying their blkid by their span
  * (the number of L0 blocks contained within one block at their level).
  * zbookmarks inside the meta-dnode calculate their object equivalent
  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
  * level + 1<<31 (any value larger than a level could ever be) for their level.
  * This causes them to always compare before a bookmark in their object
  * equivalent, compare appropriately to bookmarks in other objects, and to
  * compare appropriately to other bookmarks in the meta-dnode.
  */
 int
 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
 {
 	/* Bookmarks that match field-for-field need no canonicalization. */
 	if (zb1->zb_object == zb2->zb_object &&
 	    zb1->zb_level == zb2->zb_level &&
 	    zb1->zb_blkid == zb2->zb_blkid)
 		return (0);
 
 	IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
 	IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
 
 	/*
 	 * Level-0 equivalent of each bookmark: its blkid scaled by the span
 	 * (in blocks, as computed by BP_SPANB) of one block at its level.
 	 */
 	uint64_t l0_a = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
 	uint64_t l0_b = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
 	/*
 	 * Canonical object and level.  Bookmarks outside the meta-dnode use
 	 * their own values unchanged; those are the defaults set here.
 	 */
 	uint64_t obj_a = zb1->zb_object;
 	uint64_t obj_b = zb2->zb_object;
 	uint64_t lvl_a = zb1->zb_level;
 	uint64_t lvl_b = zb2->zb_level;
 
 	/*
 	 * A meta-dnode bookmark is converted to its normal-object equivalent:
 	 * the object is derived from the L0 equivalent, the L0 equivalent
 	 * becomes 0, and the level is biased by COMPARE_META_LEVEL.
 	 */
 	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
 		obj_a = l0_a * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		l0_a = 0;
 		lvl_a = zb1->zb_level + COMPARE_META_LEVEL;
 	}
 
 	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
 		obj_b = l0_b * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		l0_b = 0;
 		lvl_b = zb2->zb_level + COMPARE_META_LEVEL;
 	}
 
 	/*
 	 * Compare the canonical forms: lower object first, then lower L0
 	 * equivalent first, then the HIGHER level first.
 	 */
 	if (obj_a != obj_b)
 		return (obj_a < obj_b ? -1 : 1);
 	if (l0_a != l0_b)
 		return (l0_a < l0_b ? -1 : 1);
 	if (lvl_a != lvl_b)
 		return (lvl_a > lvl_b ? -1 : 1);
 
 	/*
 	 * Reaching this point is (theoretically) possible when the bookmarks
 	 * share an object and level but have different blkids, which requires
 	 * differing block sizes.  There is presently no way to change the
 	 * indirect block sizes.
 	 */
 	return (0);
 }
 
 /*
  *  This function checks the following: given that last_block is the place that
  *  our traversal stopped last time, does that guarantee that we've visited
  *  every node under subtree_root?  Therefore, we can't just use the raw output
  *  of zbookmark_compare.  We have to pass in a modified version of
  *  subtree_root; by incrementing the block id, and then checking whether
  *  last_block is before or equal to that, we can tell whether or not having
  *  visited last_block implies that all of subtree_root's children have been
  *  visited.
  */
 boolean_t
 zbookmark_subtree_completed(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	/* Private copy of subtree_root with its block id advanced by one. */
 	zbookmark_phys_t next_zb = *subtree_root;
 	next_zb.zb_blkid += 1;
 
 	ASSERT0(last_block->zb_level);
 
 	/* With no dnode (the objset_phys_t case) nothing counts as before. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * Arguments for the last_block side of the comparison:
 	 *
 	 * - The data block size in sectors is always given as
 	 *   1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT).  That value only
 	 *   matters when the bookmark is a meta-dnode block; we cannot tell
 	 *   which object it refers to without examining it, and supplying the
 	 *   value in the other cases is harmless.
 	 *
 	 * - The indirect block shift is given as 0.  It is only used to
 	 *   compute the bookmark's span, and last_block must be level 0, so
 	 *   its span is always 1 regardless.
 	 *
 	 * Anyone changing zbookmark_compare() must re-check that these
 	 * assumptions still hold here.
 	 */
 	int order = zbookmark_compare(dnp->dn_datablkszsec,
 	    dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &next_zb,
 	    last_block);
 
 	/* Completed iff the advanced bookmark is at or before last_block. */
 	return (order <= 0);
 }
 
 /*
  * This function is similar to zbookmark_subtree_completed(), but returns true
  * if subtree_root is equal or ahead of last_block, i.e. still to be done.
  */
 boolean_t
 zbookmark_subtree_tbd(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	ASSERT0(last_block->zb_level);
 
 	/* Without a dnode there is nothing to report as still to be done. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * Compare subtree_root (unmodified) against last_block.  last_block
 	 * is level 0, so it is given an indirect block shift of 0 and the
 	 * fixed data block size in sectors used for meta-dnode blocks.
 	 */
 	int order = zbookmark_compare(dnp->dn_datablkszsec,
 	    dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root,
 	    last_block);
 
 	/* Still to be done iff subtree_root is at or after last_block. */
 	return (order >= 0);
 }
 
 /* Symbols exported from this file: the type-name table and buffer helpers. */
 EXPORT_SYMBOL(zio_type_name);
 EXPORT_SYMBOL(zio_buf_alloc);
 EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);
 
 /*
  * Module parameter registrations.  The trailing string of each entry is the
  * parameter's description.
  *
  * NOTE(review): the first two arguments presumably form the scope and the
  * variable-name prefix (e.g. zio_ + slow_io_ms), and ZMOD_RW presumably marks
  * the parameter as readable and writable at runtime -- confirm against the
  * ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
 	"Max I/O completion time (milliseconds) before marking it as slow");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
 	"Prioritize requeued I/O");
 
 /* Sync-pass thresholds (registered under the zfs scope, not zfs_zio). */
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free,  UINT, ZMOD_RW,
 	"Defer frees starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW,
 	"Don't compress starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW,
 	"Rewrite new bps starting in this pass");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
 	"Throttle block allocations in the ZIO pipeline");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
 	"Log all slow ZIOs, not just those with vdevs");