diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 766d582c0b51..1bfa44a3884d 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -1,146 +1,152 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #ifndef _SYS_METASLAB_H
 #define	_SYS_METASLAB_H
 
 #include <sys/spa.h>
 #include <sys/space_map.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 
 typedef struct metaslab_ops {
 	const char *msop_name;
-	uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
+	uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *);
 } metaslab_ops_t;
 
 
 extern const metaslab_ops_t zfs_metaslab_ops;
 
 int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
 void metaslab_set_unflushed_dirty(metaslab_t *, boolean_t);
 void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *);
 void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *);
 boolean_t metaslab_unflushed_dirty(metaslab_t *);
 uint64_t metaslab_unflushed_txg(metaslab_t *);
 uint64_t metaslab_estimated_condensed_size(metaslab_t *);
 int metaslab_sort_by_flushed(const void *, const void *);
 void metaslab_unflushed_bump(metaslab_t *, dmu_tx_t *, boolean_t);
 uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
 
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);
 boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
 
 uint64_t metaslab_allocated_space(metaslab_t *);
 
 void metaslab_sync(metaslab_t *, uint64_t);
 void metaslab_sync_done(metaslab_t *, uint64_t);
 void metaslab_sync_reassess(metaslab_group_t *);
 uint64_t metaslab_largest_allocatable(metaslab_t *);
 
 /*
  * metaslab alloc flags
  */
 #define	METASLAB_ZIL			0x1
 #define	METASLAB_GANG_HEADER		0x2
 #define	METASLAB_GANG_CHILD		0x4
 #define	METASLAB_ASYNC_ALLOC		0x8
 
 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int,
     uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *);
+int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t,
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *,
+    int, const void *, uint64_t *);
 int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
     dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
 void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
 void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
 void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
 void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
 void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
 int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
 int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
 void metaslab_check_free(spa_t *, const blkptr_t *);
 
 void metaslab_stat_init(void);
 void metaslab_stat_fini(void);
+void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *);
 void metaslab_trace_init(zio_alloc_list_t *);
 void metaslab_trace_fini(zio_alloc_list_t *);
 
 metaslab_class_t *metaslab_class_create(spa_t *, const metaslab_ops_t *,
     boolean_t);
 void metaslab_class_destroy(metaslab_class_t *);
 void metaslab_class_validate(metaslab_class_t *);
 void metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync);
 void metaslab_class_histogram_verify(metaslab_class_t *);
 uint64_t metaslab_class_fragmentation(metaslab_class_t *);
 uint64_t metaslab_class_expandable_space(metaslab_class_t *);
 boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, zio_t *,
     boolean_t, boolean_t *);
 boolean_t metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
 void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
 uint64_t metaslab_class_get_alloc(metaslab_class_t *);
 uint64_t metaslab_class_get_space(metaslab_class_t *);
 uint64_t metaslab_class_get_dspace(metaslab_class_t *);
 uint64_t metaslab_class_get_deferred(metaslab_class_t *);
 
 void metaslab_space_update(vdev_t *, metaslab_class_t *,
     int64_t, int64_t, int64_t);
 
 metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
 void metaslab_group_destroy(metaslab_group_t *);
 void metaslab_group_activate(metaslab_group_t *);
 void metaslab_group_passivate(metaslab_group_t *);
 boolean_t metaslab_group_initialized(metaslab_group_t *);
 uint64_t metaslab_group_get_space(metaslab_group_t *);
 void metaslab_group_histogram_verify(metaslab_group_t *);
 uint64_t metaslab_group_fragmentation(metaslab_group_t *);
 void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int,
+    uint64_t, const void *);
 void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t,
     const void *);
 void metaslab_recalculate_weight_and_sort(metaslab_t *);
 void metaslab_disable(metaslab_t *);
 void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
 void metaslab_set_selected_txg(metaslab_t *, uint64_t);
 
 extern int metaslab_debug_load;
 
 zfs_range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev,
     metaslab_t *msp, uint64_t *start, uint64_t *shift);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_METASLAB_H */
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index 5aad22dba6b3..7f457c3a0b76 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -1,235 +1,237 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
 #define	_SYS_VDEV_H
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/metaslab.h>
 #include <sys/fs/zfs.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 typedef enum vdev_dtl_type {
 	DTL_MISSING,	/* 0% replication: no copies of the data */
 	DTL_PARTIAL,	/* less than 100% replication: some copies missing */
 	DTL_SCRUB,	/* unable to fully repair during scrub/resilver */
 	DTL_OUTAGE,	/* temporarily missing (used to attempt detach) */
 	DTL_TYPES	/* count of the DTL types above; keep last */
 } vdev_dtl_type_t;
 
 extern int zfs_nocacheflush;
 
 typedef boolean_t vdev_open_children_func_t(vdev_t *vd);
 
 extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
     __attribute__((format(printf, 2, 3)));
 extern void vdev_dbgmsg_print_tree(vdev_t *, int);
 extern int vdev_open(vdev_t *);
 extern void vdev_open_children(vdev_t *);
 extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *);
 extern int vdev_validate(vdev_t *);
 extern int vdev_copy_path_strict(vdev_t *, vdev_t *);
 extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
 extern void vdev_reopen(vdev_t *);
 extern int vdev_validate_aux(vdev_t *vd);
 extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
 extern boolean_t vdev_is_concrete(vdev_t *vd);
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
 extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done);
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
 extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
     dmu_tx_t *tx);
 extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx);
 extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
     uint64_t size);
 extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
     uint64_t offset, uint64_t size, dmu_tx_t *tx);
 extern boolean_t vdev_replace_in_progress(vdev_t *vdev);
 
 extern void vdev_hold(vdev_t *);
 extern void vdev_rele(vdev_t *);
 
 void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add);
 extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
 extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
 extern void vdev_deadman(vdev_t *vd, const char *tag);
 
 typedef void vdev_xlate_func_t(void *arg, zfs_range_seg64_t *physical_rs);
 
 extern boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs);
 extern void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
 extern void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg);
 
 extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx);
 
 extern metaslab_group_t *vdev_get_mg(vdev_t *vd, metaslab_class_t *mc);
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
 extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 extern boolean_t vdev_children_are_offline(vdev_t *vd);
 
 extern void vdev_space_update(vdev_t *vd,
     int64_t alloc_delta, int64_t defer_delta, int64_t space_delta);
 
 extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
 
+extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize,
+    uint64_t txg);
 extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
     uint64_t txg);
 extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
 
 /*
  * Return the amount of space allocated for a gang block header.  Note that
  * since the physical birth txg is not provided, this must be constant for
  * a given vdev.  (e.g. raidz expansion can't change this)
  */
 static inline uint64_t
 vdev_gang_header_asize(vdev_t *vd)
 {
 	return (vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0));
 }
 
 extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
 extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
     vdev_state_t *);
 extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
 extern int vdev_remove_wanted(spa_t *spa, uint64_t guid);
 extern void vdev_clear(spa_t *spa, vdev_t *vd);
 
 extern boolean_t vdev_is_dead(vdev_t *vd);
 extern boolean_t vdev_readable(vdev_t *vd);
 extern boolean_t vdev_writeable(vdev_t *vd);
 extern boolean_t vdev_allocatable(vdev_t *vd);
 extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
 extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
 
 extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
 extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
 extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
 extern boolean_t vdev_queue_pool_busy(spa_t *spa);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
 extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
 
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
 extern void vdev_defer_resilver(vdev_t *vd);
 extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx);
 
 typedef enum vdev_config_flag {
 	VDEV_CONFIG_SPARE = 1 << 0,
 	VDEV_CONFIG_L2CACHE = 1 << 1,
 	VDEV_CONFIG_MOS = 1 << 2,
 	VDEV_CONFIG_MISSING = 1 << 3
 } vdev_config_flag_t;	/* single-bit values; see vdev_config_generate() */
 
 extern void vdev_post_kobj_evt(vdev_t *vd);
 extern void vdev_clear_kobj_evt(vdev_t *vd);
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
 extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
     boolean_t getstats, vdev_config_flag_t flags);
 
 /*
  * Label routines
  */
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psize, uint64_t offset);
 extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
 extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
 extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
     offset, uint64_t size, zio_done_func_t *done, void *priv, int flags);
 extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
 extern int vdev_uberblock_sync_list(vdev_t **, int, struct uberblock *, int);
 extern int vdev_check_boot_reserve(spa_t *, vdev_t *);
 
 typedef enum {
 	VDEV_LABEL_CREATE,	/* create/add a new device */
 	VDEV_LABEL_REPLACE,	/* replace an existing device */
 	VDEV_LABEL_SPARE,	/* add a new hot spare */
 	VDEV_LABEL_REMOVE,	/* remove an existing device */
 	VDEV_LABEL_L2CACHE,	/* add an L2ARC cache device */
 	VDEV_LABEL_SPLIT	/* generating new label for split-off dev */
 } vdev_labeltype_t;	/* "reason" argument of vdev_label_init() */
 
 extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
 
 extern int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl);
 extern int vdev_prop_get(vdev_t *vd, nvlist_t *nvprops, nvlist_t *outnvl);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_VDEV_H */
diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h
index d44ab6681db9..e923092a39ad 100644
--- a/include/sys/vdev_draid.h
+++ b/include/sys/vdev_draid.h
@@ -1,112 +1,112 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2016, Intel Corporation.
  * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
  */
 
 #ifndef _SYS_VDEV_DRAID_H
 #define	_SYS_VDEV_DRAID_H
 
 #include <sys/types.h>
 #include <sys/abd.h>
 #include <sys/nvpair.h>
 #include <sys/zio.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_raidz_impl.h>
 #include <sys/vdev.h>
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
 /*
  * Constants required to generate and use dRAID permutations.
  */
 #define	VDEV_DRAID_SEED			0xd7a1d5eed
 #define	VDEV_DRAID_MAX_MAPS		254
 #define	VDEV_DRAID_ROWSHIFT		SPA_MAXBLOCKSHIFT
 #define	VDEV_DRAID_ROWHEIGHT		(1ULL << VDEV_DRAID_ROWSHIFT)
 #define	VDEV_DRAID_REFLOW_RESERVE	(2 * VDEV_DRAID_ROWHEIGHT)
 
 /*
  * dRAID permutation map.
  */
 typedef struct draid_map {
 	uint64_t dm_children;	/* # of permutation columns */
 	uint64_t dm_nperms;	/* # of permutation rows */
 	uint64_t dm_seed;	/* dRAID map seed */
 	uint64_t dm_checksum;	/* Checksum of generated map */
 	uint8_t *dm_perms;	/* base permutation array */
 } draid_map_t;
 
 /*
  * dRAID configuration.
  */
 typedef struct vdev_draid_config {
 	/*
 	 * Values read from the dRAID nvlist configuration.
 	 */
 	uint64_t vdc_ndata;		/* # of data devices in group */
 	uint64_t vdc_nparity;		/* # of parity devices in group */
 	uint64_t vdc_nspares;		/* # of distributed spares */
 	uint64_t vdc_children;		/* # of children */
 	uint64_t vdc_ngroups;		/* # groups per slice */
 
 	/*
 	 * Immutable derived constants.
 	 */
 	uint8_t *vdc_perms;		/* permutation array */
 	uint64_t vdc_nperms;		/* # of permutations */
 	uint64_t vdc_groupwidth;	/* = data + parity */
 	uint64_t vdc_ndisks;		/* = children - spares */
 	uint64_t vdc_groupsz;		/* = groupwidth * DRAID_ROWSIZE */
 	uint64_t vdc_devslicesz;	/* = (groupsz * groups) / ndisks */
 } vdev_draid_config_t;
 
 /*
  * Functions for handling dRAID permutation maps.
  */
 extern uint64_t vdev_draid_rand(uint64_t *);
 extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **);
 extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **);
 
 /*
  * General dRAID support functions.
  */
 extern boolean_t vdev_draid_readable(vdev_t *, uint64_t);
 extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t);
-extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t);
+extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t);
 extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *);
 extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *);
 extern nvlist_t *vdev_draid_read_config_spare(vdev_t *);
 
 /* Functions for dRAID distributed spares. */
 extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t);
 extern vdev_t *vdev_draid_spare_get_parent(vdev_t *);
 extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t);
 
 #ifdef  __cplusplus
 }
 #endif
 
 #endif /* _SYS_VDEV_DRAID_H */
diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index a2a3e25d14cc..385d7224f2c5 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -1,668 +1,670 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2023, Klara Inc.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
 #define	_SYS_VDEV_IMPL_H
 
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/metaslab.h>
 #include <sys/nvpair.h>
 #include <sys/space_map.h>
 #include <sys/vdev.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_removal.h>
 #include <sys/zfs_ratelimit.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Virtual device descriptors.
  *
  * All storage pool operations go through the virtual device framework,
  * which provides data replication and I/O scheduling.
  */
 
 /*
  * Forward declarations that lots of things need.
  */
 typedef struct vdev_queue vdev_queue_t;
 struct abd;
 
 /*
  * Virtual device operations
  */
 typedef int	vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd);
 typedef void	vdev_kobj_post_evt_func_t(vdev_t *vd);
 typedef void	vdev_fini_func_t(vdev_t *vd);
 typedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
     uint64_t *ashift, uint64_t *pshift);
 typedef void	vdev_close_func_t(vdev_t *vd);
 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg);
 typedef uint64_t vdev_min_asize_func_t(vdev_t *vd);
 typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd);
 typedef void	vdev_io_start_func_t(zio_t *zio);
 typedef void	vdev_io_done_func_t(zio_t *zio);
 typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
 typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva,
     size_t psize, uint64_t phys_birth);
 typedef void	vdev_hold_func_t(vdev_t *vd);
 typedef void	vdev_rele_func_t(vdev_t *vd);
 
 typedef void	vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd,
     uint64_t offset, uint64_t size, void *arg);
 typedef void	vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
     vdev_remap_cb_t callback, void *arg);
 /*
  * Given a target vdev, translates the logical range "logical" to the
  * physical range "physical".  NOTE(review): "remain" is undocumented.
  */
 typedef void vdev_xlation_func_t(vdev_t *cvd, const zfs_range_seg64_t *logical,
     zfs_range_seg64_t *physical, zfs_range_seg64_t *remain);
 typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start,
     uint64_t size, uint64_t max_segment);
 typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp,
     uint64_t *sizep);
 typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv);
 typedef uint64_t vdev_nparity_func_t(vdev_t *vd);
 typedef uint64_t vdev_ndisks_func_t(vdev_t *vd);
 
 typedef const struct vdev_ops {
 	vdev_init_func_t		*vdev_op_init;
 	vdev_fini_func_t		*vdev_op_fini;
 	vdev_open_func_t		*vdev_op_open;
 	vdev_close_func_t		*vdev_op_close;
-	vdev_asize_func_t		*vdev_op_asize;
+	vdev_asize_func_t		*vdev_op_psize_to_asize;
+	vdev_asize_func_t		*vdev_op_asize_to_psize;
 	vdev_min_asize_func_t		*vdev_op_min_asize;
 	vdev_min_alloc_func_t		*vdev_op_min_alloc;
 	vdev_io_start_func_t		*vdev_op_io_start;
 	vdev_io_done_func_t		*vdev_op_io_done;
 	vdev_state_change_func_t	*vdev_op_state_change;
 	vdev_need_resilver_func_t	*vdev_op_need_resilver;
 	vdev_hold_func_t		*vdev_op_hold;
 	vdev_rele_func_t		*vdev_op_rele;
 	vdev_remap_func_t		*vdev_op_remap;
 	vdev_xlation_func_t		*vdev_op_xlate;
 	vdev_rebuild_asize_func_t	*vdev_op_rebuild_asize;
 	vdev_metaslab_init_func_t	*vdev_op_metaslab_init;
 	vdev_config_generate_func_t	*vdev_op_config_generate;
 	vdev_nparity_func_t		*vdev_op_nparity;
 	vdev_ndisks_func_t		*vdev_op_ndisks;
 	vdev_kobj_post_evt_func_t	*vdev_op_kobj_evt_post;
 	char				vdev_op_type[16];
 	boolean_t			vdev_op_leaf;
 } vdev_ops_t;
 
 /*
  * Virtual device properties
  */
 typedef union vdev_queue_class {
 	struct {	/* list form (anonymous struct) */
 		ulong_t 	vqc_list_numnodes;
 		list_t		vqc_list;
 	};
 	avl_tree_t	vqc_tree;	/* shares storage with the list form */
 } vdev_queue_class_t;
 
 struct vdev_queue {
 	vdev_t		*vq_vdev;
 	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
 	avl_tree_t	vq_read_offset_tree;
 	avl_tree_t	vq_write_offset_tree;
 	uint64_t	vq_last_offset;
 	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
 	uint32_t	vq_cqueued;	/* Classes with queued I/Os. */
 	uint32_t	vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
 	uint32_t	vq_active;	/* Number of active I/Os. */
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	list_t		vq_active_list;	/* List of active I/Os. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
 	kmutex_t	vq_lock;
 };
 
 typedef enum vdev_alloc_bias {
 	VDEV_BIAS_NONE,		/* no allocation bias */
 	VDEV_BIAS_LOG,		/* dedicated to ZIL data (SLOG) */
 	VDEV_BIAS_SPECIAL,	/* dedicated to ddt, metadata, and small blks */
 	VDEV_BIAS_DEDUP		/* dedicated to dedup metadata */
 } vdev_alloc_bias_t;
 
 
 /*
  * On-disk indirect vdev state.
  *
  * An indirect vdev is described exclusively in the MOS config of a pool.
  * The config for an indirect vdev includes several fields, which are
  * accessed in memory by a vdev_indirect_config_t.
  */
 typedef struct vdev_indirect_config {
 	/*
 	 * Object (in MOS) which contains the indirect mapping. This object
 	 * contains an array of vdev_indirect_mapping_entry_phys_t ordered by
 	 * vimep_src. The bonus buffer for this object is a
 	 * vdev_indirect_mapping_phys_t. This object is allocated when a vdev
 	 * removal is initiated.
 	 *
 	 * Note that this object can be empty if none of the data on the vdev
 	 * has been copied yet.
 	 */
 	uint64_t	vic_mapping_object;
 
 	/*
 	 * Object (in MOS) which contains the birth times for the mapping
 	 * entries. This object contains an array of
 	 * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus
 	 * buffer for this object is a vdev_indirect_birth_phys_t. This object
 	 * is allocated when a vdev removal is initiated.
 	 *
 	 * Note that this object can be empty if none of the vdev has yet been
 	 * copied.
 	 */
 	uint64_t	vic_births_object;
 
 	/*
 	 * This is the vdev ID which was removed previous to this vdev, or
 	 * UINT64_MAX if there are no previously removed vdevs.
 	 */
 	uint64_t	vic_prev_indirect_vdev;
 } vdev_indirect_config_t;
 
 /*
  * Virtual device descriptor
  */
 struct vdev {
 	/*
 	 * Common to all vdev types.
 	 */
 	uint64_t	vdev_id;	/* child number in vdev parent	*/
 	uint64_t	vdev_guid;	/* unique ID for this vdev	*/
 	uint64_t	vdev_guid_sum;	/* self guid + all child guids	*/
 	uint64_t	vdev_orig_guid;	/* orig. guid prior to remove	*/
 	uint64_t	vdev_asize;	/* allocatable device capacity	*/
 	uint64_t	vdev_min_asize;	/* min acceptable asize		*/
 	uint64_t	vdev_max_asize;	/* max acceptable asize		*/
 	uint64_t	vdev_ashift;	/* block alignment shift	*/
 
 	/*
 	 * Logical block alignment shift
 	 *
 	 * The smallest sized/aligned I/O supported by the device.
 	 */
 	uint64_t	vdev_logical_ashift;
 	/*
 	 * Physical block alignment shift
 	 *
 	 * The device supports logical I/Os with vdev_logical_ashift
 	 * size/alignment, but optimum performance will be achieved by
 	 * aligning/sizing requests to vdev_physical_ashift.  Smaller
 	 * requests may be inflated or incur device level read-modify-write
 	 * operations.
 	 *
 	 * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
 	 */
 	uint64_t	vdev_physical_ashift;
 	uint64_t	vdev_state;	/* see VDEV_STATE_* #defines	*/
 	uint64_t	vdev_prevstate;	/* used when reopening a vdev	*/
 	vdev_ops_t	*vdev_ops;	/* vdev operations		*/
 	spa_t		*vdev_spa;	/* spa for this vdev		*/
 	void		*vdev_tsd;	/* type-specific data		*/
 	vdev_t		*vdev_top;	/* top-level vdev		*/
 	vdev_t		*vdev_parent;	/* parent vdev			*/
 	vdev_t		**vdev_child;	/* array of children		*/
 	uint64_t	vdev_children;	/* number of children		*/
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 	vdev_stat_ex_t	vdev_stat_ex;	/* extended statistics		*/
 	boolean_t	vdev_expanding;	/* expand the vdev?		*/
 	boolean_t	vdev_reopening;	/* reopen in progress?		*/
 	boolean_t	vdev_nonrot;	/* true if solid state		*/
 	int		vdev_load_error; /* error on last load		*/
 	int		vdev_open_error; /* error on last open		*/
 	int		vdev_validate_error; /* error on last validate	*/
 	kthread_t	*vdev_open_thread; /* thread opening children	*/
 	kthread_t	*vdev_validate_thread; /* thread validating children */
 	uint64_t	vdev_crtxg;	/* txg when top-level was added */
 	uint64_t	vdev_root_zap;
 
 	/*
 	 * Top-level vdev state.
 	 */
 	uint64_t	vdev_ms_array;	/* metaslab array object	*/
 	uint64_t	vdev_ms_shift;	/* metaslab size shift		*/
 	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
 	metaslab_group_t *vdev_mg;	/* metaslab group		*/
 	metaslab_group_t *vdev_log_mg;	/* embedded slog metaslab group	*/
 	metaslab_t	**vdev_ms;	/* metaslab array		*/
 	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
 	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
 	boolean_t	vdev_fault_wanted; /* async faulted wanted?	*/
 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
 	uint64_t	vdev_islog;	/* is an intent log device	*/
 	uint64_t	vdev_noalloc;	/* device is passivated?	*/
 	uint64_t	vdev_removing;	/* device is being removed?	*/
 	uint64_t	vdev_failfast;	/* device failfast setting	*/
 	boolean_t	vdev_rz_expanding; /* raidz is being expanded?	*/
 	boolean_t	vdev_ishole;	/* is a hole in the namespace	*/
 	uint64_t	vdev_top_zap;
 	vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias	*/
 
 	/* pool checkpoint related */
 	space_map_t	*vdev_checkpoint_sm;	/* contains reserved blocks */
 
 	/* Initialize related */
 	boolean_t	vdev_initialize_exit_wanted;
 	vdev_initializing_state_t	vdev_initialize_state;
 	list_node_t	vdev_initialize_node;
 	kthread_t	*vdev_initialize_thread;
 	/* Protects vdev_initialize_thread and vdev_initialize_state. */
 	kmutex_t	vdev_initialize_lock;
 	kcondvar_t	vdev_initialize_cv;
 	uint64_t	vdev_initialize_offset[TXG_SIZE];
 	uint64_t	vdev_initialize_last_offset;
 	/* valid while initializing */
 	zfs_range_tree_t	*vdev_initialize_tree;
 	uint64_t	vdev_initialize_bytes_est;
 	uint64_t	vdev_initialize_bytes_done;
 	uint64_t	vdev_initialize_action_time;	/* start and end time */
 
 	/* TRIM related */
 	boolean_t	vdev_trim_exit_wanted;
 	boolean_t	vdev_autotrim_exit_wanted;
 	vdev_trim_state_t	vdev_trim_state;
 	list_node_t	vdev_trim_node;
 	kmutex_t	vdev_autotrim_lock;
 	kcondvar_t	vdev_autotrim_cv;
 	kcondvar_t	vdev_autotrim_kick_cv;
 	kthread_t	*vdev_autotrim_thread;
 	/* Protects vdev_trim_thread and vdev_trim_state. */
 	kmutex_t	vdev_trim_lock;
 	kcondvar_t	vdev_trim_cv;
 	kthread_t	*vdev_trim_thread;
 	uint64_t	vdev_trim_offset[TXG_SIZE];
 	uint64_t	vdev_trim_last_offset;
 	uint64_t	vdev_trim_bytes_est;
 	uint64_t	vdev_trim_bytes_done;
 	uint64_t	vdev_trim_rate;		/* requested rate (bytes/sec) */
 	uint64_t	vdev_trim_partial;	/* requested partial TRIM */
 	uint64_t	vdev_trim_secure;	/* requested secure TRIM */
 	uint64_t	vdev_trim_action_time;	/* start and end time */
 
 	/* Rebuild related */
 	boolean_t	vdev_rebuilding;
 	boolean_t	vdev_rebuild_exit_wanted;
 	boolean_t	vdev_rebuild_cancel_wanted;
 	boolean_t	vdev_rebuild_reset_wanted;
 	kmutex_t	vdev_rebuild_lock;
 	kcondvar_t	vdev_rebuild_cv;
 	kthread_t	*vdev_rebuild_thread;
 	vdev_rebuild_t	vdev_rebuild_config;
 
 	/* For limiting outstanding I/Os (initialize, TRIM) */
 	kmutex_t	vdev_initialize_io_lock;
 	kcondvar_t	vdev_initialize_io_cv;
 	uint64_t	vdev_initialize_inflight;
 	kmutex_t	vdev_trim_io_lock;
 	kcondvar_t	vdev_trim_io_cv;
 	uint64_t	vdev_trim_inflight[3];
 
 	/*
 	 * Values stored in the config for an indirect or removing vdev.
 	 */
 	vdev_indirect_config_t	vdev_indirect_config;
 
 	/*
 	 * The vdev_indirect_rwlock protects the vdev_indirect_mapping
 	 * pointer from changing on indirect vdevs (when it is condensed).
 	 * Note that removing (not yet indirect) vdevs have different
 	 * access patterns (the mapping is not accessed from open context,
 	 * e.g. from zio_read) and locking strategy (e.g. svr_lock).
 	 */
 	krwlock_t vdev_indirect_rwlock;
 	vdev_indirect_mapping_t *vdev_indirect_mapping;
 	vdev_indirect_births_t *vdev_indirect_births;
 
 	/*
 	 * In memory data structures used to manage the obsolete sm, for
 	 * indirect or removing vdevs.
 	 *
 	 * The vdev_obsolete_segments is the in-core record of the segments
 	 * that are no longer referenced anywhere in the pool (due to
 	 * being freed or remapped and not referenced by any snapshots).
 	 * During a sync, segments are added to vdev_obsolete_segments
 	 * via vdev_indirect_mark_obsolete(); at the end of each sync
 	 * pass, this is appended to vdev_obsolete_sm via
 	 * vdev_indirect_sync_obsolete().  The vdev_obsolete_lock
 	 * protects against concurrent modifications of vdev_obsolete_segments
 	 * from multiple zio threads.
 	 */
 	kmutex_t	vdev_obsolete_lock;
 	zfs_range_tree_t	*vdev_obsolete_segments;
 	space_map_t	*vdev_obsolete_sm;
 
 	/*
 	 * Protects the vdev_scan_io_queue field itself as well as the
 	 * structure's contents (when present).
 	 */
 	kmutex_t			vdev_scan_io_queue_lock;
 	struct dsl_scan_io_queue	*vdev_scan_io_queue;
 
 	/*
 	 * Leaf vdev state.
 	 */
 	zfs_range_tree_t	*vdev_dtl[DTL_TYPES]; /* dirty time logs */
 	space_map_t	*vdev_dtl_sm;	/* dirty time log space map	*/
 	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
 	uint64_t	vdev_dtl_object; /* DTL object			*/
 	uint64_t	vdev_psize;	/* physical device capacity	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
 	uint64_t	vdev_offline;	/* persistent offline state	*/
 	uint64_t	vdev_faulted;	/* persistent faulted state	*/
 	uint64_t	vdev_degraded;	/* persistent degraded state	*/
 	uint64_t	vdev_removed;	/* persistent removed state	*/
 	uint64_t	vdev_resilver_txg; /* persistent resilvering state */
 	uint64_t	vdev_rebuild_txg; /* persistent rebuilding state */
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
 	char		*vdev_physpath;	/* vdev device path (if any)	*/
 	char		*vdev_enc_sysfs_path;	/* enclosure sysfs path */
 	char		*vdev_fru;	/* physical FRU location	*/
 	uint64_t	vdev_not_present; /* not present during import	*/
 	uint64_t	vdev_unspare;	/* unspare when resilvering done */
 	boolean_t	vdev_nowritecache; /* true if flushwritecache failed */
 	boolean_t	vdev_has_trim;	/* TRIM is supported		*/
 	boolean_t	vdev_has_securetrim; /* secure TRIM is supported */
 	boolean_t	vdev_checkremove; /* temporary online test	*/
 	boolean_t	vdev_forcefault; /* force online fault		*/
 	boolean_t	vdev_splitting;	/* split or repair in progress  */
 	boolean_t	vdev_delayed_close; /* delayed device close?	*/
 	boolean_t	vdev_tmpoffline; /* device taken offline temporarily? */
 	boolean_t	vdev_detached;	/* device detached?		*/
 	boolean_t	vdev_cant_read;	/* vdev is failing all reads	*/
 	boolean_t	vdev_cant_write; /* vdev is failing all writes	*/
 	boolean_t	vdev_isspare;	/* was a hot spare		*/
 	boolean_t	vdev_isl2cache;	/* was a l2cache device		*/
 	boolean_t	vdev_copy_uberblocks;  /* post expand copy uberblocks */
 	boolean_t	vdev_resilver_deferred;  /* resilver deferred */
 	boolean_t	vdev_kobj_flag; /* kobj event record */
 	boolean_t	vdev_attaching; /* vdev attach ashift handling */
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
 	uint64_t	vdev_leaf_zap;
 	hrtime_t	vdev_mmp_pending; /* 0 if write finished	*/
 	uint64_t	vdev_mmp_kstat_id;	/* to find kstat entry */
 	uint64_t	vdev_expansion_time;	/* vdev's last expansion time */
 	list_node_t	vdev_leaf_node;		/* leaf vdev list */
 
 	/*
 	 * For DTrace to work in userland (libzpool) context, these fields must
 	 * remain at the end of the structure.  DTrace will use the kernel's
 	 * CTF definition for 'struct vdev', and since the size of a kmutex_t is
 	 * larger in userland, the offsets for the rest of the fields would be
 	 * incorrect.
 	 */
 	kmutex_t	vdev_dtl_lock;	/* vdev_dtl_{map,resilver}	*/
 	kmutex_t	vdev_stat_lock;	/* vdev_stat			*/
 	kmutex_t	vdev_probe_lock; /* protects vdev_probe_zio	*/
 
 	/*
 	 * We rate limit ZIO delay, deadman, and checksum events, since they
 	 * can flood ZED with tons of events when a drive is acting up.
 	 *
 	 * We also rate limit Direct I/O write verify errors, since a user might
 	 * be continually manipulating a buffer that can flood ZED with tons of
 	 * events.
 	 */
 	zfs_ratelimit_t vdev_delay_rl;
 	zfs_ratelimit_t vdev_deadman_rl;
 	zfs_ratelimit_t vdev_dio_verify_rl;
 	zfs_ratelimit_t vdev_checksum_rl;
 
 	/*
 	 * Vdev properties for tuning ZED or zfsd
 	 */
 	uint64_t	vdev_checksum_n;
 	uint64_t	vdev_checksum_t;
 	uint64_t	vdev_io_n;
 	uint64_t	vdev_io_t;
 	uint64_t	vdev_slow_io_n;
 	uint64_t	vdev_slow_io_t;
 };
 
 /*
  * Sizes of the regions that make up an on-disk vdev label; vdev_label_t
  * uses them to size its members.
  */
 #define	VDEV_PAD_SIZE		(8 << 10)
 /*
  * 2 padding areas (vl_pad1 and vl_be) to skip.  The expansion is
  * parenthesized so that the macro stays correct inside larger expressions
  * (e.g. "x / VDEV_SKIP_SIZE").
  */
 #define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
 #define	VDEV_PHYS_SIZE		(112 << 10)
 #define	VDEV_UBERBLOCK_RING	(128 << 10)
 
 /*
  * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
  * ring when MMP is enabled.
  */
 #define	MMP_BLOCKS_PER_LABEL	1
 
 /* The largest uberblock we support is 8k (1 << MAX_UBERBLOCK_SHIFT). */
 #define	MAX_UBERBLOCK_SHIFT (13)
 /*
  * log2 of the uberblock slot size used for vd: the ashift of vd's top-level
  * vdev, bounded below by UBERBLOCK_SHIFT and above by MAX_UBERBLOCK_SHIFT
  * (assuming MIN/MAX have their usual meaning).
  */
 #define	VDEV_UBERBLOCK_SHIFT(vd)	\
 	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
 	    MAX_UBERBLOCK_SHIFT)
 /* Number of slots of that size that fit in the VDEV_UBERBLOCK_RING area. */
 #define	VDEV_UBERBLOCK_COUNT(vd)	\
 	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 /* Byte offset, from the start of a vdev_label_t, of uberblock slot n. */
 #define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
 	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
 /* Size in bytes of a single uberblock slot. */
 #define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 
 /*
  * The vl_vdev_phys region of a vdev label: a byte buffer followed by an
  * embedded checksum trailer.  The buffer is sized so that buffer plus
  * trailer add up to VDEV_PHYS_SIZE bytes (barring compiler padding).
  * NOTE(review): vp_nvlist presumably holds a packed nvlist -- confirm.
  */
 typedef struct vdev_phys {
 	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
 	zio_eck_t	vp_zbt;
 } vdev_phys_t;
 
 typedef enum vbe_vers {
 	/*
 	 * The bootenv file is stored as ascii text in the envblock.
 	 * It is used by the GRUB bootloader used on Linux to store the
 	 * contents of the grubenv file. The file is stored as raw ASCII,
 	 * and is protected by an embedded checksum. By default, GRUB will
 	 * check if the boot filesystem supports storing the environment data
 	 * in a special location, and if so, will invoke filesystem specific
 	 * logic to retrieve it. This can be overridden by a variable, should
 	 * the user so desire.
 	 */
 	VB_RAW = 0,
 
 	/*
 	 * The bootenv file is converted to an nvlist and then packed into the
 	 * envblock.
 	 */
 	VB_NVLIST = 1
 } vbe_vers_t;
 
 /*
  * Boot environment block, embedded in the label as vl_be.  It consists of a
  * version word, the environment payload, and an embedded checksum trailer;
  * the payload is sized so the whole structure is exactly VDEV_PAD_SIZE
  * bytes, which the static assertion below enforces at compile time.
  */
 typedef struct vdev_boot_envblock {
 	/* NOTE(review): presumably a vbe_vers_t value -- confirm. */
 	uint64_t	vbe_version;
 	/* Whatever remains of the pad area after the version and trailer. */
 	char		vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
 			sizeof (zio_eck_t)];
 	zio_eck_t	vbe_zbt;
 } vdev_boot_envblock_t;
 _Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE,
 	"vdev_boot_envblock_t wrong size");
 
 /*
  * Layout of a single vdev label: a pad area, the boot environment block,
  * the vdev_phys_t region and the uberblock ring, in that order.  The sizes
  * noted on each member add up to 256K.
  */
 typedef struct vdev_label {
 	char		vl_pad1[VDEV_PAD_SIZE];			/*  8K */
 	vdev_boot_envblock_t	vl_be;				/*  8K */
 	vdev_phys_t	vl_vdev_phys;				/* 112K	*/
 	char		vl_uberblock[VDEV_UBERBLOCK_RING];	/* 128K	*/
 } vdev_label_t;						/* 256K total */
 
 /*
  * vdev_dirty() flags
  */
 #define	VDD_METASLAB	0x01
 #define	VDD_DTL		0x02
 
 /* Offset of embedded boot loader region on each label */
 #define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
 /*
  * Size of embedded boot loader region on each label.
  * The total size of the first two labels plus the boot area is 4MB.
  * On RAIDZ, this space is overwritten during RAIDZ expansion.
  */
 #define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M */
 
 /*
  * Size of label regions at the start and end of each leaf device.
  * The start region is two labels plus the boot area (VDEV_BOOT_SIZE); the
  * end region is two labels.
  */
 #define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
 #define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
 /* Total number of labels: two in the start region, two in the end region. */
 #define	VDEV_LABELS		4
 /*
  * Equal to VDEV_LABELS.
  * NOTE(review): presumably a sentinel meaning "use the best label" rather
  * than a real label index -- confirm against callers.
  */
 #define	VDEV_BEST_LABEL		VDEV_LABELS
 /*
  * True when byte offset off lies in the start region (labels plus boot
  * area) or at/after the beginning of the end region, which is located by
  * subtracting VDEV_LABEL_END_SIZE from vd->vdev_psize.
  */
 #define	VDEV_OFFSET_IS_LABEL(vd, off)                           \
 	(((off) < VDEV_LABEL_START_SIZE) ||                     \
 	((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE)))
 
 /*
  * Allocation type codes.
  * NOTE(review): presumably the values accepted by the alloctype argument of
  * vdev_alloc() -- confirm against its definition.
  */
 #define	VDEV_ALLOC_LOAD		0
 #define	VDEV_ALLOC_ADD		1
 #define	VDEV_ALLOC_SPARE	2
 #define	VDEV_ALLOC_L2CACHE	3
 #define	VDEV_ALLOC_ROOTPOOL	4
 #define	VDEV_ALLOC_SPLIT	5
 #define	VDEV_ALLOC_ATTACH	6
 
 /*
  * Allocate or free a vdev
  */
 extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
     vdev_ops_t *ops);
 extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
     vdev_t *parent, uint_t id, int alloctype);
 extern void vdev_free(vdev_t *vd);
 
 /*
  * Add or remove children and parents
  */
 extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
 extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
 extern void vdev_compact_children(vdev_t *pvd);
 extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
 extern void vdev_remove_parent(vdev_t *cvd);
 
 /*
  * vdev load and sync
  */
 extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern int vdev_load(vdev_t *vd);
 extern int vdev_dtl_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
 extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
 
 /*
  * Available vdev types.
  */
 extern vdev_ops_t vdev_root_ops;
 extern vdev_ops_t vdev_mirror_ops;
 extern vdev_ops_t vdev_replacing_ops;
 extern vdev_ops_t vdev_raidz_ops;
 extern vdev_ops_t vdev_draid_ops;
 extern vdev_ops_t vdev_draid_spare_ops;
 extern vdev_ops_t vdev_disk_ops;
 extern vdev_ops_t vdev_file_ops;
 extern vdev_ops_t vdev_missing_ops;
 extern vdev_ops_t vdev_hole_ops;
 extern vdev_ops_t vdev_spare_ops;
 extern vdev_ops_t vdev_indirect_ops;
 
 /*
  * Common size functions
  */
 extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
+extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg);
 extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
 extern uint64_t vdev_default_min_asize(vdev_t *vd);
 extern uint64_t vdev_get_min_asize(vdev_t *vd);
 extern void vdev_set_min_asize(vdev_t *vd);
 extern uint64_t vdev_get_min_alloc(vdev_t *vd);
 extern uint64_t vdev_get_nparity(vdev_t *vd);
 extern uint64_t vdev_get_ndisks(vdev_t *vd);
 
 /*
  * Global variables
  */
 extern int zfs_vdev_standard_sm_blksz;
 
 /*
  * Functions from vdev_indirect.c
  */
 extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx);
 extern boolean_t vdev_indirect_should_condense(vdev_t *vd);
 extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx);
 extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj);
 extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise);
 
 /*
  * Other miscellaneous functions
  */
 int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj);
 void vdev_metaslab_group_create(vdev_t *vd);
 uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b);
 #if defined(__linux__)
 int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp);
 #endif
 int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * Vdev ashift optimization tunables
  */
 extern uint_t zfs_vdev_min_auto_ashift;
 extern uint_t zfs_vdev_max_auto_ashift;
 int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS);
 int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * VDEV checksum verification for Direct I/O writes
  */
 extern uint_t zfs_vdev_direct_write_verify;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_VDEV_IMPL_H */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 78adca4d7d00..ea3809ce097b 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -1,737 +1,738 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, 2023, 2024, Klara Inc.
  * Copyright (c) 2019-2020, Michael Niewöhner
  * Copyright (c) 2024 by George Melikov. All rights reserved.
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Embedded checksum
  */
 #define	ZEC_MAGIC	0x210da7ab10c7a11ULL
 
 typedef struct zio_eck {
 	uint64_t	zec_magic;	/* for validation, endianness	*/
 	zio_cksum_t	zec_cksum;	/* 256-bit checksum		*/
 } zio_eck_t;
 
 /*
  * Gang block headers are self-checksumming and contain an array
  * of block pointers.
  */
 /* A gang block header occupies one minimum-sized block. */
 #define	SPA_GANGBLOCKSIZE	SPA_MINBLOCKSIZE
 /*
  * Number of whole block pointers that fit in the header once the embedded
  * checksum trailer has been accounted for.
  */
 #define	SPA_GBH_NBLKPTRS	((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t)) / sizeof (blkptr_t))
 /*
  * Number of uint64_t words left over after the block pointers and the
  * trailer; used to pad the header out to SPA_GANGBLOCKSIZE.
  */
 #define	SPA_GBH_FILLER		((SPA_GANGBLOCKSIZE - \
 	sizeof (zio_eck_t) - \
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
 /*
  * Gang block header: the block pointer array, filler words, and the
  * embedded checksum as the final member.
  */
 typedef struct zio_gbh {
 	blkptr_t		zg_blkptr[SPA_GBH_NBLKPTRS];
 	uint64_t		zg_filler[SPA_GBH_FILLER];
 	zio_eck_t		zg_tail;
 } zio_gbh_phys_t;
 
 enum zio_checksum {
 	ZIO_CHECKSUM_INHERIT = 0,
 	ZIO_CHECKSUM_ON,
 	ZIO_CHECKSUM_OFF,
 	ZIO_CHECKSUM_LABEL,
 	ZIO_CHECKSUM_GANG_HEADER,
 	ZIO_CHECKSUM_ZILOG,
 	ZIO_CHECKSUM_FLETCHER_2,
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
 	ZIO_CHECKSUM_NOPARITY,
 	ZIO_CHECKSUM_SHA512,
 	ZIO_CHECKSUM_SKEIN,
 	ZIO_CHECKSUM_EDONR,
 	ZIO_CHECKSUM_BLAKE3,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
 /*
  * The number of "legacy" checksum functions which can be set on individual
  * objects.
  */
 #define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
 
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
 #define	ZIO_CHECKSUM_MASK	0xffULL
 #define	ZIO_CHECKSUM_VERIFY	(1U << 8)
 
 #define	ZIO_DEDUPCHECKSUM	ZIO_CHECKSUM_SHA256
 
 /* macros defining encryption lengths */
 #define	ZIO_OBJSET_MAC_LEN		32
 #define	ZIO_DATA_IV_LEN			12
 #define	ZIO_DATA_SALT_LEN		8
 #define	ZIO_DATA_MAC_LEN		16
 
 /*
  * The number of "legacy" compression functions which can be set on individual
  * objects.
  */
 #define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
 
 /*
  * The meaning of "compress = on" selected by the compression features enabled
  * on a given pool.
  */
 #define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
 #define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
 
 #define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_ON
 
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
 	(compress) == ZIO_COMPRESS_LZ4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_1 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_2 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_3 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_4 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_5 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_6 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_7 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_8 ||		\
 	(compress) == ZIO_COMPRESS_GZIP_9 ||		\
 	(compress) == ZIO_COMPRESS_ZLE ||		\
 	(compress) == ZIO_COMPRESS_ZSTD ||		\
 	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 
 /*
  * Helpers for a combined compression value: the algorithm lives in the bits
  * selected by SPA_COMPRESSMASK and the level in the bits above
  * SPA_COMPRESSBITS.  Every macro argument is parenthesized so that compound
  * arguments such as "a | b" or "c ? x : y" expand with the intended
  * precedence.
  */
 #define	ZIO_COMPRESS_ALGO(x)	((x) & SPA_COMPRESSMASK)
 #define	ZIO_COMPRESS_LEVEL(x)	(((x) & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS)
 #define	ZIO_COMPRESS_RAW(type, level)	\
 	((type) | ((level) << SPA_COMPRESSBITS))
 
 #define	ZIO_COMPLEVEL_ZSTD(level)	\
 	ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
 typedef enum zio_suspend_reason {
 	ZIO_SUSPEND_NONE = 0,
 	ZIO_SUSPEND_IOERR,
 	ZIO_SUSPEND_MMP,
 } zio_suspend_reason_t;
 
 /*
  * This was originally an enum type. However, those are 32-bit and there is no
  * way to make a 64-bit enum type. Since we ran out of bits for flags, we were
  * forced to upgrade it to a uint64_t.
  *
  * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER
  * FLAG.
  */
 typedef uint64_t zio_flag_t;
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
 	 * and that must be equal for two zios to aggregate
 	 */
 #define	ZIO_FLAG_DONT_AGGREGATE	(1ULL << 0)
 #define	ZIO_FLAG_IO_REPAIR	(1ULL << 1)
 #define	ZIO_FLAG_SELF_HEAL	(1ULL << 2)
 #define	ZIO_FLAG_RESILVER	(1ULL << 3)
 #define	ZIO_FLAG_SCRUB		(1ULL << 4)
 #define	ZIO_FLAG_SCAN_THREAD	(1ULL << 5)
 #define	ZIO_FLAG_PHYSICAL	(1ULL << 6)
 
 	/*
 	 * Mask of every bit below ZIO_FLAG_CANFAIL, i.e. bits 0-6 above.
 	 * The "first flag of the next group, minus one" idiom only works
 	 * while the groups stay contiguous and in this order.
 	 */
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 * Note that bit 11 is not assigned to any flag in this list.
 	 */
 #define	ZIO_FLAG_CANFAIL	(1ULL << 7)	/* must be first for INHERIT */
 #define	ZIO_FLAG_SPECULATIVE	(1ULL << 8)
 #define	ZIO_FLAG_CONFIG_WRITER	(1ULL << 9)
 #define	ZIO_FLAG_DONT_RETRY	(1ULL << 10)
 #define	ZIO_FLAG_NODATA		(1ULL << 12)
 #define	ZIO_FLAG_INDUCE_DAMAGE	(1ULL << 13)
 #define	ZIO_FLAG_IO_ALLOCATING	(1ULL << 14)
 
 	/* Masks of every bit below ZIO_FLAG_IO_RETRY (bits 0-14). */
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 
 	/*
 	 * Flags inherited by vdev children.
 	 */
 #define	ZIO_FLAG_IO_RETRY	(1ULL << 15)	/* must be first for INHERIT */
 #define	ZIO_FLAG_PROBE		(1ULL << 16)
 #define	ZIO_FLAG_TRYHARD	(1ULL << 17)
 #define	ZIO_FLAG_OPTIONAL	(1ULL << 18)
 #define	ZIO_FLAG_DIO_READ	(1ULL << 19)
 	/* Mask of every bit below ZIO_FLAG_DONT_QUEUE (bits 0-19). */
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
 #define	ZIO_FLAG_DONT_QUEUE	(1ULL << 20)	/* must be first for INHERIT */
 #define	ZIO_FLAG_DONT_PROPAGATE	(1ULL << 21)
 #define	ZIO_FLAG_IO_BYPASS	(1ULL << 22)
 #define	ZIO_FLAG_IO_REWRITE	(1ULL << 23)
 #define	ZIO_FLAG_RAW_COMPRESS	(1ULL << 24)
 #define	ZIO_FLAG_RAW_ENCRYPT	(1ULL << 25)
 #define	ZIO_FLAG_GANG_CHILD	(1ULL << 26)
 #define	ZIO_FLAG_DDT_CHILD	(1ULL << 27)
 #define	ZIO_FLAG_GODFATHER	(1ULL << 28)
 #define	ZIO_FLAG_NOPWRITE	(1ULL << 29)
 #define	ZIO_FLAG_REEXECUTED	(1ULL << 30)
 #define	ZIO_FLAG_DELEGATED	(1ULL << 31)
 #define	ZIO_FLAG_DIO_CHKSUM_ERR	(1ULL << 32)
+#define	ZIO_FLAG_PREALLOCATED	(1ULL << 33)
 
 #define	ZIO_ALLOCATOR_NONE	(-1)
 #define	ZIO_HAS_ALLOCATOR(zio)	((zio)->io_allocator != ZIO_ALLOCATOR_NONE)
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
 #define	ZIO_FLAG_RAW	(ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
 
 #define	ZIO_DDT_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) |		\
 	ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_GANG_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) |		\
 	ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
 
 #define	ZIO_VDEV_CHILD_FLAGS(zio)				\
 	(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) |		\
 	ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
 
 /*
  * ZIO_CHILD_BIT(x) is the single-bit mask for child type x;
  * ZIO_CHILD_BIT_IS_SET(val, x) is nonzero when that bit is set in val.
  */
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & ZIO_CHILD_BIT(x))
 
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,
 	ZIO_CHILD_DDT,
 	ZIO_CHILD_LOGICAL,
 	ZIO_CHILD_TYPES
 };
 
 #define	ZIO_CHILD_VDEV_BIT		ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
 #define	ZIO_CHILD_GANG_BIT		ZIO_CHILD_BIT(ZIO_CHILD_GANG)
 #define	ZIO_CHILD_DDT_BIT		ZIO_CHILD_BIT(ZIO_CHILD_DDT)
 #define	ZIO_CHILD_LOGICAL_BIT		ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
 #define	ZIO_CHILD_ALL_BITS					\
 	(ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT |		\
 	ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
 
 enum zio_wait_type {
 	ZIO_WAIT_READY = 0,
 	ZIO_WAIT_DONE,
 	ZIO_WAIT_TYPES
 };
 
 typedef void zio_done_func_t(zio_t *zio);
 
 extern int zio_exclude_metadata;
 extern int zio_dva_throttle_enabled;
 extern const char *const zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
  * identifies any block in the pool.  By convention, the meta-objset (MOS)
  * is objset 0, and the meta-dnode is object 0.  This covers all blocks
  * except root blocks and ZIL blocks, which are defined as follows:
  *
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
  * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
  * Note: this structure is passed between userland and the kernel, and is
  * stored on disk (by virtue of being incorporated into other on-disk
  * structures, e.g. dsl_scan_phys_t).
  *
  * If the head_errlog feature is enabled a different on-disk format for error
  * logs is used. This introduces the use of an error bookmark, a four-tuple
  * <object, level, blkid, birth> that uniquely identifies any error block
  * in the pool. The birth transaction group is used to track whether the block
  * has been overwritten by newer data or added to a snapshot since its marking
  * as an error.
  */
 struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 };
 
 struct zbookmark_err_phys {
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
 	uint64_t	zb_birth;
 };
 
 /*
  * Fill in all four fields of the bookmark pointed to by zb.  Note that zb
  * is evaluated once per field.  The body is wrapped in do { } while (0) so
  * that the macro expands to exactly one statement and constructs such as
  * "if (c) SET_BOOKMARK(...); else ..." parse as intended; the invocation
  * must be terminated with a semicolon.
  */
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
 do {							\
 	(zb)->zb_objset = (objset);			\
 	(zb)->zb_object = (object);			\
 	(zb)->zb_level = (level);			\
 	(zb)->zb_blkid = (blkid);			\
 } while (0)
 
 #define	ZB_DESTROYED_OBJSET	(-1ULL)
 
 #define	ZB_ROOT_OBJECT		(0ULL)
 #define	ZB_ROOT_LEVEL		(-1LL)
 #define	ZB_ROOT_BLKID		(0ULL)
 
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
 #define	ZB_DNODE_LEVEL		(-3LL)
 #define	ZB_DNODE_BLKID		(0ULL)
 
 #define	ZB_IS_ZERO(zb)						\
 	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
 	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
 #define	ZB_IS_ROOT(zb)				\
 	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
 	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
 	(zb)->zb_blkid == ZB_ROOT_BLKID)
 
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	uint8_t			zp_complevel;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
 	uint8_t			zp_gang_copies;
 	dmu_object_type_t	zp_type;
 	boolean_t		zp_dedup;
 	boolean_t		zp_dedup_verify;
 	boolean_t		zp_nopwrite;
 	boolean_t		zp_brtwrite;
 	boolean_t		zp_encrypt;
 	boolean_t		zp_byteorder;
 	boolean_t		zp_direct_write;
 	uint8_t			zp_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t			zp_mac[ZIO_DATA_MAC_LEN];
 	uint32_t		zp_zpl_smallblk;
 	dmu_object_type_t	zp_storage_type;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
 
 typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
     const abd_t *good_data);
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
 struct dnode_phys;
 struct abd;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
 	nvlist_t		*zcr_ereport;
 	nvlist_t		*zcr_detector;
 	void			*zcr_cbdata;
 	size_t			zcr_cbinfo;	/* passed to zcr_free() */
 	uint64_t		zcr_sector;
 	uint64_t		zcr_align;
 	uint64_t		zcr_length;
 	zio_cksum_finish_f	*zcr_finish;
 	zio_cksum_free_f	*zcr_free;
 
 	/* internal use only */
 	struct zio_bad_cksum	*zcr_ckinfo;	/* information from failure */
 };
 
 typedef struct zio_vsd_ops {
 	zio_done_func_t		*vsd_free;
 } zio_vsd_ops_t;
 
 typedef struct zio_gang_node {
 	zio_gbh_phys_t		*gn_gbh;
 	struct zio_gang_node	*gn_child[SPA_GBH_NBLKPTRS];
 } zio_gang_node_t;
 
 typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
     zio_gang_node_t *gn, struct abd *data, uint64_t offset);
 
 typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
 
 typedef struct zio_transform {
 	struct abd		*zt_orig_abd;
 	uint64_t		zt_orig_size;
 	uint64_t		zt_bufsize;
 	zio_transform_func_t	*zt_transform;
 	struct zio_transform	*zt_next;
 } zio_transform_t;
 
 typedef zio_t *zio_pipe_stage_t(zio_t *zio);
 
 /*
  * The io_reexecute flags are distinct from io_flags because the child must
  * be able to propagate them to the parent.  The normal io_flags are local
  * to the zio, not protected by any lock, and not modifiable by children;
  * the reexecute flags are protected by io_lock, modifiable by children,
  * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
  */
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
 /*
  * The io_trim flags are used to specify the type of TRIM to perform.  They
  * only apply to ZIO_TYPE_TRIM zios and are distinct from io_flags.
  */
 enum trim_flag {
 	ZIO_TRIM_SECURE		= 1U << 0,
 };
 
 typedef struct zio_alloc_list {
 	list_t  zal_list;
 	uint64_t zal_size;
 } zio_alloc_list_t;
 
 typedef struct zio_link {
 	zio_t		*zl_parent;
 	zio_t		*zl_child;
 	list_node_t	zl_parent_node;
 	list_node_t	zl_child_node;
 } zio_link_t;
 
 enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
 };
 
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	*io_bp_override;
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_children_ready;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
 	blkptr_t	io_bp_orig;
 	/* io_lsize != io_orig_size iff this is a raw write */
 	uint64_t	io_lsize;
 
 	/* Data represented by this I/O */
 	struct abd	*io_abd;
 	struct abd	*io_orig_abd;
 	uint64_t	io_size;
 	uint64_t	io_orig_size;
 
 	/* Stuff for the vdev stack */
 	vdev_t		*io_vd;
 	void		*io_vsd;
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
 	enum zio_qstate	io_queue_state;	/* vdev queue state */
 	union {
 		list_node_t l;
 		avl_node_t a;
 	} io_queue_node ____cacheline_aligned;	/* allocator and vdev queues */
 	avl_node_t	io_offset_node;	/* vdev offset queues */
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
 	hrtime_t	io_target_timestamp;
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
 	zio_alloc_list_t 	io_alloc_list;
 
 	/* Internal pipeline state */
 	zio_flag_t	io_flags;
 	enum zio_stage	io_stage;
 	enum zio_stage	io_pipeline;
 	zio_flag_t	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
 	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
 	void		*io_executor;
 	void		*io_waiter;
 	void		*io_bio;
 	kmutex_t	io_lock;
 	kcondvar_t	io_cv;
 	int		io_allocator;
 
 	/* FMA state */
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
 
 	/* Taskq dispatching state */
 	taskq_ent_t	io_tqent;
 };
 
 enum blk_verify_flag {
 	BLK_VERIFY_ONLY,
 	BLK_VERIFY_LOG,
 	BLK_VERIFY_HALT
 };
 
 enum blk_config_flag {
 	BLK_CONFIG_HELD,   // SCL_VDEV held for writer
 	BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
 	BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader
 	BLK_CONFIG_SKIP,   // skip checks which require SCL_VDEV
 };
 
 extern int zio_bookmark_compare(const void *, const void *);
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern void zio_destroy(zio_t *zio);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     struct abd *data, uint64_t size, zio_done_func_t *done, void *priv,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
     int gang_copies, boolean_t nopwrite, boolean_t brtwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
 extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
     zio_done_func_t *done, void *priv, zio_flag_t flags);
 
 extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, struct abd *data, int checksum,
     zio_done_func_t *done, void *priv, zio_priority_t priority,
     zio_flag_t flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, zio_flag_t flags);
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern size_t zio_get_compression_max_size(enum zio_compress compress,
     uint64_t gcd_alloc, uint64_t min_alloc, size_t s_len);
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(void *zio);
 extern void zio_interrupt(void *zio);
 extern void zio_delay_init(zio_t *zio);
 extern void zio_delay_interrupt(zio_t *zio);
 extern void zio_deadman(zio_t *zio, const char *tag);
 
 extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 extern void zio_add_child_first(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
 extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
     uint64_t bufsize, zio_transform_func_t *transform);
 extern void zio_pop_transforms(zio_t *zio);
 
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
     uint64_t offset, struct abd *data, uint64_t size, int type,
     zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *priv);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
     struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *priv);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
 extern void zio_vdev_io_redone(zio_t *zio);
 
 extern void zio_change_priority(zio_t *pio, zio_priority_t priority);
 
 extern void zio_checksum_verified(zio_t *zio);
 extern void zio_dio_chksum_verify_error_report(zio_t *zio);
 extern int zio_worst_error(int e1, int e2);
 
 extern enum zio_checksum zio_checksum_select(enum zio_checksum child,
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
 extern enum zio_compress zio_compress_select(spa_t *spa,
     enum zio_compress child, enum zio_compress parent);
 extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress,
     uint8_t child, uint8_t parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t);
 extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
 
 /*
  * Initial setup and teardown.
  */
 extern void zio_init(void);
 extern void zio_fini(void);
 
 /*
  * Fault injection
  */
 struct zinject_record;
 extern uint32_t zio_injection_enabled;
 extern int zio_inject_fault(char *name, int flags, int *id,
     struct zinject_record *record);
 extern int zio_inject_list_next(int *id, char *name, size_t buflen,
     struct zinject_record *record);
 extern int zio_clear_fault(int id);
 extern void zio_handle_panic_injection(spa_t *spa, const char *tag,
     uint64_t type);
 extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
     uint64_t type, int error);
 extern int zio_handle_fault_injection(zio_t *zio, int error);
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1,
     int err2);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
 extern hrtime_t zio_handle_io_delay(zio_t *zio);
 extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed);
 extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed);
 
 /*
  * Checksum ereport functions
  */
 extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, struct zio_bad_cksum *info);
 extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report,
     const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical);
 
 extern void zfs_ereport_free_checksum(zio_cksum_report_t *report);
 
 /* If we have the good data in hand, this function can be used */
 extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset,
     uint64_t length, const abd_t *good_data, const abd_t *bad_data,
     struct zio_bad_cksum *info);
 
 void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr);
 extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa,
     const char *name);
 
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
 /* zbookmark_phys functions */
 boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
 int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
     uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _ZIO_H */
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 3da8976cae17..68aa2f2cb0c7 100644
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -1,5702 +1,5702 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2012 DEY Storage Systems, Inc.  All rights reserved.
  * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  * Copyright 2017-2018 RackTop Systems.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>
  * Copyright (c) 2021 Matt Fiddaman
  */
 
 #include <ctype.h>
 #include <errno.h>
 #include <libintl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
 #include <stddef.h>
 #include <zone.h>
 #include <fcntl.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <pwd.h>
 #include <grp.h>
 #ifdef HAVE_IDMAP
 #include <idmap.h>
 #include <aclutils.h>
 #include <directory.h>
 #endif /* HAVE_IDMAP */
 
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/dsl_crypt.h>
 #include <libzfs.h>
 #include <libzutil.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 #include "zfs_deleg.h"
 
 static __thread struct passwd gpwd;
 static __thread struct group ggrp;
 static __thread char rpbuf[2048];
 
 static int userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
 
 /*
  * Translate one zfs_type_t value (a single type, not a mask of several) into
  * its localized, human-readable name.  An unrecognized value trips an
  * assertion; should assertions be compiled out, NULL is returned.
  */
 const char *
 zfs_type_to_name(zfs_type_t type)
 {
 	const char *label = NULL;
 
 	if (type == ZFS_TYPE_FILESYSTEM)
 		label = dgettext(TEXT_DOMAIN, "filesystem");
 	else if (type == ZFS_TYPE_SNAPSHOT)
 		label = dgettext(TEXT_DOMAIN, "snapshot");
 	else if (type == ZFS_TYPE_VOLUME)
 		label = dgettext(TEXT_DOMAIN, "volume");
 	else if (type == ZFS_TYPE_POOL)
 		label = dgettext(TEXT_DOMAIN, "pool");
 	else if (type == ZFS_TYPE_BOOKMARK)
 		label = dgettext(TEXT_DOMAIN, "bookmark");
 	else
 		assert(!"unhandled zfs_type_t");
 
 	return (label);
 }
 
 /*
  * Check that 'path' is an acceptable name for the given dataset type(s).
  * This runs before any attempt to open the dataset so that a precise reason
  * can be recorded through zfs_error_aux(); when 'hdl' is NULL no reason is
  * recorded.
  *
  * Returns -1 if the name is acceptable and 0 if it is not.
  */
 int
 zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
     boolean_t modifying)
 {
 	const char *snap_delim = strchr(path, '@');
 	const char *bmark_delim = strchr(path, '#');
 	namecheck_err_t why;
 	char what;
 
 	/* '@' is only allowed when snapshots are among the accepted types. */
 	if (snap_delim != NULL && !(type & ZFS_TYPE_SNAPSHOT)) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "snapshot delimiter '@' is not expected here"));
 		return (0);
 	}
 
 	/* A name that must be a snapshot has to contain '@'. */
 	if (snap_delim == NULL && type == ZFS_TYPE_SNAPSHOT) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing '@' delimiter in snapshot name"));
 		return (0);
 	}
 
 	/* '#' is only allowed when bookmarks are among the accepted types. */
 	if (bmark_delim != NULL && !(type & ZFS_TYPE_BOOKMARK)) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "bookmark delimiter '#' is not expected here"));
 		return (0);
 	}
 
 	/* A name that must be a bookmark has to contain '#'. */
 	if (bmark_delim == NULL && type == ZFS_TYPE_BOOKMARK) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing '#' delimiter in bookmark name"));
 		return (0);
 	}
 
 	/* '%' is rejected only when the caller intends to modify. */
 	if (modifying && strchr(path, '%') != NULL) {
 		if (hdl != NULL)
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid character %c in name"), '%');
 		return (0);
 	}
 
 	/* Everything else is delegated to the generic name checker. */
 	if (entity_namecheck(path, &why, &what) == 0)
 		return (-1);
 
 	/* The name is bad; without a handle there is nowhere to report it. */
 	if (hdl == NULL)
 		return (0);
 
 	switch (why) {
 	case NAME_ERR_TOOLONG:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "name is too long"));
 		break;
 
 	case NAME_ERR_LEADING_SLASH:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "leading slash in name"));
 		break;
 
 	case NAME_ERR_EMPTY_COMPONENT:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "empty component or misplaced '@'"
 		    " or '#' delimiter in name"));
 		break;
 
 	case NAME_ERR_TRAILING_SLASH:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "trailing slash in name"));
 		break;
 
 	case NAME_ERR_INVALCHAR:
 		zfs_error_aux(hdl,
 		    dgettext(TEXT_DOMAIN, "invalid character "
 		    "'%c' in name"), what);
 		break;
 
 	case NAME_ERR_MULTIPLE_DELIMITERS:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "multiple '@' and/or '#' delimiters in "
 		    "name"));
 		break;
 
 	case NAME_ERR_NOLETTER:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool doesn't begin with a letter"));
 		break;
 
 	case NAME_ERR_RESERVED:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "name is reserved"));
 		break;
 
 	case NAME_ERR_DISKLIKE:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "reserved disk name"));
 		break;
 
 	case NAME_ERR_SELF_REF:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "self reference, '.' is found in name"));
 		break;
 
 	case NAME_ERR_PARENT_REF:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "parent reference, '..' is found in name"));
 		break;
 
 	default:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "(%d) not defined"), why);
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Quiet name check: no libzfs handle is supplied, so no error detail is
  * recorded.  Pool names go to zpool_name_valid(); every other type goes to
  * zfs_validate_name() in non-modifying mode, and that function's result is
  * returned unchanged.
  */
 int
 zfs_name_valid(const char *name, zfs_type_t type)
 {
 	return (type == ZFS_TYPE_POOL ?
 	    zpool_name_valid(NULL, B_FALSE, name) :
 	    zfs_validate_name(NULL, name, type, B_FALSE));
 }
 
 /*
  * Build a new nvlist holding only the user-defined properties found in the
  * raw DSL property list 'props'.  Returns the new list (owned by the caller),
  * or NULL after reporting out-of-memory on the handle.
  */
 static nvlist_t *
 process_user_props(zfs_handle_t *zhp, nvlist_t *props)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *userprops;
 	nvpair_t *pair;
 
 	if (nvlist_alloc(&userprops, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	for (pair = nvlist_next_nvpair(props, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(props, pair)) {
 		/* Anything that is not a user property is left out. */
 		if (!zfs_prop_user(nvpair_name(pair)))
 			continue;
 
 		nvlist_t *value = fnvpair_value_nvlist(pair);
 		if (nvlist_add_nvlist(userprops, nvpair_name(pair),
 		    value) != 0) {
 			nvlist_free(userprops);
 			(void) no_memory(hdl);
 			return (NULL);
 		}
 	}
 
 	return (userprops);
 }
 
 /*
  * Open the pool 'pool_name' and, if that succeeds, push the new pool handle
  * onto the front of the library handle's list of pool handles.  Returns the
  * pool handle, or NULL if the pool could not be opened.
  */
 static zpool_handle_t *
 zpool_add_handle(zfs_handle_t *zhp, const char *pool_name)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zpool_handle_t *opened = zpool_open_canfail(hdl, pool_name);
 
 	if (opened == NULL)
 		return (NULL);
 
 	/* Link in front of the current head, when there is one. */
 	if (hdl->libzfs_pool_handles != NULL)
 		opened->zpool_next = hdl->libzfs_pool_handles;
 	hdl->libzfs_pool_handles = opened;
 
 	return (opened);
 }
 
 /*
  * Walk the library handle's list of pool handles looking for one whose name
  * matches 'pool_name' over the first 'len' bytes.  Returns the matching
  * handle, or NULL when the list holds none.
  */
 static zpool_handle_t *
 zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len)
 {
 	zpool_handle_t *cur;
 
 	for (cur = zhp->zfs_hdl->libzfs_pool_handles; cur != NULL;
 	    cur = cur->zpool_next) {
 		if (strncmp(pool_name, zpool_get_name(cur), len) == 0)
 			break;
 	}
 	return (cur);
 }
 
 /*
  * Return a handle to the pool containing the dataset behind 'zhp'.  The pool
  * name is the part of the dataset name before the first '/', '@' or '#'.
  * A handle already on the library's list is reused; otherwise a new one is
  * opened and added to that list.
  */
 static zpool_handle_t *
 zpool_handle(zfs_handle_t *zhp)
 {
 	/* Length of the pool component plus room for the terminating NUL. */
 	int namelen = strcspn(zhp->zfs_name, "/@#") + 1;
 	char *poolname = zfs_alloc(zhp->zfs_hdl, namelen);
 	zpool_handle_t *pool;
 
 	(void) strlcpy(poolname, zhp->zfs_name, namelen);
 
 	pool = zpool_find_handle(zhp, poolname, namelen);
 	if (pool == NULL)
 		pool = zpool_add_handle(zhp, poolname);
 
 	free(poolname);
 	return (pool);
 }
 
 /*
  * Close every pool handle on the library handle's list and leave the list
  * empty.
  */
 void
 zpool_free_handles(libzfs_handle_t *hdl)
 {
 	zpool_handle_t *cur, *following;
 
 	for (cur = hdl->libzfs_pool_handles; cur != NULL; cur = following) {
 		/* Grab the link before the node is closed. */
 		following = cur->zpool_next;
 		zpool_close(cur);
 	}
 	hdl->libzfs_pool_handles = NULL;
 }
 
 /*
  * Issue ZFS_IOC_OBJSET_STATS for the dataset named in 'zhp', using the
  * caller-provided command structure.  While the ioctl fails with ENOMEM the
  * destination nvlist buffer is expanded and the call retried.  Returns 0 on
  * success and -1 on any other failure.
  */
 static int
 get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 
 	(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
 
 	for (;;) {
 		if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) == 0)
 			return (0);
 		if (errno != ENOMEM)
 			return (-1);
 		zcmd_expand_dst_nvlist(hdl, zc);
 	}
 }
 
 /*
  * Fetch the received properties of the dataset behind 'zhp' via
  * ZFS_IOC_OBJSET_RECVD_PROPS and store them in zhp->zfs_recvd_props,
  * replacing whatever list was there.  Returns 0 on success, -1 on failure
  * (in which case the handle's existing list is left alone).
  */
 static int
 get_recvd_props_ioctl(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = {"\0"};
 	nvlist_t *received;
 	int rc;
 
 	zcmd_alloc_dst_nvlist(hdl, &zc, 0);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	/* Retry with a bigger destination buffer for as long as we get ENOMEM. */
 	while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) {
 		if (errno != ENOMEM) {
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 		zcmd_expand_dst_nvlist(hdl, &zc);
 	}
 
 	rc = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &received);
 	zcmd_free_nvlists(&zc);
 	if (rc != 0)
 		return (-1);
 
 	nvlist_free(zhp->zfs_recvd_props);
 	zhp->zfs_recvd_props = received;
 
 	return (0);
 }
 
 /*
  * Copy the results of a completed stats ioctl from 'zc' into the handle:
  * the objset stats structure, the full property nvlist, and a separate
  * nvlist of just the user properties.  The handle's previous property lists
  * are freed only once both replacements exist.  Returns 0 on success and -1
  * on failure.
  */
 static int
 put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
 	nvlist_t *allprops;
 	nvlist_t *userprops;
 
 	zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */
 
 	if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0)
 		return (-1);
 
 	/*
 	 * XXX Why do we store the user props separately, in addition to
 	 * storing them in zfs_props?
 	 */
 	userprops = process_user_props(zhp, allprops);
 	if (userprops == NULL) {
 		nvlist_free(allprops);
 		return (-1);
 	}
 
 	/* Swap the old lists for the new ones. */
 	nvlist_free(zhp->zfs_props);
 	zhp->zfs_props = allprops;
 
 	nvlist_free(zhp->zfs_user_props);
 	zhp->zfs_user_props = userprops;
 
 	return (0);
 }
 
 /*
  * Fetch fresh stats for the dataset behind 'zhp' and store them in the
  * handle.  Returns 0 on success, or -1 if either the ioctl or the copy into
  * the handle fails.
  */
 static int
 get_stats(zfs_handle_t *zhp)
 {
 	zfs_cmd_t zc = {"\0"};
 	int rc;
 
 	zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0);
 
 	/* The copy into the handle is attempted only if the ioctl worked. */
 	rc = (get_stats_ioctl(zhp, &zc) != 0 ||
 	    put_stats_zhdl(zhp, &zc) != 0) ? -1 : 0;
 
 	zcmd_free_nvlists(&zc);
 	return (rc);
 }
 
 /*
  * Refresh the properties currently stored in the handle by re-running
  * get_stats().  A failure is deliberately ignored: the function returns
  * nothing, so callers cannot tell whether the refresh succeeded.
  */
 void
 zfs_refresh_properties(zfs_handle_t *zhp)
 {
 	(void) get_stats(zhp);
 }
 
 /*
  * Shared tail of the make_dataset_handle*() constructors.  Stores the stats
  * and properties carried by 'zc' in the handle, derives the head type and
  * the handle's own type from the objset stats, and attaches the pool handle.
  *
  * Returns 0 on success.  Returns -1 if the stats cannot be stored, if the
  * objset type is DMU_OST_OTHER (errno = EINVAL), if the type is unknown and
  * the dataset is inconsistent (errno = EBUSY), or if no pool handle can be
  * obtained.  An unknown type on a consistent dataset aborts.
  */
 static int
 make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc)
 {
 	if (put_stats_zhdl(zhp, zc) != 0)
 		return (-1);
 
 	/*
 	 * We've managed to open the dataset and gather statistics.  Determine
 	 * the high-level type.
 	 */
 	switch (zhp->zfs_dmustats.dds_type) {
 	case DMU_OST_ZVOL:
 		zhp->zfs_head_type = ZFS_TYPE_VOLUME;
 		break;
 	case DMU_OST_ZFS:
 		zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM;
 		break;
 	case DMU_OST_OTHER:
 		errno = EINVAL;
 		return (-1);
 	default:
 		if (zhp->zfs_dmustats.dds_inconsistent) {
 			errno = EBUSY;
 			return (-1);
 		}
 		abort();
 	}
 
 	/*
 	 * Only ZVOL and ZFS objsets get this far, so a non-snapshot handle has
 	 * the same type as its head.
 	 */
 	if (zhp->zfs_dmustats.dds_is_snapshot)
 		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	else
 		zhp->zfs_type = zhp->zfs_head_type;
 
 	zhp->zpool_hdl = zpool_handle(zhp);
 	if (zhp->zpool_hdl == NULL)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * Build a handle for the dataset named 'path': fetch its objset stats with
  * get_stats_ioctl() and let make_dataset_handle_common() fill in the
  * properties, types and pool handle.
  *
  * Returns the new handle, or NULL if allocation, the stats ioctl, or the
  * common setup fails.
  */
 zfs_handle_t *
 make_dataset_handle(libzfs_handle_t *hdl, const char *path)
 {
 	zfs_cmd_t zc = {"\0"};
 
 	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
 
 	if (zhp == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = hdl;
 	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
 	zcmd_alloc_dst_nvlist(hdl, &zc, 0);
 
 	if (get_stats_ioctl(zhp, &zc) == -1) {
 		/* Nothing has been attached to the handle yet. */
 		zcmd_free_nvlists(&zc);
 		free(zhp);
 		return (NULL);
 	}
 	if (make_dataset_handle_common(zhp, &zc) == -1) {
 		/*
 		 * The common setup can fail after put_stats_zhdl() has
 		 * already attached property nvlists to the handle, so
 		 * release those together with the handle rather than
 		 * leaking them with a bare free().
 		 */
 		zfs_close(zhp);
 		zhp = NULL;
 	}
 	zcmd_free_nvlists(&zc);
 	return (zhp);
 }
 
 /*
  * Build a handle from a command structure that already holds the results of
  * a stats ioctl; the dataset name is taken from zc->zc_name.  'zc' remains
  * owned by the caller.
  *
  * Returns the new handle, or NULL if allocation or the common setup fails.
  */
 zfs_handle_t *
 make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
 {
 	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
 
 	if (zhp == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = hdl;
 	(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
 	if (make_dataset_handle_common(zhp, zc) == -1) {
 		/*
 		 * put_stats_zhdl() may already have attached property
 		 * nvlists before the failure; zfs_close() frees them along
 		 * with the handle, where a bare free() would leak them.
 		 */
 		zfs_close(zhp);
 		return (NULL);
 	}
 	return (zhp);
 }
 
 /*
  * Build a child handle for the dataset named in zc->zc_name, borrowing the
  * library handle from the parent 'pzhp'.  The handle starts out typed as a
  * snapshot whose head type is the parent's type.  If 'zc' already carries
  * objset stats (non-zero creation txg) they are copied as-is; otherwise the
  * stats are fetched and the full common setup is run.  Finally the handle
  * type is settled from the stats and from whether the name contains '@'.
  *
  * Returns the new handle or NULL.  On the fetch/setup failure paths the
  * nvlists held by 'zc' are freed here.
  */
 zfs_handle_t *
 make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc)
 {
 	zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t));
 
 	if (zhp == NULL)
 		return (NULL);
 
 	zhp->zfs_hdl = pzhp->zfs_hdl;
 	(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
 	zhp->zfs_head_type = pzhp->zfs_type;
 	zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	zhp->zpool_hdl = zpool_handle(zhp);
 
 	if (zc->zc_objset_stats.dds_creation_txg != 0) {
 		/* structure assignment */
 		zhp->zfs_dmustats = zc->zc_objset_stats;
 	} else {
 		if (get_stats_ioctl(zhp, zc) == -1) {
 			/* Nothing has been attached to the handle yet. */
 			zcmd_free_nvlists(zc);
 			free(zhp);
 			return (NULL);
 		}
 		if (make_dataset_handle_common(zhp, zc) == -1) {
 			/*
 			 * put_stats_zhdl() may already have attached
 			 * property nvlists to the handle; zfs_close() frees
 			 * them with the handle instead of leaking them.
 			 */
 			zcmd_free_nvlists(zc);
 			zfs_close(zhp);
 			return (NULL);
 		}
 	}
 
 	if (zhp->zfs_dmustats.dds_is_snapshot ||
 	    strchr(zc->zc_name, '@') != NULL)
 		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL)
 		zhp->zfs_type = ZFS_TYPE_VOLUME;
 	else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS)
 		zhp->zfs_type = ZFS_TYPE_FILESYSTEM;
 
 	return (zhp);
 }
 
 /*
  * Create an independent copy of 'zhp_orig'.  Scalar fields and the borrowed
  * library/pool handle pointers are copied directly; the three property
  * nvlists and the mount options string are duplicated so the copy can be
  * closed on its own.  Returns NULL if the handle cannot be allocated or a
  * property list cannot be duplicated.
  */
 zfs_handle_t *
 zfs_handle_dup(zfs_handle_t *zhp_orig)
 {
 	zfs_handle_t *copy = calloc(1, sizeof (zfs_handle_t));
 
 	if (copy == NULL)
 		return (NULL);
 
 	copy->zfs_hdl = zhp_orig->zfs_hdl;
 	copy->zpool_hdl = zhp_orig->zpool_hdl;
 	(void) strlcpy(copy->zfs_name, zhp_orig->zfs_name,
 	    sizeof (copy->zfs_name));
 	copy->zfs_type = zhp_orig->zfs_type;
 	copy->zfs_head_type = zhp_orig->zfs_head_type;
 	copy->zfs_dmustats = zhp_orig->zfs_dmustats;
 
 	if (zhp_orig->zfs_props != NULL &&
 	    nvlist_dup(zhp_orig->zfs_props, &copy->zfs_props, 0) != 0)
 		goto nomem;
 
 	if (zhp_orig->zfs_user_props != NULL &&
 	    nvlist_dup(zhp_orig->zfs_user_props,
 	    &copy->zfs_user_props, 0) != 0)
 		goto nomem;
 
 	if (zhp_orig->zfs_recvd_props != NULL &&
 	    nvlist_dup(zhp_orig->zfs_recvd_props,
 	    &copy->zfs_recvd_props, 0) != 0)
 		goto nomem;
 
 	copy->zfs_mntcheck = zhp_orig->zfs_mntcheck;
 	if (zhp_orig->zfs_mntopts != NULL) {
 		copy->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl,
 		    zhp_orig->zfs_mntopts);
 	}
 	copy->zfs_props_table = zhp_orig->zfs_props_table;
 	return (copy);
 
 nomem:
 	/* zfs_close() releases whatever was duplicated so far. */
 	(void) no_memory(copy->zfs_hdl);
 	zfs_close(copy);
 	return (NULL);
 }
 
 /*
  * Report whether the bookmark named by 'path' ("dataset#bookmark") is present
  * in the bookmark list that lzc_get_bookmarks() returns for its dataset.
  * Returns B_FALSE if 'path' (as copied into a local buffer) contains no '#',
  * or if the bookmark list cannot be retrieved.
  */
 boolean_t
 zfs_bookmark_exists(const char *path)
 {
 	/*
 	 * Start out NULL so that the error path below never hands
 	 * nvlist_free() an indeterminate pointer, whatever the callee did
 	 * with its output argument before failing.
 	 */
 	nvlist_t *bmarks = NULL;
 	nvlist_t *props;
 	char fsname[ZFS_MAX_DATASET_NAME_LEN];
 	char *bmark_name;
 	char *pound;
 	int err;
 	boolean_t rv;
 
 	(void) strlcpy(fsname, path, sizeof (fsname));
 	pound = strchr(fsname, '#');
 	if (pound == NULL)
 		return (B_FALSE);
 
 	/* Split the copy in place: dataset name before '#', bookmark after. */
 	*pound = '\0';
 	bmark_name = pound + 1;
 	props = fnvlist_alloc();
 	err = lzc_get_bookmarks(fsname, props, &bmarks);
 	nvlist_free(props);
 	if (err != 0) {
 		nvlist_free(bmarks);
 		return (B_FALSE);
 	}
 
 	rv = nvlist_exists(bmarks, bmark_name);
 	nvlist_free(bmarks);
 	return (rv);
 }
 
 /*
  * Build a handle for the bookmark 'path'.  The library handle and head type
  * come from 'parent'; the property list is a private copy of 'bmark_props'.
  * Returns NULL if the handle cannot be allocated, the properties cannot be
  * duplicated, or no pool handle can be obtained.
  */
 zfs_handle_t *
 make_bookmark_handle(zfs_handle_t *parent, const char *path,
     nvlist_t *bmark_props)
 {
 	zfs_handle_t *bmark = calloc(1, sizeof (zfs_handle_t));
 
 	if (bmark == NULL)
 		return (NULL);
 
 	/* Name and owning library handle. */
 	bmark->zfs_hdl = parent->zfs_hdl;
 	(void) strlcpy(bmark->zfs_name, path, sizeof (bmark->zfs_name));
 
 	/* Private copy of the bookmark's properties. */
 	if (nvlist_dup(bmark_props, &bmark->zfs_props, 0) != 0) {
 		free(bmark);
 		return (NULL);
 	}
 
 	/* Types: the head type is the parent's, the handle is a bookmark. */
 	bmark->zfs_head_type = parent->zfs_head_type;
 	bmark->zfs_type = ZFS_TYPE_BOOKMARK;
 
 	bmark->zpool_hdl = zpool_handle(bmark);
 	if (bmark->zpool_hdl == NULL) {
 		nvlist_free(bmark->zfs_props);
 		free(bmark);
 		return (NULL);
 	}
 
 	return (bmark);
 }
 
 /*
  * Search state shared between zfs_open() and zfs_open_bookmarks_cb() while
  * iterating over a dataset's bookmarks.
  */
 struct zfs_open_bookmarks_cb_data {
 	const char *path;	/* full name of the bookmark being looked for */
 	zfs_handle_t *zhp;	/* set by the callback to the matching handle */
 };
 
 /*
  * Bookmark iteration callback used by zfs_open().  When the handle's name
  * equals the path being searched for, the handle is saved in the callback
  * data and EEXIST is returned to tell the caller we are done.  Any other
  * handle is closed and 0 is returned to ask for the next one.
  */
 static int
 zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data)
 {
 	struct zfs_open_bookmarks_cb_data *search = data;
 
 	if (strcmp(search->path, zfs_get_name(zhp)) != 0) {
 		/* Not the one we want: drop it and keep going. */
 		zfs_close(zhp);
 		return (0);
 	}
 
 	/* Found it; ownership of the handle passes to the searcher. */
 	search->zhp = zhp;
 	return (EEXIST);
 }
 
 /*
  * Opens the given snapshot, bookmark, filesystem, or volume.  The 'types'
  * argument is a mask of acceptable types.  On failure an error is recorded
  * on the library handle and NULL is returned; errno is set to EINVAL for an
  * invalid name or an unacceptable type, and to ENOENT when the bookmark
  * iteration completes without finding the requested bookmark.
  */
 zfs_handle_t *
 zfs_open(libzfs_handle_t *hdl, const char *path, int types)
 {
 	zfs_handle_t *zhp;
 	char errbuf[ERRBUFLEN];
 	const char *pound;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
 
 	/*
 	 * Validate the name before we even try to open it.
 	 */
 	if (!zfs_validate_name(hdl, path, types, B_FALSE)) {
 		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 		errno = EINVAL;
 		return (NULL);
 	}
 
 	/*
 	 * Bookmarks need to be handled separately.
 	 */
 	pound = strchr(path, '#');
 	if (pound != NULL) {
 		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 		zfs_handle_t *pzhp;
 		struct zfs_open_bookmarks_cb_data cb_data = {path, NULL};
 		int iter_err;
 
 		/*
 		 * We need to cut out '#' and everything after '#'
 		 * to get the parent dataset name only.
 		 */
 		assert(pound - path < sizeof (dsname));
 		(void) strlcpy(dsname, path,
 		    MIN(sizeof (dsname), pound - path + 1));
 
 		/*
 		 * Create handle for the parent dataset.
 		 */
 		errno = 0;
 		pzhp = make_dataset_handle(hdl, dsname);
 		if (pzhp == NULL) {
 			(void) zfs_standard_error(hdl, errno, errbuf);
 			return (NULL);
 		}
 
 		/*
 		 * Iterate bookmarks to find the right one.  The callback
 		 * stores the matching handle in cb_data.zhp.
 		 */
 		errno = 0;
 		iter_err = zfs_iter_bookmarks_v2(pzhp, 0,
 		    zfs_open_bookmarks_cb, &cb_data);
 		if (cb_data.zhp == NULL) {
 			if (iter_err == 0) {
 				/* Clean iteration, but no such bookmark. */
 				(void) zfs_error(hdl, EZFS_NOENT, errbuf);
 				zfs_close(pzhp);
 				errno = ENOENT;
 			} else {
 				/* The iteration itself failed. */
 				(void) zfs_standard_error(hdl, errno, errbuf);
 				zfs_close(pzhp);
 			}
 			return (NULL);
 		}
 		zhp = cb_data.zhp;
 
 		/*
 		 * Cleanup.
 		 */
 		zfs_close(pzhp);
 	} else {
 		/*
 		 * Try to get stats for the dataset, which will tell us if it
 		 * exists.
 		 */
 		errno = 0;
 		zhp = make_dataset_handle(hdl, path);
 		if (zhp == NULL) {
 			(void) zfs_standard_error(hdl, errno, errbuf);
 			return (NULL);
 		}
 	}
 
 	/* The name resolved; make sure it is one of the requested types. */
 	if (!(types & zhp->zfs_type)) {
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		zfs_close(zhp);
 		errno = EINVAL;
 		return (NULL);
 	}
 
 	return (zhp);
 }
 
 /*
  * Release a ZFS handle.  This only frees memory owned by the handle: the
  * mount options string, the three property nvlists, and the handle itself.
  */
 void
 zfs_close(zfs_handle_t *zhp)
 {
 	/* free() accepts NULL, so no guard is needed for an unset string. */
 	free(zhp->zfs_mntopts);
 
 	nvlist_free(zhp->zfs_props);
 	nvlist_free(zhp->zfs_user_props);
 	nvlist_free(zhp->zfs_recvd_props);
 
 	free(zhp);
 }
 
 /*
  * One cached mount table entry.  Nodes live in the per-handle
  * libzfs_mnttab_cache AVL tree, linked through mtn_node and ordered by
  * mtn_mt.mnt_special (see libzfs_mnttab_cache_compare()).
  */
 typedef struct mnttab_node {
 	struct mnttab mtn_mt;
 	avl_node_t mtn_node;
 } mnttab_node_t;
 
 /*
  * AVL comparator for the mnttab cache: orders two mnttab_node_t entries by
  * the string comparison of their mnt_special fields, normalized with
  * TREE_ISIGN().
  */
 static int
 libzfs_mnttab_cache_compare(const void *arg1, const void *arg2)
 {
 	const mnttab_node_t *lhs = arg1;
 	const mnttab_node_t *rhs = arg2;
 	int order = strcmp(lhs->mtn_mt.mnt_special, rhs->mtn_mt.mnt_special);
 
 	return (TREE_ISIGN(order));
 }
 
 /*
  * Set up the handle's mnttab cache: initialize the lock that guards it and
  * create the AVL tree of mnttab_node_t entries, ordered by
  * libzfs_mnttab_cache_compare().
  */
 void
 libzfs_mnttab_init(libzfs_handle_t *hdl)
 {
 	pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
 	/*
 	 * NOTE(review): this inspects the tree before avl_create() runs, so
 	 * it presumably relies on the handle memory being zero-filled --
 	 * confirm against the handle allocation.
 	 */
 	assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
 	avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
 	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
 }
 
 /*
  * Read MNTTAB and add a cache node for every ZFS entry whose mnt_special is
  * not already in the cache.  Returns 0 on success, or ENOENT if MNTTAB
  * cannot be opened.
  */
 static int
 libzfs_mnttab_update(libzfs_handle_t *hdl)
 {
 	FILE *fp = fopen(MNTTAB, "re");
 	struct mnttab entry;
 
 	if (fp == NULL)
 		return (ENOENT);
 
 	while (getmntent(fp, &entry) == 0) {
 		mnttab_node_t *node;
 		avl_index_t where;
 
 		/* Only ZFS mounts are cached. */
 		if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
 			continue;
 
 		/* Deep-copy the entry; 'entry' is reused on the next pass. */
 		node = zfs_alloc(hdl, sizeof (mnttab_node_t));
 		node->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special);
 		node->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp);
 		node->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype);
 		node->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts);
 
 		if (avl_find(&hdl->libzfs_mnttab_cache, node, &where) == NULL) {
 			avl_add(&hdl->libzfs_mnttab_cache, node);
 			continue;
 		}
 
 		/* Exclude duplicate mounts */
 		free(node->mtn_mt.mnt_special);
 		free(node->mtn_mt.mnt_mountp);
 		free(node->mtn_mt.mnt_fstype);
 		free(node->mtn_mt.mnt_mntopts);
 		free(node);
 	}
 
 	(void) fclose(fp);
 	return (0);
 }
 
 /*
  * Tear down the handle's mnttab cache: free every cached node together with
  * its strings, destroy the AVL tree, and destroy the cache lock.
  */
 void
 libzfs_mnttab_fini(libzfs_handle_t *hdl)
 {
 	void *cookie = NULL;
 
 	for (;;) {
 		mnttab_node_t *node =
 		    avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie);
 
 		if (node == NULL)
 			break;
 
 		free(node->mtn_mt.mnt_special);
 		free(node->mtn_mt.mnt_mountp);
 		free(node->mtn_mt.mnt_fstype);
 		free(node->mtn_mt.mnt_mntopts);
 		free(node);
 	}
 	avl_destroy(&hdl->libzfs_mnttab_cache);
 	(void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
 }
 
 /*
  * Turn use of the mnttab cache on or off for this handle.  The flag is
  * consulted by libzfs_mnttab_find(), which reads MNTTAB directly while the
  * cache is disabled.
  */
 void
 libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable)
 {
 	hdl->libzfs_mnttab_enable = enable;
 }
 
 /*
  * Look up the mount table entry for the ZFS dataset 'fsname' and copy it
  * into '*entry'.  Returns 0 when found and ENOENT when not found or when
  * MNTTAB cannot be opened.
  *
  * With the cache disabled, any cached nodes are discarded and MNTTAB is
  * searched directly.  With the cache enabled, the cache is populated on
  * first use and searched under the cache lock; in that case the strings in
  * '*entry' are the cache's own pointers, not copies.
  */
 int
 libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname,
     struct mnttab *entry)
 {
 	mnttab_node_t key;
 	mnttab_node_t *hit;
 	int ret = ENOENT;
 
 	if (!hdl->libzfs_mnttab_enable) {
 		struct mnttab srch = { 0 };
 		FILE *fp;
 
 		/* Drop whatever an earlier cached phase left behind. */
 		if (avl_numnodes(&hdl->libzfs_mnttab_cache))
 			libzfs_mnttab_fini(hdl);
 
 		fp = fopen(MNTTAB, "re");
 		if (fp == NULL)
 			return (ENOENT);
 
 		srch.mnt_special = (char *)fsname;
 		srch.mnt_fstype = (char *)MNTTYPE_ZFS;
 		if (getmntany(fp, entry, &srch))
 			ret = ENOENT;
 		else
 			ret = 0;
 		(void) fclose(fp);
 		return (ret);
 	}
 
 	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
 
 	/* Fill the cache lazily the first time it is consulted. */
 	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
 		int error = libzfs_mnttab_update(hdl);
 
 		if (error != 0) {
 			pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 			return (error);
 		}
 	}
 
 	/* Only mnt_special takes part in the tree's ordering. */
 	key.mtn_mt.mnt_special = (char *)fsname;
 	hit = avl_find(&hdl->libzfs_mnttab_cache, &key, NULL);
 	if (hit != NULL) {
 		*entry = hit->mtn_mt;
 		ret = 0;
 	}
 
 	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 	return (ret);
 }
 
 /*
  * Record a new ZFS mount in the mnttab cache.  Nothing is added while the
  * cache is empty, or when an entry for 'special' is already present.
  */
 void
 libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special,
     const char *mountp, const char *mntopts)
 {
 	mnttab_node_t *node;
 
 	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
 
 	if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
 		pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 		return;
 	}
 
 	node = zfs_alloc(hdl, sizeof (mnttab_node_t));
 	node->mtn_mt.mnt_special = zfs_strdup(hdl, special);
 	node->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
 	node->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
 	node->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
 
 	if (avl_find(&hdl->libzfs_mnttab_cache, node, NULL) == NULL) {
 		avl_add(&hdl->libzfs_mnttab_cache, node);
 	} else {
 		/*
 		 * Another thread may have already added this entry
 		 * via libzfs_mnttab_update. If so we should skip it.
 		 */
 		free(node->mtn_mt.mnt_special);
 		free(node->mtn_mt.mnt_mountp);
 		free(node->mtn_mt.mnt_fstype);
 		free(node->mtn_mt.mnt_mntopts);
 		free(node);
 	}
 
 	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 }
 
 /*
  * Remove the cache entry whose mnt_special equals 'fsname', if there is one,
  * and free it.  The lookup and removal happen under the cache lock.
  */
 void
 libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
 {
 	mnttab_node_t key;
 	mnttab_node_t *victim;
 
 	pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
 
 	/* Only mnt_special takes part in the tree's ordering. */
 	key.mtn_mt.mnt_special = (char *)fsname;
 	victim = avl_find(&hdl->libzfs_mnttab_cache, (void *)&key, NULL);
 	if (victim != NULL) {
 		avl_remove(&hdl->libzfs_mnttab_cache, victim);
 		free(victim->mtn_mt.mnt_special);
 		free(victim->mtn_mt.mnt_mountp);
 		free(victim->mtn_mt.mnt_fstype);
 		free(victim->mtn_mt.mnt_mntopts);
 		free(victim);
 	}
 
 	pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
 }
 
 /*
  * Store the version property of the pool containing this dataset in
  * '*spa_version'.  Returns 0 on success, or -1 (leaving '*spa_version'
  * untouched) when the handle has no pool handle.
  */
 int
 zfs_spa_version(zfs_handle_t *zhp, int *spa_version)
 {
 	zpool_handle_t *pool = zhp->zpool_hdl;
 
 	if (pool == NULL)
 		return (-1);
 
 	*spa_version = zpool_get_prop_int(pool, ZPOOL_PROP_VERSION, NULL);
 	return (0);
 }
 
 /*
  * The choice of reservation property depends on the SPA version: pools at
  * SPA_VERSION_REFRESERVATION or later use refreservation, older pools use
  * reservation.  Returns 0 on success, or -1 if the SPA version cannot be
  * determined.
  */
 static int
 zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop)
 {
 	int version;
 
 	if (zfs_spa_version(zhp, &version) < 0)
 		return (-1);
 
 	*resv_prop = (version >= SPA_VERSION_REFRESERVATION) ?
 	    ZFS_PROP_REFRESERVATION : ZFS_PROP_RESERVATION;
 
 	return (0);
 }
 
 /*
  * Given an nvlist of properties to set, validates that they are correct, and
  * parses any numeric properties (index, boolean, etc) if they are specified as
  * strings.
  *
  *	hdl		library handle; receives the error state on failure
  *	type		dataset type the properties are being validated for
  *	nvl		caller's property list; it is only read here
  *	zoned		non-zero if the dataset's 'zoned' property is set
  *	zhp		handle of the existing dataset, or NULL; set-once
  *			properties are only accepted when it is NULL
  *	zpool_hdl	pool handle used for block size limits, or NULL
  *	key_params_ok	B_TRUE to accept encryption key parameter properties
  *			that would otherwise be rejected as readonly
  *	errbuf		action string passed to zfs_error() on failure
  *
  * Returns a newly allocated nvlist holding the validated properties, which
  * the caller releases with nvlist_free(), or NULL on failure.
  */
 nvlist_t *
 zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
     uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
     boolean_t key_params_ok, const char *errbuf)
 {
 	nvpair_t *elem;
 	uint64_t intval;
 	const char *strval;
 	zfs_prop_t prop;
 	nvlist_t *ret;
 	int chosen_normal = -1;
 	int chosen_utf = -1;
 	int set_maxbs = 0;
 
 	if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
 		return (NULL);
 	}
 
 	/*
 	 * Make sure this property is valid and applies to this type.
 	 */
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
 
 		prop = zfs_name_to_prop(propname);
 		if (prop == ZPROP_USERPROP && zfs_prop_user(propname)) {
 			/*
 			 * This is a user property: make sure it's a
 			 * string, and that it's less than ZAP_MAXNAMELEN.
 			 */
 			if (nvpair_type(elem) != DATA_TYPE_STRING) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be a string"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "property name '%s' is too long"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			(void) nvpair_value_string(elem, &strval);
 			if (nvlist_add_string(ret, propname, strval) != 0) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 			continue;
 		}
 
 		/*
 		 * Currently, only user properties can be modified on
 		 * snapshots.
 		 */
 		if (type == ZFS_TYPE_SNAPSHOT) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "this property can not be modified for snapshots"));
 			(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 			goto error;
 		}
 
 		if (prop == ZPROP_USERPROP && zfs_prop_userquota(propname)) {
 			zfs_userquota_prop_t uqtype;
 			char *newpropname = NULL;
 			char domain[128];
 			uint64_t rid;
 			uint64_t valary[3];
 			int rc;
 
 			if (userquota_propname_decode(propname, zoned,
 			    &uqtype, domain, sizeof (domain), &rid) != 0) {
 				zfs_error_aux(hdl,
 				    dgettext(TEXT_DOMAIN,
 				    "'%s' has an invalid user/group name"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			if (uqtype != ZFS_PROP_USERQUOTA &&
 			    uqtype != ZFS_PROP_GROUPQUOTA &&
 			    uqtype != ZFS_PROP_USEROBJQUOTA &&
 			    uqtype != ZFS_PROP_GROUPOBJQUOTA &&
 			    uqtype != ZFS_PROP_PROJECTQUOTA &&
 			    uqtype != ZFS_PROP_PROJECTOBJQUOTA) {
 				zfs_error_aux(hdl,
 				    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
 				    propname);
 				(void) zfs_error(hdl, EZFS_PROPREADONLY,
 				    errbuf);
 				goto error;
 			}
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				(void) nvpair_value_string(elem, &strval);
 				if (strcmp(strval, "none") == 0) {
 					intval = 0;
 				} else if (zfs_nicestrtonum(hdl,
 				    strval, &intval) != 0) {
 					(void) zfs_error(hdl,
 					    EZFS_BADPROP, errbuf);
 					goto error;
 				}
 			} else if (nvpair_type(elem) ==
 			    DATA_TYPE_UINT64) {
 				(void) nvpair_value_uint64(elem, &intval);
 				if (intval == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "use 'none' to disable "
 					    "{user|group|project}quota"));
 					/*
 					 * Record the error code as well, as
 					 * every other failure path here does;
 					 * zfs_error_aux() alone only stores
 					 * the description.
 					 */
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 			} else {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be a number"), propname);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			/*
 			 * Encode the prop name as
 			 * userquota@<hex-rid>-domain, to make it easy
 			 * for the kernel to decode.
 			 */
 			rc = asprintf(&newpropname, "%s%llx-%s",
 			    zfs_userquota_prop_prefixes[uqtype],
 			    (longlong_t)rid, domain);
 			if (rc == -1 || newpropname == NULL) {
 				(void) no_memory(hdl);
 				goto error;
 			}
 
 			valary[0] = uqtype;
 			valary[1] = rid;
 			valary[2] = intval;
 			if (nvlist_add_uint64_array(ret, newpropname,
 			    valary, 3) != 0) {
 				free(newpropname);
 				(void) no_memory(hdl);
 				goto error;
 			}
 			free(newpropname);
 			continue;
 		} else if (prop == ZPROP_USERPROP &&
 		    zfs_prop_written(propname)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' is readonly"),
 			    propname);
 			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
 			goto error;
 		}
 
 		if (prop == ZPROP_INVAL) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property '%s'"), propname);
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 			goto error;
 		}
 
 		if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' does not "
 			    "apply to datasets of this type"), propname);
 			(void) zfs_error(hdl, EZFS_PROPTYPE, errbuf);
 			goto error;
 		}
 
 		/*
 		 * Readonly properties are rejected unless they are set-once
 		 * properties with no existing dataset handle, or encryption
 		 * key parameters the caller explicitly allowed.
 		 */
 		if (zfs_prop_readonly(prop) &&
 		    !(zfs_prop_setonce(prop) && zhp == NULL) &&
 		    !(zfs_prop_encryption_key_param(prop) && key_params_ok)) {
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "'%s' is readonly"),
 			    propname);
 			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
 			goto error;
 		}
 
 		if (zprop_parse_value(hdl, elem, prop, type, ret,
 		    &strval, &intval, errbuf) != 0)
 			goto error;
 
 		/*
 		 * Perform some additional checks for specific properties.
 		 */
 		switch (prop) {
 		case ZFS_PROP_VERSION:
 		{
 			int version;
 
 			if (zhp == NULL)
 				break;
 			version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
 			if (intval < version) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "Can not downgrade; already at version %u"),
 				    version);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 		}
 
 		case ZFS_PROP_VOLBLOCKSIZE:
 		case ZFS_PROP_RECORDSIZE:
 		{
 			int maxbs = SPA_MAXBLOCKSIZE;
 			char buf[64];
 
 			if (zpool_hdl != NULL) {
 				maxbs = zpool_get_prop_int(zpool_hdl,
 				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
 			}
 			/*
 			 * The value must be a power of two between
 			 * SPA_MINBLOCKSIZE and maxbs.
 			 */
 			if (intval < SPA_MINBLOCKSIZE ||
 			    intval > maxbs || !ISP2(intval)) {
 				zfs_nicebytes(maxbs, buf, sizeof (buf));
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' must be power of 2 from 512B "
 				    "to %s"), propname, buf);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			/* save the ZFS_PROP_RECORDSIZE during create op */
 			if (zpool_hdl == NULL && prop == ZFS_PROP_RECORDSIZE) {
 				set_maxbs = intval;
 			}
 			break;
 		}
 
 		case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
 		{
 			int maxbs =
 			    set_maxbs == 0 ? SPA_OLD_MAXBLOCKSIZE : set_maxbs;
 			char buf[64];
 
 			if (zpool_hdl != NULL) {
 				char state[64] = "";
 
 				maxbs = zpool_get_prop_int(zpool_hdl,
 				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
 
 				/*
 				 * Issue a warning but do not fail so that
 				 * tests for settable properties succeed.
 				 */
 				if (zpool_prop_get_feature(zpool_hdl,
 				    "feature@allocation_classes", state,
 				    sizeof (state)) != 0 ||
 				    strcmp(state, ZFS_FEATURE_ACTIVE) != 0) {
 					(void) fprintf(stderr, gettext(
 					    "%s: property requires a special "
 					    "device in the pool\n"), propname);
 				}
 			}
 			if (intval != 0 &&
 			    (intval < SPA_MINBLOCKSIZE ||
 			    intval > maxbs || !ISP2(intval))) {
 				zfs_nicebytes(maxbs, buf, sizeof (buf));
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "invalid '%s=%llu' property: must be zero "
 				    "or a power of 2 from 512B to %s"),
 				    propname, (unsigned long long)intval, buf);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 		}
 
 		case ZFS_PROP_MLSLABEL:
 		{
 #ifdef HAVE_MLSLABEL
 			/*
 			 * Verify the mlslabel string and convert to
 			 * internal hex label string.
 			 */
 
 			m_label_t *new_sl;
 			char *hex = NULL;	/* internal label string */
 
 			/* Default value is already OK. */
 			if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
 				break;
 
 			/* Verify the label can be converted to binary form */
 			if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) ||
 			    (str_to_label(strval, &new_sl, MAC_LABEL,
 			    L_NO_CORRECTION, NULL) == -1)) {
 				goto badlabel;
 			}
 
 			/* Now translate to hex internal label string */
 			if (label_to_str(new_sl, &hex, M_INTERNAL,
 			    DEF_NAMES) != 0) {
 				if (hex)
 					free(hex);
 				goto badlabel;
 			}
 			m_label_free(new_sl);
 
 			/* If string is already in internal form, we're done. */
 			if (strcmp(strval, hex) == 0) {
 				free(hex);
 				break;
 			}
 
 			/* Replace the label string with the internal form. */
 			(void) nvlist_remove(ret, zfs_prop_to_name(prop),
 			    DATA_TYPE_STRING);
 			fnvlist_add_string(ret, zfs_prop_to_name(prop), hex);
 			free(hex);
 
 			break;
 
 badlabel:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid mlslabel '%s'"), strval);
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 			m_label_free(new_sl);	/* OK if null */
 			goto error;
 #else
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "mlslabels are unsupported"));
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 			goto error;
 #endif /* HAVE_MLSLABEL */
 		}
 
 		case ZFS_PROP_MOUNTPOINT:
 		{
 			namecheck_err_t why;
 
 			if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 ||
 			    strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0)
 				break;
 
 			if (mountpoint_namecheck(strval, &why)) {
 				switch (why) {
 				case NAME_ERR_LEADING_SLASH:
 					zfs_error_aux(hdl,
 					    dgettext(TEXT_DOMAIN,
 					    "'%s' must be an absolute path, "
 					    "'none', or 'legacy'"), propname);
 					break;
 				case NAME_ERR_TOOLONG:
 					zfs_error_aux(hdl,
 					    dgettext(TEXT_DOMAIN,
 					    "component of '%s' is too long"),
 					    propname);
 					break;
 
 				default:
 					zfs_error_aux(hdl,
 					    dgettext(TEXT_DOMAIN,
 					    "(%d) not defined"),
 					    why);
 					break;
 				}
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			zfs_fallthrough;
 		}
 
 		case ZFS_PROP_SHARESMB:
 		case ZFS_PROP_SHARENFS:
 			/*
 			 * For the mountpoint and sharenfs or sharesmb
 			 * properties, check if it can be set in a
 			 * global/non-global zone based on
 			 * the zoned property value:
 			 *
 			 *		global zone	    non-global zone
 			 * --------------------------------------------------
 			 * zoned=on	mountpoint (no)	    mountpoint (yes)
 			 *		sharenfs (no)	    sharenfs (no)
 			 *		sharesmb (no)	    sharesmb (no)
 			 *
 			 * zoned=off	mountpoint (yes)	N/A
 			 *		sharenfs (yes)
 			 *		sharesmb (yes)
 			 */
 			if (zoned) {
 				if (getzoneid() == GLOBAL_ZONEID) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set on "
 					    "dataset in a non-global zone"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				} else if (prop == ZFS_PROP_SHARENFS ||
 				    prop == ZFS_PROP_SHARESMB) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set in "
 					    "a non-global zone"), propname);
 					(void) zfs_error(hdl, EZFS_ZONED,
 					    errbuf);
 					goto error;
 				}
 			} else if (getzoneid() != GLOBAL_ZONEID) {
 				/*
 				 * If zoned property is 'off', this must be in
 				 * a global zone. If not, something is wrong.
 				 */
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "'%s' cannot be set while dataset "
 				    "'zoned' property is set"), propname);
 				(void) zfs_error(hdl, EZFS_ZONED, errbuf);
 				goto error;
 			}
 
 			/*
 			 * At this point, it is legitimate to set the
 			 * property. Now we want to make sure that the
 			 * property value is valid if it is sharenfs or
 			 * sharesmb ('on' and 'off' need no further checking).
 			 */
 			if ((prop == ZFS_PROP_SHARENFS ||
 			    prop == ZFS_PROP_SHARESMB) &&
 			    strcmp(strval, "on") != 0 &&
 			    strcmp(strval, "off") != 0) {
 				enum sa_protocol proto;
 
 				if (prop == ZFS_PROP_SHARESMB)
 					proto = SA_PROTOCOL_SMB;
 				else
 					proto = SA_PROTOCOL_NFS;
 
 				if (sa_validate_shareopts(strval, proto) !=
 				    SA_OK) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be set to invalid "
 					    "options"), propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 			}
 
 			break;
 
 		case ZFS_PROP_KEYLOCATION:
 			if (!zfs_prop_valid_keylocation(strval, B_FALSE)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "invalid keylocation"));
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 
 			if (zhp != NULL) {
 				uint64_t crypt =
 				    zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION);
 
 				if (crypt == ZIO_CRYPT_OFF &&
 				    strcmp(strval, "none") != 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "keylocation must be 'none' "
 					    "for unencrypted datasets"));
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				} else if (crypt != ZIO_CRYPT_OFF &&
 				    strcmp(strval, "none") == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "keylocation must not be 'none' "
 					    "for encrypted datasets"));
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 			}
 			break;
 
 		case ZFS_PROP_PBKDF2_ITERS:
 			if (intval < MIN_PBKDF2_ITERATIONS) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "minimum pbkdf2 iterations is %u"),
 				    MIN_PBKDF2_ITERATIONS);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
 			break;
 
 		case ZFS_PROP_UTF8ONLY:
 			chosen_utf = (int)intval;
 			break;
 
 		case ZFS_PROP_NORMALIZE:
 			chosen_normal = (int)intval;
 			break;
 
 		default:
 			break;
 		}
 
 		/*
 		 * For changes to existing volumes, we have some additional
 		 * checks to enforce.
 		 */
 		if (type == ZFS_TYPE_VOLUME && zhp != NULL) {
 			uint64_t blocksize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLBLOCKSIZE);
 			char buf[64];
 
 			switch (prop) {
 			case ZFS_PROP_VOLSIZE:
 				if (intval % blocksize != 0) {
 					zfs_nicebytes(blocksize, buf,
 					    sizeof (buf));
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' must be a multiple of "
 					    "volume block size (%s)"),
 					    propname, buf);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 
 				if (intval == 0) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "'%s' cannot be zero"),
 					    propname);
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 
 			default:
 				break;
 			}
 		}
 
 		/* check encryption properties */
 		if (zhp != NULL) {
 			int64_t crypt = zfs_prop_get_int(zhp,
 			    ZFS_PROP_ENCRYPTION);
 
 			switch (prop) {
 			case ZFS_PROP_COPIES:
 				if (crypt != ZIO_CRYPT_OFF && intval > 2) {
 					zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 					    "encrypted datasets cannot have "
 					    "3 copies"));
 					(void) zfs_error(hdl, EZFS_BADPROP,
 					    errbuf);
 					goto error;
 				}
 				break;
 			default:
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If normalization was chosen, but no UTF8 choice was made,
 	 * enforce rejection of non-UTF8 names.
 	 *
 	 * If normalization was chosen, but rejecting non-UTF8 names
 	 * was explicitly not chosen, it is an error.
 	 *
 	 * If utf8only was turned off, but the parent has normalization,
 	 * turn off normalization.
 	 */
 	if (chosen_normal > 0 && chosen_utf < 0) {
 		if (nvlist_add_uint64(ret,
 		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) {
 			(void) no_memory(hdl);
 			goto error;
 		}
 	} else if (chosen_normal > 0 && chosen_utf == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "'%s' must be set 'on' if normalization chosen"),
 		    zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
 		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		goto error;
 	} else if (chosen_normal < 0 && chosen_utf == 0) {
 		if (nvlist_add_uint64(ret,
 		    zfs_prop_to_name(ZFS_PROP_NORMALIZE), 0) != 0) {
 			(void) no_memory(hdl);
 			goto error;
 		}
 	}
 	return (ret);
 
 error:
 	nvlist_free(ret);
 	return (NULL);
 }
 
 /*
  * When the volsize of an existing volume is being changed and the current
  * reservation equals the value derived from the current volsize, add a
  * matching reservation for the new volsize to 'nvl'.
  *
  * Returns 1 if a reservation was added, 0 if nothing had to be done (the
  * current reservation does not match the derived value, or 'nvl' already
  * names the reservation property), and -1 on failure.
  */
 static int
 zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 {
 	zpool_handle_t *zph = zpool_handle(zhp);
 	zfs_prop_t resv_prop;
 	const char *resv_name;
 	uint64_t cur_volsize;
 	uint64_t cur_resv;
 	uint64_t want_volsize;
 	uint64_t want_resv = 0;
 	nvlist_t *sizing;
 	int rv;
 
 	cur_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 	if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
 		return (-1);
 	cur_resv = zfs_prop_get_int(zhp, resv_prop);
 	resv_name = zfs_prop_to_name(resv_prop);
 
 	/* The reservation estimate is computed from the volume block size. */
 	sizing = fnvlist_alloc();
 	fnvlist_add_uint64(sizing, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
 
 	if (zvol_volsize_to_reservation(zph, cur_volsize, sizing) !=
 	    cur_resv || nvlist_exists(nvl, resv_name)) {
 		/* Reservation is not the derived one, or is set explicitly. */
 		rv = 0;
 	} else if (nvlist_lookup_uint64(nvl,
 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &want_volsize) != 0) {
 		rv = -1;
 	} else {
 		want_resv = zvol_volsize_to_reservation(zph, want_volsize,
 		    sizing);
 		rv = 1;
 	}
 	fnvlist_free(sizing);
 
 	if (rv != 1)
 		return (rv);
 
 	if (nvlist_add_uint64(nvl, resv_name, want_resv) != 0) {
 		(void) no_memory(zhp->zfs_hdl);
 		return (-1);
 	}
 	return (1);
 }
 
 /*
  * Helper for 'zfs {set|clone} refreservation=auto'.  Must be called after
  * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value.
  * Replaces the sentinel in 'nvl' with the reservation computed from the
  * volsize being set (or the current volsize when none is in 'nvl').
  *
  * Return codes must match zfs_add_synthetic_resv(): 1 if the value was
  * rewritten, 0 if there was nothing to do, -1 on failure.
  */
 static int
 zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
 {
 	zfs_prop_t resv_prop;
 	const char *resv_name;
 	uint64_t requested;
 	uint64_t vsize;
 	uint64_t computed;
 	nvlist_t *sizing;
 
 	if (!ZFS_IS_VOLUME(zhp))
 		return (0);
 
 	if (zfs_which_resv_prop(zhp, &resv_prop) != 0)
 		return (-1);
 
 	if (resv_prop != ZFS_PROP_REFRESERVATION)
 		return (0);
 
 	resv_name = zfs_prop_to_name(resv_prop);
 
 	/*
 	 * Nothing to do unless a value is being set and that value is the
 	 * "auto" sentinel.
 	 */
 	if (nvlist_lookup_uint64(nvl, resv_name, &requested) != 0 ||
 	    requested != UINT64_MAX)
 		return (0);
 
 	sizing = fnvlist_alloc();
 	fnvlist_add_uint64(sizing, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 	    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
 
 	/* Prefer the volsize being set in this request, if any. */
 	if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 	    &vsize) != 0)
 		vsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 
 	computed = zvol_volsize_to_reservation(zpool_handle(zhp), vsize,
 	    sizing);
 	fnvlist_free(sizing);
 
 	(void) nvlist_remove_all(nvl, resv_name);
 	if (nvlist_add_uint64(nvl, resv_name, computed) != 0) {
 		(void) no_memory(zhp->zfs_hdl);
 		return (-1);
 	}
 	return (1);
 }
 
 /*
  * Report whether 'prop' is one of the properties that trigger a remount
  * after being changed: atime, relatime, devices, exec, setuid, readonly,
  * xattr and nbmand.
  */
 static boolean_t
 zfs_is_namespace_prop(zfs_prop_t prop)
 {
 	static const zfs_prop_t ns_props[] = {
 		ZFS_PROP_ATIME,
 		ZFS_PROP_RELATIME,
 		ZFS_PROP_DEVICES,
 		ZFS_PROP_EXEC,
 		ZFS_PROP_SETUID,
 		ZFS_PROP_READONLY,
 		ZFS_PROP_XATTR,
 		ZFS_PROP_NBMAND
 	};
 
 	for (size_t i = 0; i < sizeof (ns_props) / sizeof (ns_props[0]);
 	    i++) {
 		if (ns_props[i] == prop)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Set a single property, given by name and string value, on the dataset.
  * The pair is wrapped in a one-element nvlist and handed to
  * zfs_prop_set_list().
  *
  * Returns the result of zfs_prop_set_list(), or -1 if the nvlist could not
  * be built.
  */
 int
 zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *single = NULL;
 	char errbuf[ERRBUFLEN];
 	int rc;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
 	    zhp->zfs_name);
 
 	if (nvlist_alloc(&single, NV_UNIQUE_NAME, 0) == 0 &&
 	    nvlist_add_string(single, propname, propval) == 0) {
 		rc = zfs_prop_set_list(zhp, single);
 	} else {
 		(void) no_memory(hdl);
 		rc = -1;
 	}
 
 	nvlist_free(single);
 	return (rc);
 }
 
 /*
  * Set every property in the 'props' nvlist on the given dataset.  This is
  * zfs_prop_set_list_flags() with no flags set.
  */
 int
 zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
 {
 	const int no_flags = 0;
 
 	return (zfs_prop_set_list_flags(zhp, props, no_flags));
 }
 
 /*
  * Given an nvlist of property names, values and flags, set the properties
  * for the given dataset. If ZFS_SET_NOMOUNT is set, it allows to update
  * mountpoint, sharenfs and sharesmb properties without (un/re)mounting
  * and (un/re)sharing the dataset.
  *
  * The list is validated with zfs_valid_proplist(), a reservation may be
  * added or adjusted for volumes, a changelist is gathered and prefixed for
  * each property, and then everything is sent with one ZFS_IOC_SET_PROP
  * ioctl.  On success the changelists are postfixed, the handle's stats are
  * refreshed and the dataset is remounted if a namespace property changed.
  *
  * Returns 0 on success and non-zero on failure.
  */
 int
 zfs_prop_set_list_flags(zfs_handle_t *zhp, nvlist_t *props, int flags)
 {
 	zfs_cmd_t zc = {"\0"};
 	int ret = -1;
 	prop_changelist_t **cls = NULL;
 	int cl_idx;
 	char errbuf[ERRBUFLEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *nvl;
 	int nvl_len = 0;
 	int added_resv = 0;
 	zfs_prop_t prop;
 	boolean_t nsprop = B_FALSE;
 	nvpair_t *elem;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
 	    zhp->zfs_name);
 
 	if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
 	    B_FALSE, errbuf)) == NULL)
 		goto error;
 
 	/*
 	 * We have to check for any extra properties which need to be added
 	 * before computing the length of the nvlist.
 	 */
 	for (elem = nvlist_next_nvpair(nvl, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(nvl, elem)) {
 		if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE &&
 		    (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) {
 			goto error;
 		}
 	}
 
 	if (added_resv != 1 &&
 	    (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) {
 		goto error;
 	}
 
 	/*
 	 * Check how many properties we're setting and allocate an array to
 	 * store changelist pointers for postfix().
 	 */
 	for (elem = nvlist_next_nvpair(nvl, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(nvl, elem))
 		nvl_len++;
 	if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL)
 		goto error;
 
 	cl_idx = 0;
 	for (elem = nvlist_next_nvpair(nvl, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(nvl, elem)) {
 
 		prop = zfs_name_to_prop(nvpair_name(elem));
 		nsprop |= zfs_is_namespace_prop(prop);
 
 		assert(cl_idx < nvl_len);
 		/*
 		 * We don't want to unmount & remount the dataset when changing
 		 * its canmount property to 'on' or 'noauto'.  We only use
 		 * the changelist logic to unmount when setting canmount=off.
 		 */
 		if (prop != ZFS_PROP_CANMOUNT ||
 		    (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF &&
 		    zfs_is_mounted(zhp, NULL))) {
 			cls[cl_idx] = changelist_gather(zhp, prop,
 			    ((flags & ZFS_SET_NOMOUNT) ?
 			    CL_GATHER_DONT_UNMOUNT : 0), 0);
 			if (cls[cl_idx] == NULL)
 				goto error;
 		}
 
 		if (prop == ZFS_PROP_MOUNTPOINT &&
 		    changelist_haszonedchild(cls[cl_idx])) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "child dataset with inherited mountpoint is used "
 			    "in a non-global zone"));
 			ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 			goto error;
 		}
 
 		if (cls[cl_idx] != NULL &&
 		    (ret = changelist_prefix(cls[cl_idx])) != 0)
 			goto error;
 
 		cl_idx++;
 	}
 	assert(cl_idx == nvl_len);
 
 	/*
 	 * Execute the corresponding ioctl() to set this list of properties.
 	 */
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	zcmd_write_src_nvlist(hdl, &zc, nvl);
 	zcmd_alloc_dst_nvlist(hdl, &zc, 0);
 
 	ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
 
 	if (ret != 0) {
 		/*
 		 * Capture errno right away.  It is consulted several times
 		 * below, after calls that are not guaranteed to leave it
 		 * untouched.
 		 */
 		int ioc_err = errno;
 
 		if (zc.zc_nvlist_dst_filled == B_FALSE) {
 			(void) zfs_standard_error(hdl, ioc_err, errbuf);
 			goto error;
 		}
 
 		/* Get the list of unset properties back and report them. */
 		nvlist_t *errorprops = NULL;
 		if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0)
 			goto error;
 		for (elem = nvlist_next_nvpair(errorprops, NULL);
 		    elem != NULL;
 		    elem = nvlist_next_nvpair(errorprops, elem)) {
 			prop = zfs_name_to_prop(nvpair_name(elem));
 			zfs_setprop_error(hdl, prop, ioc_err, errbuf);
 		}
 		nvlist_free(errorprops);
 
 		if (added_resv && ioc_err == ENOSPC) {
 			/* clean up the volsize property we tried to set */
 			uint64_t old_volsize = zfs_prop_get_int(zhp,
 			    ZFS_PROP_VOLSIZE);
 			nvlist_free(nvl);
 			nvl = NULL;
 			zcmd_free_nvlists(&zc);
 
 			if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
 				goto error;
 			if (nvlist_add_uint64(nvl,
 			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
 			    old_volsize) != 0)
 				goto error;
 			zcmd_write_src_nvlist(hdl, &zc, nvl);
 			(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
 		}
 	} else {
 		/* Run every postfix; a failure is remembered in ret. */
 		for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
 			if (cls[cl_idx] != NULL) {
 				int clp_err = changelist_postfix(cls[cl_idx]);
 				if (clp_err != 0)
 					ret = clp_err;
 			}
 		}
 
 		if (ret == 0) {
 			/*
 			 * Refresh the statistics so the new property
 			 * value is reflected.
 			 */
 			(void) get_stats(zhp);
 
 			/*
 			 * Remount the filesystem to propagate the change
 			 * if one of the options handled by the generic
 			 * Linux namespace layer has been modified.
 			 */
 			if (nsprop && zfs_is_mounted(zhp, NULL))
 				ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0);
 		}
 	}
 
 error:
 	nvlist_free(nvl);
 	zcmd_free_nvlists(&zc);
 	if (cls != NULL) {
 		for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
 			if (cls[cl_idx] != NULL)
 				changelist_free(cls[cl_idx]);
 		}
 		free(cls);
 	}
 	return (ret);
 }
 
 /*
  * Given a property, inherit the value from the parent dataset, or if received
  * is TRUE, revert to the received value, if any.
  *
  * User properties are handled with a direct ZFS_IOC_INHERIT_PROP ioctl.
  * Native properties are checked first (not readonly, inheritable unless
  * 'received', valid for the dataset type), then a changelist is gathered
  * and prefixed before the ioctl and postfixed after it.
  *
  * Returns 0 on success and non-zero on failure.
  */
 int
 zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received)
 {
 	zfs_cmd_t zc = {"\0"};
 	int ret;
 	prop_changelist_t *cl;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[ERRBUFLEN];
 	zfs_prop_t prop;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot inherit %s for '%s'"), propname, zhp->zfs_name);
 
 	zc.zc_cookie = received;
 	if ((prop = zfs_name_to_prop(propname)) == ZPROP_USERPROP) {
 		/*
 		 * For user properties, the amount of work we have to do is very
 		 * small, so just do it here.
 		 */
 		if (!zfs_prop_user(propname)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 		if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0)
 			return (zfs_standard_error(hdl, errno, errbuf));
 
 		(void) get_stats(zhp);
 		return (0);
 	}
 
 	/*
 	 * Verify that this property is inheritable.
 	 */
 	if (zfs_prop_readonly(prop))
 		return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf));
 
 	if (!zfs_prop_inheritable(prop) && !received)
 		return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf));
 
 	/*
 	 * Check to see if the value applies to this type
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE))
 		return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
 
 	/*
 	 * Normalize the name, to get rid of shorthand abbreviations.
 	 */
 	propname = zfs_prop_to_name(prop);
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value));
 
 	if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
 	/*
 	 * Determine datasets which will be affected by this change, if any.
 	 */
 	if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
 		return (-1);
 
 	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "child dataset with inherited mountpoint is used "
 		    "in a non-global zone"));
 		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_prefix(cl)) != 0)
 		goto error;
 
 	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) {
 		/*
 		 * Report the failure while errno still holds the ioctl's
 		 * error; the changelist is released afterwards by the
 		 * common exit path.
 		 */
 		ret = zfs_standard_error(hdl, errno, errbuf);
 		goto error;
 	}
 
 	if ((ret = changelist_postfix(cl)) != 0)
 		goto error;
 
 	/*
 	 * Refresh the statistics so the new property is reflected.
 	 */
 	(void) get_stats(zhp);
 
 	/*
 	 * Remount the filesystem to propagate the change
 	 * if one of the options handled by the generic
 	 * Linux namespace layer has been modified.
 	 */
 	if (zfs_is_namespace_prop(prop) &&
 	    zfs_is_mounted(zhp, NULL))
 		ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0);
 
 error:
 	changelist_free(cl);
 	return (ret);
 }
 
 /*
  * True DSL properties are stored in an nvlist.  This function and
  * getprop_string() extract them appropriately.
  *
  * Returns the numeric value of 'prop' from the handle's property nvlist and
  * stores the source string in '*source' (left NULL if the entry carries no
  * source).  When the property is absent the default numeric value is
  * returned and '*source' is set to the empty string.
  */
 uint64_t
 getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, const char **source)
 {
 	nvlist_t *entry;
 	uint64_t result;
 
 	*source = NULL;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop),
 	    &entry) != 0) {
 		/* Not present: fall back to the property's default. */
 		verify(!zhp->zfs_props_table ||
 		    zhp->zfs_props_table[prop] == B_TRUE);
 		result = zfs_prop_default_numeric(prop);
 		*source = "";
 		return (result);
 	}
 
 	result = fnvlist_lookup_uint64(entry, ZPROP_VALUE);
 	(void) nvlist_lookup_string(entry, ZPROP_SOURCE, source);
 	return (result);
 }
 
 /*
  * String counterpart of getprop_uint64(): returns the string value of
  * 'prop' from the handle's property nvlist and stores the source string in
  * '*source' (left NULL if the entry carries no source).  When the property
  * is absent the default string value is returned and '*source' is set to
  * the empty string.
  */
 static const char *
 getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, const char **source)
 {
 	nvlist_t *entry;
 	const char *result;
 
 	*source = NULL;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop),
 	    &entry) != 0) {
 		/* Not present: fall back to the property's default. */
 		verify(!zhp->zfs_props_table ||
 		    zhp->zfs_props_table[prop] == B_TRUE);
 		result = zfs_prop_default_string(prop);
 		*source = "";
 		return (result);
 	}
 
 	result = fnvlist_lookup_string(entry, ZPROP_VALUE);
 	(void) nvlist_lookup_string(entry, ZPROP_SOURCE, source);
 	return (result);
 }
 
 /*
  * Report whether the handle's active property list is its received
  * property list, i.e. zfs_props is non-NULL and equal to zfs_recvd_props.
  */
 static boolean_t
 zfs_is_recvd_props_mode(zfs_handle_t *zhp)
 {
 	if (zhp->zfs_props == NULL)
 		return (B_FALSE);
 
 	return (zhp->zfs_props == zhp->zfs_recvd_props);
 }
 
 /*
  * Make the received property list the handle's active property list.  The
  * previously active list is stashed in '*cookie' so that
  * zfs_unset_recvd_props_mode() can put it back.
  */
 static void
 zfs_set_recvd_props_mode(zfs_handle_t *zhp, uintptr_t *cookie)
 {
 	nvlist_t *saved = zhp->zfs_props;
 
 	zhp->zfs_props = zhp->zfs_recvd_props;
 	*cookie = (uintptr_t)saved;
 }
 
 /*
  * Restore the property list stored in '*cookie' as the handle's active
  * property list and clear the cookie.
  */
 static void
 zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uintptr_t *cookie)
 {
 	nvlist_t *saved = (nvlist_t *)*cookie;
 
 	*cookie = 0;
 	zhp->zfs_props = saved;
 }
 
 /*
  * Internal function for getting a numeric property.  Both zfs_prop_get() and
  * zfs_prop_get_int() are built using this interface.
  *
  * Certain properties can be overridden using 'mount -o'.  In this case, scan
  * the contents of the /proc/self/mounts entry, searching for the
  * appropriate options. If they differ from the on-disk values, report the
  * current values and mark the source "temporary".
  *
  *   zhp     dataset handle to query
  *   prop    property to fetch
  *   src     optional (may be NULL); set to ZPROP_SRC_TEMPORARY when a mount
  *           option overrides the stored value, otherwise left untouched
  *   source  out: raw source string of the property; reset to NULL on entry
  *   val     out: numeric value of the property
  *
  * Returns 0 on success and -1 when the value cannot be obtained.  For a
  * property that is neither a number nor an index, an EZFS_BADPROP error is
  * recorded on the library handle and the result of zfs_error() is returned.
  */
 static int
 get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
     const char **source, uint64_t *val)
 {
 	zfs_cmd_t zc = {"\0"};
 	nvlist_t *zplprops = NULL;
 	struct mnttab mnt;
 	const char *mntopt_on = NULL;
 	const char *mntopt_off = NULL;
 	boolean_t received = zfs_is_recvd_props_mode(zhp);
 
 	*source = NULL;
 
 	/*
 	 * If the property is being fetched for a snapshot, check whether
 	 * the property is valid for the snapshot's head dataset type.
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT &&
 	    !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) {
 		*val = zfs_prop_default_numeric(prop);
 		return (-1);
 	}
 
 	/* Pick the pair of mount options that can override this property. */
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 		mntopt_on = MNTOPT_ATIME;
 		mntopt_off = MNTOPT_NOATIME;
 		break;
 
 	case ZFS_PROP_RELATIME:
 		mntopt_on = MNTOPT_RELATIME;
 		mntopt_off = MNTOPT_NORELATIME;
 		break;
 
 	case ZFS_PROP_DEVICES:
 		mntopt_on = MNTOPT_DEVICES;
 		mntopt_off = MNTOPT_NODEVICES;
 		break;
 
 	case ZFS_PROP_EXEC:
 		mntopt_on = MNTOPT_EXEC;
 		mntopt_off = MNTOPT_NOEXEC;
 		break;
 
 	case ZFS_PROP_READONLY:
 		mntopt_on = MNTOPT_RO;
 		mntopt_off = MNTOPT_RW;
 		break;
 
 	case ZFS_PROP_SETUID:
 		mntopt_on = MNTOPT_SETUID;
 		mntopt_off = MNTOPT_NOSETUID;
 		break;
 
 	case ZFS_PROP_XATTR:
 		mntopt_on = MNTOPT_XATTR;
 		mntopt_off = MNTOPT_NOXATTR;
 		break;
 
 	case ZFS_PROP_NBMAND:
 		mntopt_on = MNTOPT_NBMAND;
 		mntopt_off = MNTOPT_NONBMAND;
 		break;
 
 	default:
 		break;
 	}
 
 	/*
 	 * Because looking up the mount options is potentially expensive
 	 * (iterating over all of /proc/self/mounts), we defer its
 	 * calculation until we're looking up a property which requires
 	 * its presence.
 	 */
 	if (!zhp->zfs_mntcheck &&
 	    (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) {
 		libzfs_handle_t *hdl = zhp->zfs_hdl;
 		struct mnttab entry;
 
 		if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)
 			zhp->zfs_mntopts = zfs_strdup(hdl,
 			    entry.mnt_mntopts);
 
 		zhp->zfs_mntcheck = B_TRUE;
 	}
 
 	if (zhp->zfs_mntopts == NULL)
 		mnt.mnt_mntopts = (char *)"";
 	else
 		mnt.mnt_mntopts = zhp->zfs_mntopts;
 
 	switch (prop) {
 	case ZFS_PROP_ATIME:
 	case ZFS_PROP_RELATIME:
 	case ZFS_PROP_DEVICES:
 	case ZFS_PROP_EXEC:
 	case ZFS_PROP_READONLY:
 	case ZFS_PROP_SETUID:
 #ifndef __FreeBSD__
 	case ZFS_PROP_XATTR:
 #endif
 	case ZFS_PROP_NBMAND:
 		*val = getprop_uint64(zhp, prop, source);
 
 		/* Mount options never override received values. */
 		if (received)
 			break;
 
 		if (hasmntopt(&mnt, mntopt_on) && !*val) {
 			*val = B_TRUE;
 			if (src)
 				*src = ZPROP_SRC_TEMPORARY;
 		} else if (hasmntopt(&mnt, mntopt_off) && *val) {
 			*val = B_FALSE;
 			if (src)
 				*src = ZPROP_SRC_TEMPORARY;
 		}
 		break;
 
 	case ZFS_PROP_CANMOUNT:
 	case ZFS_PROP_VOLSIZE:
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
 	case ZFS_PROP_REFRESERVATION:
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 	case ZFS_PROP_FILESYSTEM_COUNT:
 	case ZFS_PROP_SNAPSHOT_COUNT:
 		*val = getprop_uint64(zhp, prop, source);
 
 		if (*source == NULL) {
 			/* not default, must be local */
 			*source = zhp->zfs_name;
 		}
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		*val = (zhp->zfs_mntopts != NULL);
 		break;
 
 	case ZFS_PROP_NUMCLONES:
 		*val = zhp->zfs_dmustats.dds_num_clones;
 		break;
 
 	case ZFS_PROP_VERSION:
 	case ZFS_PROP_NORMALIZE:
 	case ZFS_PROP_UTF8ONLY:
 	case ZFS_PROP_CASE:
 	case ZFS_PROP_DEFAULTUSERQUOTA:
 	case ZFS_PROP_DEFAULTGROUPQUOTA:
 	case ZFS_PROP_DEFAULTPROJECTQUOTA:
 	case ZFS_PROP_DEFAULTUSEROBJQUOTA:
 	case ZFS_PROP_DEFAULTGROUPOBJQUOTA:
 	case ZFS_PROP_DEFAULTPROJECTOBJQUOTA:
 		/* These are fetched with ZFS_IOC_OBJSET_ZPLPROPS. */
 		zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0);
 
 		(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 		if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) {
 			zcmd_free_nvlists(&zc);
 			if (prop == ZFS_PROP_VERSION &&
 			    zhp->zfs_type == ZFS_TYPE_VOLUME)
 				*val = zfs_prop_default_numeric(prop);
 			return (-1);
 		}
 		if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 ||
 		    nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop),
 		    val) != 0) {
 			/*
 			 * When only the lookup failed zplprops has already
 			 * been unpacked and must be released; zplprops is
 			 * still NULL otherwise, which nvlist_free() accepts.
 			 */
 			nvlist_free(zplprops);
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
 		nvlist_free(zplprops);
 		zcmd_free_nvlists(&zc);
 		break;
 
 	case ZFS_PROP_INCONSISTENT:
 		*val = zhp->zfs_dmustats.dds_inconsistent;
 		break;
 
 	case ZFS_PROP_REDACTED:
 		*val = zhp->zfs_dmustats.dds_redacted;
 		break;
 
 	case ZFS_PROP_GUID:
 		if (zhp->zfs_dmustats.dds_guid != 0)
 			*val = zhp->zfs_dmustats.dds_guid;
 		else
 			*val = getprop_uint64(zhp, prop, source);
 		break;
 
 	case ZFS_PROP_CREATETXG:
 		/*
 		 * We can directly read createtxg property from zfs
 		 * handle for Filesystem, Snapshot and ZVOL types.
 		 */
 		if (((zhp->zfs_type == ZFS_TYPE_FILESYSTEM) ||
 		    (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) ||
 		    (zhp->zfs_type == ZFS_TYPE_VOLUME)) &&
 		    (zhp->zfs_dmustats.dds_creation_txg != 0)) {
 			*val = zhp->zfs_dmustats.dds_creation_txg;
 			break;
 		} else {
 			*val = getprop_uint64(zhp, prop, source);
 		}
 		zfs_fallthrough;
 	default:
 		switch (zfs_prop_get_type(prop)) {
 		case PROP_TYPE_NUMBER:
 		case PROP_TYPE_INDEX:
 			*val = getprop_uint64(zhp, prop, source);
 			/*
 			 * If we tried to use a default value for a
 			 * readonly property, it means that it was not
 			 * present.  Note this only applies to "truly"
 			 * readonly properties, not set-once properties
 			 * like volblocksize.
 			 */
 			if (zfs_prop_readonly(prop) &&
 			    !zfs_prop_setonce(prop) &&
 			    *source != NULL && (*source)[0] == '\0') {
 				*source = NULL;
 				return (-1);
 			}
 			break;
 
 		case PROP_TYPE_STRING:
 		default:
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "cannot get non-numeric property"));
 			return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP,
 			    dgettext(TEXT_DOMAIN, "internal error")));
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Derive the source type from the raw source string.
  *
  * Nothing is done when either statbuf or srctype is NULL, or when *srctype
  * has already been marked ZPROP_SRC_TEMPORARY.  Otherwise *srctype is set
  * from 'source'; for an inherited property the name of the dataset it is
  * inherited from is copied into statbuf (at most statlen bytes).
  */
 static void
 get_source(zfs_handle_t *zhp, zprop_source_t *srctype, const char *source,
     char *statbuf, size_t statlen)
 {
 	if (statbuf == NULL || srctype == NULL)
 		return;
 	if (*srctype == ZPROP_SRC_TEMPORARY)
 		return;
 
 	if (source == NULL) {
 		*srctype = ZPROP_SRC_NONE;
 		return;
 	}
 	if (*source == '\0') {
 		*srctype = ZPROP_SRC_DEFAULT;
 		return;
 	}
 	if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) {
 		*srctype = ZPROP_SRC_RECEIVED;
 		return;
 	}
 	if (strcmp(source, zhp->zfs_name) == 0) {
 		*srctype = ZPROP_SRC_LOCAL;
 		return;
 	}
 
 	/* Anything else names the dataset the value is inherited from. */
 	(void) strlcpy(statbuf, source, statlen);
 	*srctype = ZPROP_SRC_INHERITED;
 }
 
 /*
  * Fetch the received value of the property called 'propname' into propbuf.
  * The received-property list is loaded on first use.  Native properties are
  * formatted by zfs_prop_get() while the handle is temporarily switched to
  * received-properties mode; user properties are copied out directly.
  *
  * Returns 0 on success, or -1 if the received properties cannot be loaded,
  * the property has no received value, or formatting fails.
  */
 int
 zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf,
     size_t proplen, boolean_t literal)
 {
 	zfs_prop_t prop;
 	uintptr_t saved_props;
 	int rc;
 
 	if (zhp->zfs_recvd_props == NULL &&
 	    get_recvd_props_ioctl(zhp) != 0)
 		return (-1);
 
 	prop = zfs_name_to_prop(propname);
 
 	if (prop == ZPROP_USERPROP) {
 		nvlist_t *entry;
 
 		if (nvlist_lookup_nvlist(zhp->zfs_recvd_props,
 		    propname, &entry) != 0)
 			return (-1);
 		(void) strlcpy(propbuf,
 		    fnvlist_lookup_string(entry, ZPROP_VALUE), proplen);
 		return (0);
 	}
 
 	if (!nvlist_exists(zhp->zfs_recvd_props, propname))
 		return (-1);
 
 	zfs_set_recvd_props_mode(zhp, &saved_props);
 	rc = zfs_prop_get(zhp, prop, propbuf, proplen,
 	    NULL, NULL, 0, literal);
 	zfs_unset_recvd_props_mode(zhp, &saved_props);
 
 	return (rc == 0 ? 0 : -1);
 }
 
 /*
  * Write the names of the dataset's clones into propbuf as a
  * comma-separated list.  Returns -1 if the clone list is unavailable or
  * empty, 0 otherwise.
  */
 static int
 get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
 {
 	nvlist_t *clones = zfs_get_clones_nvl(zhp);
 	nvpair_t *elem = NULL;
 
 	if (clones == NULL || nvlist_empty(clones))
 		return (-1);
 
 	*propbuf = '\0';
 	while ((elem = nvlist_next_nvpair(clones, elem)) != NULL) {
 		/* Separate entries once something has been written. */
 		if (*propbuf != '\0')
 			(void) strlcat(propbuf, ",", proplen);
 		(void) strlcat(propbuf, nvpair_name(elem), proplen);
 	}
 
 	return (0);
 }
 
 /*
  * State shared between zfs_get_clones_nvl() and its get_clones_cb()
  * iteration callback.
  */
 struct get_clones_arg {
 	/* clones still to be found; decremented for every match */
 	uint64_t numclones;
 	/* nvlist that receives the name of each clone found */
 	nvlist_t *value;
 	/* name compared against each visited dataset's 'origin' */
 	const char *origin;
 	/* scratch buffer for the visited dataset's 'origin' property */
 	char buf[ZFS_MAX_DATASET_NAME_LEN];
 };
 
 /*
  * Iteration callback: record zhp in gca->value if its origin matches
  * gca->origin, then descend into its children.  Once all expected clones
  * have been found (numclones reaches zero) further datasets are skipped.
  * The handle is always closed and 0 is always returned.
  */
 static int
 get_clones_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct get_clones_arg *gca = arg;
 
 	if (gca->numclones != 0) {
 		if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf,
 		    sizeof (gca->buf), NULL, NULL, 0, B_TRUE) == 0 &&
 		    strcmp(gca->buf, gca->origin) == 0) {
 			fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
 			gca->numclones--;
 		}
 
 		(void) zfs_iter_children_v2(zhp, 0, get_clones_cb, gca);
 	}
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Return the nvlist holding the names of the dataset's clones.
  *
  * If the handle's property list already contains the 'clones' entry its
  * value is returned directly.  Otherwise, for snapshots only, the list is
  * built by walking every dataset of the pool and comparing origins; the
  * result is stored in the handle's property list before being returned.
  * Returns NULL on any failure, or if not all clones could be found.
  */
 nvlist_t *
 zfs_get_clones_nvl(zfs_handle_t *zhp)
 {
 	const char *clones_prop = zfs_prop_to_name(ZFS_PROP_CLONES);
 	struct get_clones_arg gca;
 	nvlist_t *nv, *value;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props, clones_prop, &nv) == 0)
 		return (fnvlist_lookup_nvlist(nv, ZPROP_VALUE));
 
 	/*
 	 * if this is a snapshot, then the kernel wasn't able
 	 * to get the clones.  Do it by slowly iterating.
 	 */
 	if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT)
 		return (NULL);
 	if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
 		return (NULL);
 	if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) {
 		nvlist_free(nv);
 		return (NULL);
 	}
 
 	gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES);
 	gca.value = value;
 	gca.origin = zhp->zfs_name;
 
 	if (gca.numclones != 0) {
 		char pool[ZFS_MAX_DATASET_NAME_LEN];
 		char *cursor = pool;
 		zfs_handle_t *root;
 
 		/* get the pool name */
 		(void) strlcpy(pool, zhp->zfs_name, sizeof (pool));
 		(void) strsep(&cursor, "/@");
 		root = zfs_open(zhp->zfs_hdl, pool, ZFS_TYPE_FILESYSTEM);
 		if (root == NULL)
 			goto fail;
 
 		(void) get_clones_cb(root, &gca);
 	}
 
 	if (gca.numclones != 0 ||
 	    nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 ||
 	    nvlist_add_nvlist(zhp->zfs_props, clones_prop, nv) != 0)
 		goto fail;
 
 	/* The lists were copied on insertion; hand back the stored copy. */
 	nvlist_free(nv);
 	nvlist_free(value);
 	nv = fnvlist_lookup_nvlist(zhp->zfs_props, clones_prop);
 	return (fnvlist_lookup_nvlist(nv, ZPROP_VALUE));
 
 fail:
 	nvlist_free(nv);
 	nvlist_free(value);
 	return (NULL);
 }
 
 /*
  * Format the dataset's redaction-snapshot list into propbuf as a
  * comma-separated list of decimal numbers, or "none" if the list is empty.
  * Returns -1 if the property or its value array is not present in the
  * handle's property list, 0 otherwise.
  */
 static int
 get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
 {
 	nvlist_t *value;
 	uint64_t *snaps;
 	uint_t nsnaps;
 
 	if (nvlist_lookup_nvlist(zhp->zfs_props,
 	    zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0)
 		return (-1);
 	if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps,
 	    &nsnaps) != 0)
 		return (-1);
 	if (nsnaps == 0) {
 		/* There's no redaction snapshots; pass a special value back */
 		(void) snprintf(propbuf, proplen, "none");
 		return (0);
 	}
 	propbuf[0] = '\0';
 	/* The index has the same unsigned type as the element count. */
 	for (uint_t i = 0; i < nsnaps; i++) {
 		char buf[128];
 		if (propbuf[0] != '\0')
 			(void) strlcat(propbuf, ",", proplen);
 		(void) snprintf(buf, sizeof (buf), "%llu",
 		    (u_longlong_t)snaps[i]);
 		(void) strlcat(propbuf, buf, proplen);
 	}
 
 	return (0);
 }
 
 /*
  * Accepts a property and value and checks that the value
  * matches the one found by the channel program. If they are
  * not equal, print both of them.
  *
  * This is a debugging aid: it does nothing unless libzfs_prop_debug is set
  * on the library handle.  For properties of type PROP_TYPE_NUMBER 'intval'
  * is compared; for every other type 'strval' is compared.  All diagnostics
  * go to stderr and nothing is reported back to the caller.
  */
 static void
 zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval,
     const char *strval)
 {
 	if (!zhp->zfs_hdl->libzfs_prop_debug)
 		return;
 	int error;
 	char *poolname = zhp->zpool_hdl->zpool_name;
 	const char *prop_name = zfs_prop_to_name(prop);
 	const char *program =
 	    "args = ...\n"
 	    "ds = args['dataset']\n"
 	    "prop = args['property']\n"
 	    "value, setpoint = zfs.get_prop(ds, prop)\n"
 	    "return {value=value, setpoint=setpoint}\n";
 	nvlist_t *outnvl = NULL;
 	nvlist_t *retnvl;
 	nvlist_t *argnvl = fnvlist_alloc();
 
 	fnvlist_add_string(argnvl, "dataset", zhp->zfs_name);
 	fnvlist_add_string(argnvl, "property", prop_name);
 
 	error = lzc_channel_program_nosync(poolname, program,
 	    10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl);
 
 	if (error != 0) {
 		(void) fprintf(stderr, "%s: zcp check failed, channel program "
 		    "error: %u\n", prop_name, error);
 		goto out;
 	}
 
 	retnvl = fnvlist_lookup_nvlist(outnvl, "return");
 	if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) {
 		int64_t ans;
 
 		error = nvlist_lookup_int64(retnvl, "value", &ans);
 		if (error != 0) {
 			(void) fprintf(stderr, "%s: zcp check error: "
 			    "%u\n", prop_name, error);
 			goto out;
 		}
 		if ((uint64_t)ans != intval) {
 			(void) fprintf(stderr, "%s: zfs found %llu, "
 			    "but zcp found %llu\n", prop_name,
 			    (u_longlong_t)intval, (u_longlong_t)ans);
 		}
 	} else {
 		const char *str_ans;
 
 		error = nvlist_lookup_string(retnvl, "value", &str_ans);
 		if (error != 0) {
 			(void) fprintf(stderr, "%s: zcp check error: "
 			    "%u\n", prop_name, error);
 			goto out;
 		}
 		if (strcmp(strval, str_ans) != 0) {
 			(void) fprintf(stderr,
 			    "%s: zfs found '%s', but zcp found '%s'\n",
 			    prop_name, strval, str_ans);
 		}
 	}
 
 out:
 	/* Single exit so both lists are released on every path. */
 	nvlist_free(argnvl);
 	nvlist_free(outnvl);
 }
 
 /*
  * Retrieve a property from the given object.  If 'literal' is specified, then
  * numbers are left as exact values.  Otherwise, numbers are converted to a
  * human-readable form.
  *
  *   zhp      dataset handle to query
  *   prop     property to retrieve
  *   propbuf  out: formatted value, at most proplen bytes
  *   src      optional (may be NULL); out: where the value comes from
  *   statbuf  optional; out: for inherited values, the name of the dataset
  *            the value is inherited from (at most statlen bytes).  When
  *            statbuf is NULL the source type is not computed from the raw
  *            source string either.
  *   literal  see above
  *
  * Returns 0 on success, or -1 on error.  In particular -1 is returned when
  * the property does not apply to this dataset type, and for read-only
  * properties while the handle is in received-properties mode.
  */
 int
 zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
     zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal)
 {
 	const char *source = NULL;
 	uint64_t val;
 	const char *str;
 	const char *strval;
 	boolean_t received = zfs_is_recvd_props_mode(zhp);
 
 	/*
 	 * Check to see if this property applies to our object
 	 */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE))
 		return (-1);
 
 	/* Read-only properties are not reported in received mode. */
 	if (received && zfs_prop_readonly(prop))
 		return (-1);
 
 	/* Start from "no source"; refined below and by get_source(). */
 	if (src)
 		*src = ZPROP_SRC_NONE;
 
 	switch (prop) {
 	case ZFS_PROP_CREATION:
 		/*
 		 * 'creation' is a time_t stored in the statistics.  We convert
 		 * this into a string unless 'literal' is specified.
 		 */
 		{
 			val = getprop_uint64(zhp, prop, &source);
 			time_t time = (time_t)val;
 			struct tm t;
 
 			/* Fall back to the raw number if conversion fails. */
 			if (literal ||
 			    localtime_r(&time, &t) == NULL ||
 			    strftime(propbuf, proplen, "%a %b %e %k:%M %Y",
 			    &t) == 0)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 		}
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	case ZFS_PROP_MOUNTPOINT:
 		/*
 		 * Getting the precise mountpoint can be tricky.
 		 *
 		 *  - for 'none' or 'legacy', return those values.
 		 *  - for inherited mountpoints, we want to take everything
 		 *    after our ancestor and append it to the inherited value.
 		 *
 		 * If the pool has an alternate root, we want to prepend that
 		 * root to any values we return.
 		 */
 
 		str = getprop_string(zhp, prop, &source);
 
 		if (str[0] == '/') {
 			char buf[MAXPATHLEN];
 			char *root = buf;
 			const char *relpath;
 
 			/*
 			 * If we inherit the mountpoint, even from a dataset
 			 * with a received value, the source will be the path of
 			 * the dataset we inherit from. If source is
 			 * ZPROP_SOURCE_VAL_RECVD, the received value is not
 			 * inherited.
 			 */
 			if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
 				relpath = "";
 			} else {
 				/*
 				 * The part of our name below the dataset
 				 * that the value comes from.
 				 */
 				relpath = zhp->zfs_name + strlen(source);
 				if (relpath[0] == '/')
 					relpath++;
 			}
 
 			/* No usable altroot: treat the root as empty. */
 			if ((zpool_get_prop(zhp->zpool_hdl,
 			    ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL,
 			    B_FALSE)) || (strcmp(root, "-") == 0))
 				root[0] = '\0';
 			/*
 			 * Special case an alternate root of '/'. This will
 			 * avoid having multiple leading slashes in the
 			 * mountpoint path.
 			 */
 			if (strcmp(root, "/") == 0)
 				root++;
 
 			/*
 			 * If the mountpoint is '/' then skip over this
 			 * if we are obtaining either an alternate root or
 			 * an inherited mountpoint.
 			 */
 			if (str[1] == '\0' && (root[0] != '\0' ||
 			    relpath[0] != '\0'))
 				str++;
 
 			if (relpath[0] == '\0')
 				(void) snprintf(propbuf, proplen, "%s%s",
 				    root, str);
 			else
 				(void) snprintf(propbuf, proplen, "%s%s%s%s",
 				    root, str, relpath[0] == '@' ? "" : "/",
 				    relpath);
 		} else {
 			/* 'legacy' or 'none' */
 			(void) strlcpy(propbuf, str, proplen);
 		}
 		zcp_check(zhp, prop, 0, propbuf);
 		break;
 
 	case ZFS_PROP_ORIGIN:
 		/*
 		 * Prefer the origin recorded in the cached dataset stats;
 		 * otherwise look the property up and, if that is empty too,
 		 * fall back to the property's default string.
 		 */
 		if (*zhp->zfs_dmustats.dds_origin != '\0') {
 			str = (char *)&zhp->zfs_dmustats.dds_origin;
 		} else {
 			str = getprop_string(zhp, prop, &source);
 		}
 		if (str == NULL || *str == '\0')
 			str = zfs_prop_default_string(prop);
 		if (str == NULL)
 			return (-1);
 		(void) strlcpy(propbuf, str, proplen);
 		zcp_check(zhp, prop, 0, str);
 		break;
 
 	case ZFS_PROP_REDACT_SNAPS:
 		if (get_rsnaps_string(zhp, propbuf, proplen) != 0)
 			return (-1);
 		break;
 
 	case ZFS_PROP_CLONES:
 		if (get_clones_string(zhp, propbuf, proplen) != 0)
 			return (-1);
 		break;
 
 	case ZFS_PROP_QUOTA:
 	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
 	case ZFS_PROP_REFRESERVATION:
 
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		/*
 		 * If quota or reservation is 0, we translate this into 'none'
 		 * (unless literal is set), and indicate that it's the default
 		 * value.  Otherwise, we print the number nicely and indicate
 		 * that it's set locally.
 		 */
 		if (val == 0) {
 			if (literal)
 				(void) strlcpy(propbuf, "0", proplen);
 			else
 				(void) strlcpy(propbuf, "none", proplen);
 		} else {
 			if (literal)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 			else
 				zfs_nicebytes(val, propbuf, proplen);
 		}
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	case ZFS_PROP_FILESYSTEM_LIMIT:
 	case ZFS_PROP_SNAPSHOT_LIMIT:
 	case ZFS_PROP_FILESYSTEM_COUNT:
 	case ZFS_PROP_SNAPSHOT_COUNT:
 
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 
 		/*
 		 * If limit is UINT64_MAX, we translate this into 'none', and
 		 * indicate that it's the default value. Otherwise, we print
 		 * the number nicely and indicate that it's set locally.
 		 * Note that 'none' is used even when 'literal' is set.
 		 */
 		if (val == UINT64_MAX) {
 			(void) strlcpy(propbuf, "none", proplen);
 		} else if (literal) {
 			(void) snprintf(propbuf, proplen, "%llu",
 			    (u_longlong_t)val);
 		} else {
 			zfs_nicenum(val, propbuf, proplen);
 		}
 
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	case ZFS_PROP_REFRATIO:
 	case ZFS_PROP_COMPRESSRATIO:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		/*
 		 * The value is printed as val / 100 with two decimals; the
 		 * non-literal form carries a trailing 'x'.
 		 */
 		if (literal)
 			(void) snprintf(propbuf, proplen, "%llu.%02llu",
 			    (u_longlong_t)(val / 100),
 			    (u_longlong_t)(val % 100));
 		else
 			(void) snprintf(propbuf, proplen, "%llu.%02llux",
 			    (u_longlong_t)(val / 100),
 			    (u_longlong_t)(val % 100));
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	case ZFS_PROP_TYPE:
 		switch (zhp->zfs_type) {
 		case ZFS_TYPE_FILESYSTEM:
 			str = "filesystem";
 			break;
 		case ZFS_TYPE_VOLUME:
 			str = "volume";
 			break;
 		case ZFS_TYPE_SNAPSHOT:
 			str = "snapshot";
 			break;
 		case ZFS_TYPE_BOOKMARK:
 			str = "bookmark";
 			break;
 		default:
 			abort();
 		}
 		(void) snprintf(propbuf, proplen, "%s", str);
 		zcp_check(zhp, prop, 0, propbuf);
 		break;
 
 	case ZFS_PROP_MOUNTED:
 		/*
 		 * The 'mounted' property is a pseudo-property that describes
 		 * whether the filesystem is currently mounted.  Even though
 		 * it's a boolean value, the typical values of "on" and "off"
 		 * don't make sense, so we translate to "yes" and "no".
 		 */
 		if (get_numeric_property(zhp, ZFS_PROP_MOUNTED,
 		    src, &source, &val) != 0)
 			return (-1);
 		if (val)
 			(void) strlcpy(propbuf, "yes", proplen);
 		else
 			(void) strlcpy(propbuf, "no", proplen);
 		break;
 
 	case ZFS_PROP_NAME:
 		/*
 		 * The 'name' property is a pseudo-property derived from the
 		 * dataset name.  It is presented as a real property to simplify
 		 * consumers.
 		 */
 		(void) strlcpy(propbuf, zhp->zfs_name, proplen);
 		zcp_check(zhp, prop, 0, propbuf);
 		break;
 
 	case ZFS_PROP_MLSLABEL:
 		{
 #ifdef HAVE_MLSLABEL
 			m_label_t *new_sl = NULL;
 			char *ascii = NULL;	/* human readable label */
 
 			(void) strlcpy(propbuf,
 			    getprop_string(zhp, prop, &source), proplen);
 
 			if (literal || (strcasecmp(propbuf,
 			    ZFS_MLSLABEL_DEFAULT) == 0))
 				break;
 
 			/*
 			 * Try to translate the internal hex string to
 			 * human-readable output.  If there are any
 			 * problems just use the hex string.
 			 */
 
 			if (str_to_label(propbuf, &new_sl, MAC_LABEL,
 			    L_NO_CORRECTION, NULL) == -1) {
 				m_label_free(new_sl);
 				break;
 			}
 
 			if (label_to_str(new_sl, &ascii, M_LABEL,
 			    DEF_NAMES) != 0) {
 				if (ascii)
 					free(ascii);
 				m_label_free(new_sl);
 				break;
 			}
 			m_label_free(new_sl);
 
 			(void) strlcpy(propbuf, ascii, proplen);
 			free(ascii);
 #else
 			/* Without label support the raw string is returned. */
 			(void) strlcpy(propbuf,
 			    getprop_string(zhp, prop, &source), proplen);
 #endif /* HAVE_MLSLABEL */
 		}
 		break;
 
 	case ZFS_PROP_GUID:
 	case ZFS_PROP_KEY_GUID:
 	case ZFS_PROP_IVSET_GUID:
 	case ZFS_PROP_CREATETXG:
 	case ZFS_PROP_OBJSETID:
 	case ZFS_PROP_PBKDF2_ITERS:
 		/*
 		 * These properties are stored as numbers, but they are
 		 * identifiers or counters.
 		 * We don't want them to be pretty printed, because pretty
 		 * printing truncates their values making them useless.
 		 */
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		(void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val);
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	case ZFS_PROP_REFERENCED:
 	case ZFS_PROP_AVAILABLE:
 	case ZFS_PROP_USED:
 	case ZFS_PROP_USEDSNAP:
 	case ZFS_PROP_USEDDS:
 	case ZFS_PROP_USEDREFRESERV:
 	case ZFS_PROP_USEDCHILD:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
 		if (literal) {
 			(void) snprintf(propbuf, proplen, "%llu",
 			    (u_longlong_t)val);
 		} else {
 			zfs_nicebytes(val, propbuf, proplen);
 		}
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	case ZFS_PROP_SNAPSHOTS_CHANGED:
 		{
 			/* A value of 0 is reported as an error. */
 			if ((get_numeric_property(zhp, prop, src, &source,
 			    &val) != 0) || val == 0) {
 				return (-1);
 			}
 
 			time_t time = (time_t)val;
 			struct tm t;
 
 			/* Fall back to the raw number if conversion fails. */
 			if (literal ||
 			    localtime_r(&time, &t) == NULL ||
 			    strftime(propbuf, proplen, "%a %b %e %k:%M:%S %Y",
 			    &t) == 0)
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 		}
 		zcp_check(zhp, prop, val, NULL);
 		break;
 
 	default:
 		/* Everything else is formatted according to its type. */
 		switch (zfs_prop_get_type(prop)) {
 		case PROP_TYPE_NUMBER:
 			if (get_numeric_property(zhp, prop, src,
 			    &source, &val) != 0) {
 				return (-1);
 			}
 
 			if (literal) {
 				(void) snprintf(propbuf, proplen, "%llu",
 				    (u_longlong_t)val);
 			} else {
 				zfs_nicenum(val, propbuf, proplen);
 			}
 			zcp_check(zhp, prop, val, NULL);
 			break;
 
 		case PROP_TYPE_STRING:
 			str = getprop_string(zhp, prop, &source);
 			if (str == NULL)
 				return (-1);
 
 			(void) strlcpy(propbuf, str, proplen);
 			zcp_check(zhp, prop, 0, str);
 			break;
 
 		case PROP_TYPE_INDEX:
 			/* Index properties are reported by name. */
 			if (get_numeric_property(zhp, prop, src,
 			    &source, &val) != 0)
 				return (-1);
 			if (zfs_prop_index_to_string(prop, val, &strval) != 0)
 				return (-1);
 
 			(void) strlcpy(propbuf, strval, proplen);
 			zcp_check(zhp, prop, 0, strval);
 			break;
 
 		default:
 			abort();
 		}
 	}
 
 	/* Translate the raw source string into *src and statbuf. */
 	get_source(zhp, src, source, statbuf, statlen);
 
 	return (0);
 }
 
 /*
  * Convenience wrapper returning the value of a numeric property.  The
  * property type is not validated, so this is only meant for hard-coded
  * properties known to be numeric.  Failures are not reported; the value
  * written by get_numeric_property() (initially 0) is returned as is.
  */
 uint64_t
 zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop)
 {
 	uint64_t num = 0;
 	const char *ignored;
 
 	(void) get_numeric_property(zhp, prop, NULL, &ignored, &num);
 
 	return (num);
 }
 
 /*
  * Set a property to a numeric value by formatting it as an unsigned
  * decimal string and handing it to zfs_prop_set().  Returns whatever
  * zfs_prop_set() returns.
  */
 static int
 zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val)
 {
 	char buf[64];
 
 	/* The cast must be unsigned to match the "%llu" conversion. */
 	(void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)val);
 	return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf));
 }
 
 /*
  * Numeric counterpart of zfs_prop_get(): stores the property's value in
  * *value and reports its source through src and statbuf.
  *
  * An EZFS_PROPTYPE error is raised if the property does not apply to this
  * dataset type; -1 is returned if the value cannot be obtained.
  */
 int
 zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value,
     zprop_source_t *src, char *statbuf, size_t statlen)
 {
 	const char *rawsrc;
 
 	/* The property has to be applicable to this kind of dataset. */
 	if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) {
 		return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE,
 		    dgettext(TEXT_DOMAIN, "cannot get property '%s'"),
 		    zfs_prop_to_name(prop)));
 	}
 
 	if (src != NULL)
 		*src = ZPROP_SRC_NONE;
 
 	if (get_numeric_property(zhp, prop, src, &rawsrc, value) == 0) {
 		get_source(zhp, src, rawsrc, statbuf, statlen);
 		return (0);
 	}
 
 	return (-1);
 }
 
 #ifdef HAVE_IDMAP
 /*
  * Ask the idmap service for the SID (domain and RID) that corresponds to
  * a user ID (isuser) or group ID (!isuser).  Returns 0 on success and
  * EINVAL on any failure.
  */
 static int
 idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
     char **domainp, idmap_rid_t *ridp)
 {
 	idmap_get_handle_t *hdl = NULL;
 	idmap_stat status;
 	int rc = EINVAL;
 
 	if (idmap_get_create(&hdl) == IDMAP_SUCCESS) {
 		int queued = isuser ?
 		    idmap_get_sidbyuid(hdl, id,
 		    IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status) :
 		    idmap_get_sidbygid(hdl, id,
 		    IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status);
 
 		/* 'status' is only examined after the mappings are fetched. */
 		if (queued == IDMAP_SUCCESS &&
 		    idmap_get_mappings(hdl) == IDMAP_SUCCESS &&
 		    status == IDMAP_SUCCESS)
 			rc = 0;
 	}
 
 	if (hdl != NULL)
 		idmap_get_destroy(hdl);
 	return (rc);
 }
 #endif /* HAVE_IDMAP */
 
 /*
  * convert the propname into parameters needed by kernel
  * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829
  * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789
  * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234
  * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234
  * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123
  * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789
  *
  *   propname   property name, "<prefix>@<who>"
  *   zoned      value of the dataset's 'zoned' property
  *   typep      out: which userquota property the prefix selects
  *   domain     out: SID domain, or "" (at most domainlen bytes)
  *   ridp       out: numeric ID, or the RID within 'domain'
  *
  * Returns 0 on success, otherwise an errno value:
  *   EINVAL  unknown prefix, or a malformed / out-of-range numeric ID
  *   ENOENT  a name was given for a zoned dataset from the global zone,
  *           or the name could not be mapped to a SID
  *   ENOSYS  SID names / ephemeral IDs were requested but HAVE_IDMAP is
  *           not defined
  */
 static int
 userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp)
 {
 	zfs_userquota_prop_t type;
 	char *cp;
 	boolean_t isuser;
 	boolean_t isgroup;
 	boolean_t isproject;
 	struct passwd *pw;
 	struct group *gr;
 
 	domain[0] = '\0';
 
 	/* Figure out the property type ({user|group|project}{quota|space}) */
 	for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
 		if (strncmp(propname, zfs_userquota_prop_prefixes[type],
 		    strlen(zfs_userquota_prop_prefixes[type])) == 0)
 			break;
 	}
 	if (type == ZFS_NUM_USERQUOTA_PROPS)
 		return (EINVAL);
 	*typep = type;
 
 	isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED ||
 	    type == ZFS_PROP_USEROBJQUOTA ||
 	    type == ZFS_PROP_USEROBJUSED);
 	isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED ||
 	    type == ZFS_PROP_GROUPOBJQUOTA ||
 	    type == ZFS_PROP_GROUPOBJUSED);
 	isproject = (type == ZFS_PROP_PROJECTQUOTA ||
 	    type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA ||
 	    type == ZFS_PROP_PROJECTOBJUSED);
 
 	/* Everything after the first '@' identifies the user/group/project. */
 	cp = strchr(propname, '@') + 1;
 
 	if (isuser &&
 	    getpwnam_r(cp, &gpwd, rpbuf, sizeof (rpbuf), &pw) == 0 &&
 	    pw != NULL) {
 		if (zoned && getzoneid() == GLOBAL_ZONEID)
 			return (ENOENT);
 		*ridp = pw->pw_uid;
 	} else if (isgroup &&
 	    getgrnam_r(cp, &ggrp, rpbuf, sizeof (rpbuf), &gr) == 0 &&
 	    gr != NULL) {
 		if (zoned && getzoneid() == GLOBAL_ZONEID)
 			return (ENOENT);
 		*ridp = gr->gr_gid;
 	} else if (!isproject && strchr(cp, '@')) {
 #ifdef HAVE_IDMAP
 		/*
 		 * It's a SID name (eg "user@domain") that needs to be
 		 * turned into S-1-domainID-RID.
 		 */
 		directory_error_t e;
 		char *numericsid = NULL;
 		char *end;
 
 		if (zoned && getzoneid() == GLOBAL_ZONEID)
 			return (ENOENT);
 		if (isuser) {
 			e = directory_sid_from_user_name(NULL,
 			    cp, &numericsid);
 		} else {
 			e = directory_sid_from_group_name(NULL,
 			    cp, &numericsid);
 		}
 		if (e != NULL) {
 			directory_error_free(e);
 			return (ENOENT);
 		}
 		if (numericsid == NULL)
 			return (ENOENT);
 		cp = numericsid;
 		(void) strlcpy(domain, cp, domainlen);
 		cp = strrchr(domain, '-');
 		*cp = '\0';
 		cp++;
 
 		errno = 0;
 		*ridp = strtoull(cp, &end, 10);
 		free(numericsid);
 
 		if (errno != 0 || *end != '\0')
 			return (EINVAL);
 #else
 		(void) domainlen;
 		return (ENOSYS);
 #endif /* HAVE_IDMAP */
 	} else {
 		/* It's a user/group/project ID (eg "12345"). */
 		unsigned long rawid;
 		uid_t id;
 		char *end;
 
 		errno = 0;
 		rawid = strtoul(cp, &end, 10);
 		/*
 		 * Reject an empty string (which strtoul() would turn into
 		 * ID 0), trailing garbage, and values too large to parse.
 		 */
 		if (end == cp || *end != '\0' || errno == ERANGE)
 			return (EINVAL);
 		/* Reject values that would be truncated to another ID. */
 		id = (uid_t)rawid;
 		if ((unsigned long)id != rawid)
 			return (EINVAL);
 		if (id > MAXUID && !isproject) {
 #ifdef HAVE_IDMAP
 			/* It's an ephemeral ID. */
 			idmap_rid_t rid;
 			char *mapdomain;
 
 			if (idmap_id_to_numeric_domain_rid(id, isuser,
 			    &mapdomain, &rid) != 0)
 				return (ENOENT);
 			(void) strlcpy(domain, mapdomain, domainlen);
 			*ridp = rid;
 #else
 			return (ENOSYS);
 #endif /* HAVE_IDMAP */
 		} else {
 			*ridp = id;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Shared implementation of the userquota getters: decode 'propname', query
  * the kernel with ZFS_IOC_USERSPACE_ONE and store the result in *propvalue.
  * *typep receives the decoded property type when decoding succeeds.
  *
  * Returns 0 on success, otherwise the error from decoding the name or from
  * the ioctl.
  */
 static int
 zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue, zfs_userquota_prop_t *typep)
 {
 	int err;
 	zfs_cmd_t zc = {"\0"};
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	err = userquota_propname_decode(propname,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED),
 	    typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid);
 	if (err)
 		return (err);
 
 	/*
 	 * Only read *typep after a successful decode: on failure the
 	 * decoder may not have written it and callers pass it uninitialized.
 	 */
 	zc.zc_objset_type = *typep;
 
 	err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc);
 	if (err)
 		return (err);
 
 	*propvalue = zc.zc_cookie;
 	return (0);
 }
 
 /*
  * Fetch a userquota-style property as a number.  The decoded property type
  * is not needed here and is discarded.
  */
 int
 zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue)
 {
 	zfs_userquota_prop_t unused_type;
 	int rc;
 
 	rc = zfs_prop_get_userquota_common(zhp, propname, propvalue,
 	    &unused_type);
 	return (rc);
 }
 
 /*
  * Fetch a userquota-style property and format it into propbuf.
  *
  * With 'literal' the exact number is printed.  Otherwise a quota of 0 is
  * shown as "none", space quotas and usages are printed as byte sizes, and
  * everything else (object quotas and counts) as plain nice numbers.
  * Returns 0 on success or the error from looking the value up.
  */
 int
 zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal)
 {
 	zfs_userquota_prop_t type;
 	uint64_t propvalue;
 	boolean_t is_quota = B_FALSE;
 	boolean_t is_bytes = B_FALSE;
 	int err;
 
 	err = zfs_prop_get_userquota_common(zhp, propname, &propvalue,
 	    &type);
 	if (err)
 		return (err);
 
 	/* Classify the property once instead of testing each type twice. */
 	switch (type) {
 	case ZFS_PROP_USERQUOTA:
 	case ZFS_PROP_GROUPQUOTA:
 	case ZFS_PROP_PROJECTQUOTA:
 		is_quota = B_TRUE;
 		is_bytes = B_TRUE;
 		break;
 	case ZFS_PROP_USEROBJQUOTA:
 	case ZFS_PROP_GROUPOBJQUOTA:
 	case ZFS_PROP_PROJECTOBJQUOTA:
 		is_quota = B_TRUE;
 		break;
 	case ZFS_PROP_USERUSED:
 	case ZFS_PROP_GROUPUSED:
 	case ZFS_PROP_PROJECTUSED:
 		is_bytes = B_TRUE;
 		break;
 	default:
 		break;
 	}
 
 	if (literal) {
 		(void) snprintf(propbuf, proplen, "%llu",
 		    (u_longlong_t)propvalue);
 	} else if (propvalue == 0 && is_quota) {
 		(void) strlcpy(propbuf, "none", proplen);
 	} else if (is_bytes) {
 		zfs_nicebytes(propvalue, propbuf, proplen);
 	} else {
 		zfs_nicenum(propvalue, propbuf, proplen);
 	}
 	return (0);
 }
 
 /*
  * Query the space written since a snapshot or bookmark.
  *
  * propname must start with "written@" or "written#".  What follows is
  * either a complete snapshot/bookmark name or just the short name, in
  * which case it is resolved relative to zhp's filesystem.
  * Returns 0 and stores the value in *propvalue, or the ioctl error.
  */
 int
 zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
     uint64_t *propvalue)
 {
 	zfs_cmd_t zc = {"\0"};
 	const char *snapname;
 	int err;
 
 	assert(zfs_prop_written(propname));
 	snapname = propname + strlen("written@");
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (strpbrk(snapname, "@#") == NULL) {
 		/*
 		 * snapname is the short name: take zhp's name without any
 		 * snapshot part and append the delimiter plus short name.
 		 */
 		const char *delim_and_name = snapname - 1;
 		char *at;
 
 		(void) strlcpy(zc.zc_value, zhp->zfs_name,
 		    sizeof (zc.zc_value));
 		at = strchr(zc.zc_value, '@');
 		if (at != NULL)
 			*at = '\0';
 		(void) strlcat(zc.zc_value, delim_and_name,
 		    sizeof (zc.zc_value));
 	} else {
 		/* full snapshot or bookmark name specified */
 		(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
 	}
 
 	err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc);
 	if (err == 0)
 		*propvalue = zc.zc_cookie;
 
 	return (err);
 }
 
 /*
  * String version of zfs_prop_get_written_int(): the value is printed
  * exactly when 'literal' is set and as a human-readable byte size
  * otherwise.  Returns 0 on success or the lookup error.
  */
 int
 zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal)
 {
 	uint64_t written;
 	int err = zfs_prop_get_written_int(zhp, propname, &written);
 
 	if (err)
 		return (err);
 
 	if (!literal) {
 		zfs_nicebytes(written, propbuf, proplen);
 		return (0);
 	}
 
 	(void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)written);
 	return (0);
 }
 
 /*
  * Accessor: the dataset name stored in the handle.
  */
 const char *
 zfs_get_name(const zfs_handle_t *zhp)
 {
 	const char *name = zhp->zfs_name;
 
 	return (name);
 }
 
 /*
  * Accessor: the name of the pool that the handle's dataset belongs to.
  */
 const char *
 zfs_get_pool_name(const zfs_handle_t *zhp)
 {
 	const char *pool = zhp->zpool_hdl->zpool_name;
 
 	return (pool);
 }
 
 /*
  * Accessor: the dataset type recorded in the handle.
  */
 zfs_type_t
 zfs_get_type(const zfs_handle_t *zhp)
 {
 	zfs_type_t type = zhp->zfs_type;
 
 	return (type);
 }
 
 /*
  * Accessor: the type of the handle's dataset or, for a snapshot, the
  * type of the dataset that was snapshotted.
  */
 zfs_type_t
 zfs_get_underlying_type(const zfs_handle_t *zhp)
 {
 	zfs_type_t head_type = zhp->zfs_head_type;
 
 	return (head_type);
 }
 
 /*
  * Report whether dataset ds2 lies below dataset ds1 in the namespace,
  * i.e. whether ds2 is ds1 followed by '/' and further components.
  *
  * Dataset 1	"a/foo"		"a/foo"		"a/foo"		"a/foo"
  * Dataset 2	"a/fo"		"a/foobar"	"a/bar/baz"	"a/foo/bar"
  * Descendant?	No.		No.		No.		Yes.
  */
 static boolean_t
 is_descendant(const char *ds1, const char *ds2)
 {
 	size_t prefixlen = strlen(ds1);
 
 	/* A shorter name cannot contain ds1 as a prefix. */
 	if (strlen(ds2) < prefixlen)
 		return (B_FALSE);
 
 	/* The prefix has to end exactly at a component boundary. */
 	if (ds2[prefixlen] != '/')
 		return (B_FALSE);
 
 	return (strncmp(ds1, ds2, prefixlen) == 0);
 }
 
 /*
  * Given a complete name, copy the portion that refers to the parent (that
  * is, everything before the last '/') into buf, truncating the copy to
  * buflen - 1 bytes if necessary.
  *
  * path and buf may be the same buffer: check_parents() walks up the
  * hierarchy with parent_name(parent, parent, ...).  The copy is therefore
  * done with memmove(), which is well defined for overlapping regions,
  * instead of a string-copy routine whose behavior is undefined when source
  * and destination overlap.
  *
  * Returns 0 on success.  Returns -1 if there is no parent (path is just the
  * name of the pool) or if buf has no room at all (buflen == 0); in the
  * former case buf holds the (possibly truncated) copy of path.
  */
 static int
 parent_name(const char *path, char *buf, size_t buflen)
 {
 	char *slashp;
 	size_t len;
 
 	/* nothing can be stored, not even the terminator */
 	if (buflen == 0)
 		return (-1);
 
 	/* bounded, always-terminated, overlap-safe copy of path into buf */
 	len = strlen(path);
 	if (len >= buflen)
 		len = buflen - 1;
 	(void) memmove(buf, path, len);
 	buf[len] = '\0';
 
 	if ((slashp = strrchr(buf, '/')) == NULL)
 		return (-1);
 	*slashp = '\0';
 
 	return (0);
 }
 
 /*
  * Public wrapper around parent_name() that takes the name from the handle.
  * Returns whatever parent_name() returns: 0 with the parent's name in buf,
  * or -1 when the handle's name contains no '/'.
  */
 int
 zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen)
 {
 	const char *name = zfs_get_name(zhp);
 
 	return (parent_name(name, buf, buflen));
 }
 
 /*
  * Validate the ancestry of 'path' ahead of creating it.
  *
  * With accept_ancestor false, the immediate parent must exist.  With
  * accept_ancestor true, missing parents are tolerated and the closest
  * existing ancestor is used instead.  Either way the ancestor that is found
  * must be a filesystem and, when we run in a non-global zone, must be zoned.
  *
  * If 'zoned' is non-NULL it receives that ancestor's 'zoned' property, which
  * is used to validate property settings when creating new datasets.  If
  * 'prefixlen' is non-NULL it receives the length of the already existing
  * prefix of 'path'.
  *
  * Returns 0 on success.  Every failure path reports through zfs_error() or
  * zfs_standard_error() and returns their result or -1.
  */
 static int
 check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned,
     boolean_t accept_ancestor, int *prefixlen)
 {
 	zfs_cmd_t zc = {"\0"};
 	char ancestor[ZFS_MAX_DATASET_NAME_LEN];
 	char errbuf[ERRBUFLEN];
 	zfs_handle_t *zhp;
 	uint64_t is_zoned;
 	char *pool_end;
 	int rc = 0;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot create '%s'"), path);
 
 	/* a bare pool name has no parent to check */
 	if (parent_name(path, ancestor, sizeof (ancestor)) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "missing dataset name"));
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/* the pool is the first component; make sure it exists */
 	pool_end = strchr(ancestor, '/');
 	if (pool_end == NULL)
 		pool_end = ancestor + strlen(ancestor);
 	(void) strlcpy(zc.zc_name, ancestor,
 	    MIN(sizeof (zc.zc_name), pool_end - ancestor + 1));
 	if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 &&
 	    errno == ENOENT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such pool '%s'"), zc.zc_name);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
 
 	/* open the parent, climbing towards the pool if that is allowed */
 	for (;;) {
 		zhp = make_dataset_handle(hdl, ancestor);
 		if (zhp != NULL)
 			break;
 
 		if (errno != ENOENT)
 			return (zfs_standard_error(hdl, errno, errbuf));
 
 		if (!accept_ancestor) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "parent does not exist"));
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		}
 
 		/* step up one level; running out of levels is fatal */
 		if (parent_name(ancestor, ancestor, sizeof (ancestor)) != 0) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "no such pool '%s'"), zc.zc_name);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		}
 	}
 
 	is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 	if (zoned != NULL)
 		*zoned = is_zoned;
 
 	if (getzoneid() != GLOBAL_ZONEID && !is_zoned) {
 		/* we are in a non-global zone, the parent is in the global */
 		(void) zfs_standard_error(hdl, EPERM, errbuf);
 		rc = -1;
 	} else if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
 		/* only filesystems may have children */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "parent is not a filesystem"));
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		rc = -1;
 	}
 
 	zfs_close(zhp);
 	if (rc == 0 && prefixlen != NULL)
 		*prefixlen = strlen(ancestor);
 	return (rc);
 }
 
 /*
  * Report whether 'path' names an existing dataset whose type is one of the
  * type bits in 'types'.  A name that fails zfs_validate_name() for 'types',
  * or a dataset for which no handle can be made, yields B_FALSE.
  */
 boolean_t
 zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types)
 {
 	zfs_handle_t *zhp;
 	boolean_t matches;
 
 	if (!zfs_validate_name(hdl, path, types, B_FALSE))
 		return (B_FALSE);
 
 	/* being able to build a handle is what tells us it exists */
 	zhp = make_dataset_handle(hdl, path);
 	if (zhp == NULL)
 		return (B_FALSE);
 
 	matches = ((types & zhp->zfs_type) != 0) ? B_TRUE : B_FALSE;
 	zfs_close(zhp);
 
 	return (matches);
 }
 
 /*
  * Given a path to 'target', create all the ancestors between
  * the prefixlen portion of the path, and the target itself.
  * Fail if the initial prefixlen-ancestor does not already exist.
  *
  * 'target' is edited in place while walking: each '/' is temporarily
  * replaced by a terminator so the leading part can be used as a dataset
  * name.  On success the string is restored.  On failure it is left
  * truncated at the ancestor that failed, which is the name that appears in
  * the recorded error message.
  *
  * Each ancestor that does not exist yet is created, opened, mounted and
  * shared, in that order.  Returns 0 on success and -1 on failure.
  */
 int
 create_parents(libzfs_handle_t *hdl, char *target, int prefixlen)
 {
 	zfs_handle_t *h;
 	char *cp;
 	const char *opname;
 
 	/* make sure prefix exists */
 	cp = target + prefixlen;
 	if (*cp != '/') {
 		assert(strchr(cp, '/') == NULL);
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 	} else {
 		*cp = '\0';
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		*cp = '/';
 	}
 	if (h == NULL)
 		return (-1);
 	zfs_close(h);
 
 	/*
 	 * Start scanning just past the existing prefix.  If the prefix is
 	 * the whole string, stay on the terminator rather than stepping
 	 * beyond the end of the buffer.
 	 */
 	if (*cp != '\0')
 		cp++;
 
 	/*
 	 * Attempt to create, mount, and share any ancestor filesystems,
 	 * up to the prefixlen-long one.
 	 */
 	for (; (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {
 
 		*cp = '\0';
 
 		h = make_dataset_handle(hdl, target);
 		if (h != NULL) {
 			/* it already exists, nothing to do here */
 			zfs_close(h);
 			continue;
 		}
 
 		/* h is NULL from here until zfs_open() succeeds */
 		if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
 		    NULL) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "create");
 			goto ancestorerr;
 		}
 
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		if (h == NULL) {
 			opname = dgettext(TEXT_DOMAIN, "open");
 			goto ancestorerr;
 		}
 
 		if (zfs_mount(h, NULL, 0) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "mount");
 			goto ancestorerr;
 		}
 
 		if (zfs_share(h, NULL) != 0) {
 			opname = dgettext(TEXT_DOMAIN, "share");
 			goto ancestorerr;
 		}
 
 		zfs_close(h);
 	}
 	zfs_commit_shares(NULL);
 
 	return (0);
 
 ancestorerr:
 	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 	    "failed to %s ancestor '%s'"), opname, target);
 	/* a mount or share failure arrives here with the handle still open */
 	if (h != NULL)
 		zfs_close(h);
 	return (-1);
 }
 
 /*
  * Create whichever ancestors of 'path' do not exist yet.  'path' itself is
  * not created here.
  *
  * The nesting limit is checked before anything is created.  Returns 0 on
  * success; on failure returns the result of zfs_error() or -1.
  */
 int
 zfs_create_ancestors(libzfs_handle_t *hdl, const char *path)
 {
 	char errbuf[ERRBUFLEN];
 	char *scratch;
 	int existing_len;
 	int error;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
 
 	/* refuse over-deep names up front, before touching the pool */
 	if (dataset_nestcheck(path) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "maximum name nesting depth exceeded"));
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/* find how much of the path is already there */
 	if (check_parents(hdl, path, NULL, B_TRUE, &existing_len) != 0)
 		return (-1);
 
 	/* create_parents() edits the name in place, so hand it a copy */
 	scratch = strdup(path);
 	if (scratch == NULL)
 		return (-1);
 
 	error = create_parents(hdl, scratch, existing_len);
 	free(scratch);
 
 	return (error != 0 ? -1 : 0);
 }
 
 /*
  * Create a new filesystem or volume.
  *
  * hdl	library handle on which errors are recorded
  * path	full name of the dataset to create
  * type	ZFS_TYPE_VOLUME selects a zvol, anything else a filesystem
  * props	optional properties; when non-NULL they are run through
  *		zfs_valid_proplist() and the list it returns is the one
  *		used (and freed) from then on
  *
  * The name, its nesting depth and its parent are validated first, and an
  * already existing dataset is rejected.  For volumes, the volume size must
  * be present, non-zero and a multiple of the volume block size.
  *
  * Returns 0 on success.  On failure an error is recorded on hdl and the
  * result of zfs_error()/zfs_standard_error(), or -1, is returned.
  */
 int
 zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
     nvlist_t *props)
 {
 	int ret;
 	uint64_t size = 0;
 	uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 	uint64_t zoned;
 	enum lzc_dataset_type ost;
 	zpool_handle_t *zpool_handle;
 	uint8_t *wkeydata = NULL;
 	uint_t wkeylen = 0;
 	char errbuf[ERRBUFLEN];
 	char parent[ZFS_MAX_DATASET_NAME_LEN];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
 
 	/* validate the path, taking care to note the extended error message */
 	if (!zfs_validate_name(hdl, path, type, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	if (dataset_nestcheck(path) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "maximum name nesting depth exceeded"));
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/* validate parents exist */
 	if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0)
 		return (-1);
 
 	/*
 	 * The failure modes when creating a dataset of a different type over
 	 * one that already exists is a little strange.  In particular, if you
 	 * try to create a dataset on top of an existing dataset, the ioctl()
 	 * will return ENOENT, not EEXIST.  To prevent this from happening, we
 	 * first try to see if the dataset exists.
 	 */
 	if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset already exists"));
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	if (type == ZFS_TYPE_VOLUME)
 		ost = LZC_DATSET_TYPE_ZVOL;
 	else
 		ost = LZC_DATSET_TYPE_ZFS;
 
 	/* open zpool handle for prop validation */
 	char pool_path[ZFS_MAX_DATASET_NAME_LEN];
 	(void) strlcpy(pool_path, path, sizeof (pool_path));
 
 	/* truncate pool_path at first slash */
 	char *p = strchr(pool_path, '/');
 	if (p != NULL)
 		*p = '\0';
 
 	if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL)
 		return (-1);
 
 	if (props != NULL && (props = zfs_valid_proplist(hdl, type, props,
 	    zoned, NULL, zpool_handle, B_TRUE, errbuf)) == NULL) {
 		zpool_close(zpool_handle);
 		return (-1);
 	}
 	zpool_close(zpool_handle);
 
 	if (type == ZFS_TYPE_VOLUME) {
 		/*
 		 * If we are creating a volume, the size and block size must
 		 * satisfy a few restraints.  First, the blocksize must be a
 		 * valid block size between SPA_{MIN,MAX}BLOCKSIZE.  Second, the
 		 * volsize must be a multiple of the block size, and cannot be
 		 * zero.
 		 */
 		if (props == NULL || nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "missing volume size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if ((ret = nvlist_lookup_uint64(props,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
 		    &blocksize)) != 0) {
 			if (ret == ENOENT) {
 				/* not supplied: fall back to the default */
 				blocksize = zfs_prop_default_numeric(
 				    ZFS_PROP_VOLBLOCKSIZE);
 			} else {
 				nvlist_free(props);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "missing volume block size"));
 				return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 			}
 		}
 
 		if (size == 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size cannot be zero"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 
 		if (size % blocksize != 0) {
 			nvlist_free(props);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "volume size must be a multiple of volume block "
 			    "size"));
 			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 		}
 	}
 
 	(void) parent_name(path, parent, sizeof (parent));
 	if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE,
 	    &wkeydata, &wkeylen) != 0) {
 		nvlist_free(props);
 		return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf));
 	}
 
 	/* create the dataset */
 	ret = lzc_create(path, ost, props, wkeydata, wkeylen);
 	nvlist_free(props);
 	free(wkeydata);
 
 	if (ret == 0)
 		return (0);
 
 	/*
 	 * Dispatch on the code lzc_create() returned rather than on errno:
 	 * the cleanup calls above run in between and are free to disturb
 	 * errno.
 	 */
 	switch (ret) {
 	case ENOENT:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "no such parent '%s'"), parent);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 
 	case ENOTSUP:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool must be upgraded to set this "
 		    "property or value"));
 		return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
 
 	case EACCES:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "encryption root's key is not loaded "
 		    "or provided"));
 		return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf));
 
 	case ERANGE:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid property value(s) specified"));
 		return (zfs_error(hdl, EZFS_BADPROP, errbuf));
 #ifdef _ILP32
 	case EOVERFLOW:
 		/*
 		 * This platform can't address a volume this big.
 		 */
 		if (type == ZFS_TYPE_VOLUME)
 			return (zfs_error(hdl, EZFS_VOLTOOBIG,
 			    errbuf));
 		zfs_fallthrough;
 #endif
 	default:
 		return (zfs_standard_error(hdl, ret, errbuf));
 	}
 }
 
 /*
  * Destroys the given dataset.  The caller must make sure that the filesystem
  * isn't mounted, and that there are no active dependents. If the file system
  * does not exist this function does nothing.
  *
  * 'defer' is only meaningful for snapshots; requesting it for any other
  * type returns EINVAL without recording an error on the handle.  Bookmarks
  * are destroyed through lzc_destroy_bookmarks(), snapshots through
  * lzc_destroy_snaps() and everything else through lzc_destroy().
  *
  * Returns 0 on success (including ENOENT for non-bookmarks), otherwise the
  * result of zfs_standard_error_fmt().
  */
 int
 zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
 {
 	int error;
 
 	if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer)
 		return (EINVAL);
 
 	if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
 		nvlist_t *nv = fnvlist_alloc();
 		fnvlist_add_boolean(nv, zhp->zfs_name);
 		error = lzc_destroy_bookmarks(nv, NULL);
 		fnvlist_free(nv);
 		if (error != 0) {
 			return (zfs_standard_error_fmt(zhp->zfs_hdl, error,
 			    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 			    zhp->zfs_name));
 		}
 		return (0);
 	}
 
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		nvlist_t *nv = fnvlist_alloc();
 		fnvlist_add_boolean(nv, zhp->zfs_name);
 		error = lzc_destroy_snaps(nv, defer, NULL);
 		fnvlist_free(nv);
 	} else {
 		error = lzc_destroy(zhp->zfs_name);
 	}
 
 	/*
 	 * Report the code that the destroy call returned, as the bookmark
 	 * path above does.  errno is not reliable here because cleanup has
 	 * run since the failing call.
 	 */
 	if (error != 0 && error != ENOENT) {
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, error,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 		    zhp->zfs_name));
 	}
 
 	remove_mountpoint(zhp);
 
 	return (0);
 }
 
 /*
  * State shared between zfs_destroy_snaps() and its zfs_check_snap_cb()
  * walker.
  */
 struct destroydata {
 	nvlist_t *nvl;		/* "<dataset>@<snapname>" names that exist */
 	const char *snapname;	/* short snapshot name, the part after '@' */
 };
 
 /*
  * Iterator callback for zfs_destroy_snaps(): if "<zhp's name>@<snapname>"
  * exists, add it to dd->nvl, then recurse into the child filesystems.
  *
  * The callback owns zhp and closes it on every path, including the early
  * return taken when the combined name does not fit in the name buffer.
  *
  * Returns EINVAL if the snapshot name would be too long, otherwise the
  * result of the recursive iteration.
  */
 static int
 zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct destroydata *dd = arg;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int rv = 0;
 
 	if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name,
 	    dd->snapname) >= sizeof (name)) {
 		/* don't leak the handle we were given */
 		zfs_close(zhp);
 		return (EINVAL);
 	}
 
 	if (lzc_exists(name))
 		fnvlist_add_boolean(dd->nvl, name);
 
 	rv = zfs_iter_filesystems_v2(zhp, 0, zfs_check_snap_cb, dd);
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Destroy every snapshot called 'snapname' on zhp and on its descendants.
  *
  * The set of matching snapshots is gathered first by zfs_check_snap_cb().
  * If none exist an ENOENT error is reported for "<zhp>@<snapname>";
  * otherwise the whole set is handed to zfs_destroy_snaps_nvl().
  */
 int
 zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
 {
 	struct destroydata dd = { 0 };
 	int ret;
 
 	dd.nvl = fnvlist_alloc();
 	dd.snapname = snapname;
 
 	/* the walker closes the handle it is given, so pass a duplicate */
 	(void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd);
 
 	if (!nvlist_empty(dd.nvl)) {
 		ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
 	} else {
 		ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
 		    zhp->zfs_name, snapname);
 	}
 
 	fnvlist_free(dd.nvl);
 	return (ret);
 }
 
 /*
  * Destroys all the snapshots named in the nvlist.
  *
  * After the platform hook zfs_destroy_snaps_nvl_os() has run,
  * lzc_destroy_snaps() is asked to destroy the whole set.  If it fails
  * without naming any snapshot, a single generic error is reported.
  * Otherwise one error is reported per entry in the returned error list,
  * using the error code stored in that entry.
  *
  * Returns 0 on success, otherwise the result of the last error reported.
  */
 int
 zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
 {
 	nvlist_t *errlist = NULL;
 	nvpair_t *pair;
 
 	int ret = zfs_destroy_snaps_nvl_os(hdl, snaps);
 	if (ret != 0)
 		return (ret);
 
 	ret = lzc_destroy_snaps(snaps, defer, &errlist);
 
 	if (ret == 0) {
 		nvlist_free(errlist);
 		return (0);
 	}
 
 	if (nvlist_empty(errlist)) {
 		char errbuf[ERRBUFLEN];
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
 
 		ret = zfs_standard_error(hdl, ret, errbuf);
 	}
 	for (pair = nvlist_next_nvpair(errlist, NULL);
 	    pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
 		char errbuf[ERRBUFLEN];
 		/* the failure code for this particular snapshot */
 		int snap_err = fnvpair_value_int32(pair);
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
 		    nvpair_name(pair));
 
 		switch (snap_err) {
 		case EEXIST:
 			zfs_error_aux(hdl,
 			    dgettext(TEXT_DOMAIN, "snapshot is cloned"));
 			ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
 			break;
 		case EBUSY: {
 			nvlist_t *existing_holds;
 			int err = lzc_get_holds(nvpair_name(pair),
 			    &existing_holds);
 
 			/* check the presence of holders */
 			if (err == 0 && !nvlist_empty(existing_holds)) {
 				zfs_error_aux(hdl,
 				    dgettext(TEXT_DOMAIN, "it's being held. "
 				    "Run 'zfs holds -r %s' to see holders."),
 				    nvpair_name(pair));
 				/*
 				 * NOTE(review): every other zfs_error() call
 				 * in this file passes an EZFS_* code; EBUSY
 				 * is an errno value.  Left as is to keep the
 				 * recorded code unchanged -- confirm intent.
 				 */
 				ret = zfs_error(hdl, EBUSY, errbuf);
 			} else {
 				/*
 				 * Report this snapshot's own code; errno
 				 * reflects whatever call ran last.
 				 */
 				ret = zfs_standard_error(hdl, snap_err,
 				    errbuf);
 			}
 
 			if (err == 0)
 				nvlist_free(existing_holds);
 			break;
 		}
 		default:
 			ret = zfs_standard_error(hdl, snap_err, errbuf);
 			break;
 		}
 	}
 
 	nvlist_free(errlist);
 	return (ret);
 }
 
 /*
  * Clones the given dataset.  The target must be of the same type as the source.
  *
  * zhp	handle of the snapshot to clone (asserted to be a snapshot)
  * target	full name of the clone to create
  * props	optional properties; when non-NULL they are run through
  *		zfs_valid_proplist() and the list it returns is the one
  *		used (and freed) from then on
  *
  * Returns 0 on success.  On failure an error is recorded on the library
  * handle and the result of zfs_error()/zfs_standard_error(), or -1, is
  * returned.
  */
 int
 zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
 {
 	char parent[ZFS_MAX_DATASET_NAME_LEN];
 	int ret;
 	char errbuf[ERRBUFLEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	uint64_t zoned;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), target);
 
 	/* validate the target/clone name */
 	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* validate parents exist */
 	if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0)
 		return (-1);
 
 	(void) parent_name(target, parent, sizeof (parent));
 
 	/* do the clone */
 
 	if (props) {
 		zfs_type_t type = ZFS_TYPE_FILESYSTEM;
 
 		if (ZFS_IS_VOLUME(zhp))
 			type = ZFS_TYPE_VOLUME;
 		if ((props = zfs_valid_proplist(hdl, type, props, zoned,
 		    zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL)
 			return (-1);
 		if (zfs_fix_auto_resv(zhp, props) == -1) {
 			nvlist_free(props);
 			return (-1);
 		}
 	}
 
 	if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) {
 		nvlist_free(props);
 		return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf));
 	}
 
 	ret = lzc_clone(target, zhp->zfs_name, props);
 	nvlist_free(props);
 
 	if (ret == 0)
 		return (0);
 
 	/*
 	 * Dispatch on the code lzc_clone() returned rather than on errno:
 	 * nvlist_free() runs in between and is free to disturb errno.
 	 */
 	switch (ret) {
 
 	case ENOENT:
 		/*
 		 * The parent doesn't exist.  We should have caught this
 		 * above, but there may be a race condition that has since
 		 * destroyed the parent.
 		 *
 		 * At this point, we don't know whether it's the source
 		 * that doesn't exist anymore, or whether the target
 		 * dataset doesn't exist.
 		 */
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "no such parent '%s'"), parent);
 		return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
 
 	case EXDEV:
 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 		    "source and target pools differ"));
 		return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET,
 		    errbuf));
 
 	default:
 		return (zfs_standard_error(zhp->zfs_hdl, ret,
 		    errbuf));
 	}
 }
 
 /*
  * Promote the clone behind zhp so that it becomes the clone parent.
  *
  * Snapshots and datasets without an origin are rejected before the
  * promotion is attempted.  Returns 0 on success, otherwise the result of
  * zfs_error()/zfs_standard_error() after recording the error on the
  * library handle.
  */
 int
 zfs_promote(zfs_handle_t *zhp)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char conflict[ZFS_MAX_DATASET_NAME_LEN];
 	char errbuf[ERRBUFLEN];
 	int error;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot promote '%s'"), zhp->zfs_name);
 
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "snapshots can not be promoted"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	/* only something that has an origin can be promoted */
 	if (zhp->zfs_dmustats.dds_origin[0] == '\0') {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "not a cloned filesystem"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
 
 	if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	error = lzc_promote(zhp->zfs_name, conflict, sizeof (conflict));
 	if (error == 0)
 		return (0);
 
 	if (error == EACCES) {
 		/*
 		 * Promoting encrypted dataset outside its
 		 * encryption root.
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "cannot promote dataset outside its "
 		    "encryption root"));
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	if (error == EEXIST) {
 		/* lzc_promote() filled in the conflicting snapshot name */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "conflicting snapshot '%s' from parent '%s'"),
 		    conflict, zhp->zfs_dmustats.dds_origin);
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	return (zfs_standard_error(hdl, error, errbuf));
 }
 
 /*
  * State shared between zfs_snapshot() and its zfs_snapshot_cb() walker.
  */
 typedef struct snapdata {
 	nvlist_t *sd_nvl;		/* "<dataset>@<snapname>" names to create */
 	const char *sd_snapname;	/* short snapshot name, after the '@' */
 } snapdata_t;
 
 /*
  * Iterator callback for recursive snapshots: unless the dataset is marked
  * inconsistent, add "<zhp's name>@<snapname>" to sd->sd_nvl and recurse
  * into the child filesystems.
  *
  * The callback owns zhp and closes it on every path, including the early
  * return taken when the combined name does not fit in the name buffer.
  *
  * Returns EINVAL if the snapshot name would be too long, otherwise the
  * result of the recursive iteration (0 for a skipped dataset).
  */
 static int
 zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
 {
 	snapdata_t *sd = arg;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int rv = 0;
 
 	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) {
 		if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp),
 		    sd->sd_snapname) >= sizeof (name)) {
 			/* don't leak the handle we were given */
 			zfs_close(zhp);
 			return (EINVAL);
 		}
 
 		fnvlist_add_boolean(sd->sd_nvl, name);
 
 		rv = zfs_iter_filesystems_v2(zhp, 0, zfs_snapshot_cb, sd);
 	}
 	zfs_close(zhp);
 
 	return (rv);
 }
 
 /*
  * Create the snapshots whose full names are the keys of 'snaps', applying
  * 'props' (if any) to them.
  *
  * Every name is validated before anything is created.  The pool used for
  * property validation is taken from the first name, on the assumption that
  * all snapshots are in the same pool, as lzc_snapshot() also assumes.
  *
  * Returns 0 on success.  An empty 'snaps' list, an unopenable pool or
  * invalid properties return -1; an invalid name returns the result of
  * zfs_error(); a failed lzc_snapshot() returns its error code after the
  * individual failures have been reported.
  */
 int
 zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)
 {
 	char pool[ZFS_MAX_DATASET_NAME_LEN];
 	char errbuf[ERRBUFLEN];
 	zpool_handle_t *zpool_hdl;
 	nvlist_t *errors;
 	nvpair_t *pair;
 	int ret;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create snapshots "));
 
 	/* reject the whole request if any one name is malformed */
 	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
 	    pair = nvlist_next_nvpair(snaps, pair)) {
 		const char *snapname = nvpair_name(pair);
 
 		if (zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT,
 		    B_TRUE))
 			continue;
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot create snapshot '%s'"), snapname);
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	}
 
 	/* derive the pool name from the first entry */
 	pair = nvlist_next_nvpair(snaps, NULL);
 	if (pair == NULL)
 		return (-1);
 	(void) strlcpy(pool, nvpair_name(pair), sizeof (pool));
 	pool[strcspn(pool, "/@")] = '\0';
 
 	zpool_hdl = zpool_open(hdl, pool);
 	if (zpool_hdl == NULL)
 		return (-1);
 
 	if (props != NULL) {
 		props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
 		    props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf);
 		if (props == NULL) {
 			zpool_close(zpool_hdl);
 			return (-1);
 		}
 	}
 	zpool_close(zpool_hdl);
 
 	ret = lzc_snapshot(snaps, props, &errors);
 
 	if (ret != 0) {
 		boolean_t reported = B_FALSE;
 
 		/* one message per snapshot that was singled out */
 		pair = nvlist_next_nvpair(errors, NULL);
 		while (pair != NULL) {
 			(void) snprintf(errbuf, sizeof (errbuf),
 			    dgettext(TEXT_DOMAIN,
 			    "cannot create snapshot '%s'"), nvpair_name(pair));
 			(void) zfs_standard_error(hdl,
 			    fnvpair_value_int32(pair), errbuf);
 			reported = B_TRUE;
 			pair = nvlist_next_nvpair(errors, pair);
 		}
 
 		/* nothing was singled out: report the overall failure */
 		if (!reported) {
 			if (ret == EXDEV) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "multiple snapshots of same "
 				    "fs not allowed"));
 				(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
 			} else {
 				(void) zfs_standard_error(hdl, ret, errbuf);
 			}
 		}
 	}
 
 	nvlist_free(props);
 	nvlist_free(errors);
 	return (ret);
 }
 
 /*
  * Snapshot the dataset named in 'path' ("<dataset>@<snapname>").
  *
  * The dataset part must open as a filesystem or volume.  With 'recursive'
  * set, a snapshot of the same short name is requested for the dataset and,
  * via zfs_snapshot_cb(), for its descendants; otherwise only 'path' itself
  * is requested.  The actual creation is done by zfs_snapshot_nvl().
  */
 int
 zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
     nvlist_t *props)
 {
 	char errbuf[ERRBUFLEN];
 	char fsname[ZFS_MAX_DATASET_NAME_LEN];
 	snapdata_t sd = { 0 };
 	zfs_handle_t *zhp;
 	char *at;
 	int ret;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot snapshot %s"), path);
 
 	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/* split "<dataset>@<snapname>" at the '@' inside our own copy */
 	(void) strlcpy(fsname, path, sizeof (fsname));
 	at = strchr(fsname, '@');
 	*at = '\0';
 	sd.sd_snapname = at + 1;
 
 	zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
 	if (zhp == NULL)
 		return (-1);
 
 	sd.sd_nvl = fnvlist_alloc();
 	if (!recursive) {
 		fnvlist_add_boolean(sd.sd_nvl, path);
 	} else {
 		/* the walker closes its handle, so pass a duplicate */
 		(void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd);
 	}
 
 	ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props);
 
 	fnvlist_free(sd.sd_nvl);
 	zfs_close(zhp);
 	return (ret);
 }
 
 /*
  * State for the rollback callbacks below.
  *
  * rollback_destroy() is run over the snapshots and bookmarks of the dataset
  * being rolled back and destroys those whose creation txg is greater than
  * cb_create, after first destroying their dependents through
  * rollback_destroy_dependent().  Any failure along the way is recorded in
  * cb_error rather than returned, so the iteration always runs to completion.
  */
 typedef struct rollback_data {
 	const char	*cb_target;		/* name of the target snapshot */
 	uint64_t	cb_create;		/* target's creation txg */
 	boolean_t	cb_error;		/* set when any destroy fails */
 	boolean_t	cb_force;		/* unmount with MS_FORCE */
 } rollback_data_t;
 
 /*
  * Iterator callback: destroy one dependent of a snapshot that is being
  * removed by a rollback.
  *
  * A changelist is gathered and prefixed (unmounting with MS_FORCE when
  * cb_force is set) before the dataset is destroyed, and postfixed
  * afterwards.  Failures are recorded in cb_error; the callback itself
  * always returns 0 so that the iteration continues.  The handle and the
  * changelist are released on every path.
  */
 static int
 rollback_destroy_dependent(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *cbp = data;
 	prop_changelist_t *clp;
 
 	/* We must destroy this clone; first unmount it */
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
 	    cbp->cb_force ? MS_FORCE: 0);
 	if (clp == NULL || changelist_prefix(clp) != 0) {
 		cbp->cb_error = B_TRUE;
 		/* a failed prefix still leaves us owning the changelist */
 		if (clp != NULL)
 			changelist_free(clp);
 		zfs_close(zhp);
 		return (0);
 	}
 	if (zfs_destroy(zhp, B_FALSE) != 0)
 		cbp->cb_error = B_TRUE;
 	else
 		changelist_remove(clp, zhp->zfs_name);
 	(void) changelist_postfix(clp);
 	changelist_free(clp);
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Iterator callback: destroy zhp, and before it its dependents, if it was
  * created after the rollback target (creation txg greater than cb_create).
  * Older entries are left alone.  Failures accumulate in cb_error; the
  * callback always returns 0 and always closes zhp.
  */
 static int
 rollback_destroy(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *cbp = data;
 	uint64_t createtxg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG);
 
 	/* not newer than the target: nothing to do */
 	if (createtxg <= cbp->cb_create) {
 		zfs_close(zhp);
 		return (0);
 	}
 
 	cbp->cb_error |= zfs_iter_dependents_v2(zhp, 0, B_FALSE,
 	    rollback_destroy_dependent, cbp);
 	cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
 
 	zfs_close(zhp);
 	return (0);
 }
 
 /*
  * Roll the dataset behind zhp back to the snapshot 'snap', discarding any
  * data changes made since and making the snapshot's contents the active
  * dataset.
  *
  * Snapshots and bookmarks more recent than 'snap' are destroyed first,
  * along with their dependents (i.e. clones); 'force' is passed on to the
  * unmounts this requires.  If any of that fails, -1 is returned and the
  * rollback is not attempted.
  *
  * Returns 0 on success, -1 for the failures described above or when the
  * reservation property of a volume cannot be determined, otherwise the
  * error from lzc_rollback_to() or from restoring a volume's reservation.
  */
 int
 zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
 {
 	rollback_data_t cb = { 0 };
 	zfs_prop_t resv_prop = { 0 };
 	boolean_t restore_resv = 0;
 	uint64_t old_volsize = 0;
 	uint64_t new_volsize;
 	uint64_t min_txg;
 	zfs_handle_t *fresh;
 	int err;
 
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
 	    zhp->zfs_type == ZFS_TYPE_VOLUME);
 
 	/*
 	 * Destroy all recent snapshots and their dependents.  The snapshot
 	 * iteration starts at the target's creation txg (0 if unknown).
 	 */
 	cb.cb_force = force;
 	cb.cb_target = snap->zfs_name;
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	min_txg = cb.cb_create;
 
 	(void) zfs_iter_snapshots_v2(zhp, 0, rollback_destroy, &cb,
 	    min_txg, 0);
 
 	(void) zfs_iter_bookmarks_v2(zhp, 0, rollback_destroy, &cb);
 
 	if (cb.cb_error)
 		return (-1);
 
 	/*
 	 * Everything newer than the target is gone.  For a volume, note
 	 * before rolling back whether its reservation tracks its size.
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
 		if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
 			return (-1);
 		old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 		restore_resv =
 		    (old_volsize == zfs_prop_get_int(zhp, resv_prop));
 	}
 
 	/*
 	 * Pass both the filesystem and the wanted snapshot names,
 	 * we would get an error back if the snapshot is destroyed or
 	 * a new snapshot is created before this request is processed.
 	 */
 	err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name);
 	if (err != 0) {
 		char errbuf[ERRBUFLEN];
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
 		    zhp->zfs_name);
 
 		if (err == EEXIST) {
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "there is a snapshot or bookmark more recent "
 			    "than '%s'"), snap->zfs_name);
 			(void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf);
 		} else if (err == ESRCH) {
 			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
 			    "'%s' is not found among snapshots of '%s'"),
 			    snap->zfs_name, zhp->zfs_name);
 			(void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf);
 		} else if (err == EINVAL) {
 			(void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf);
 		} else {
 			(void) zfs_standard_error(zhp->zfs_hdl, err, errbuf);
 		}
 		return (err);
 	}
 
 	/* only volumes have a reservation to fix up */
 	if (zhp->zfs_type != ZFS_TYPE_VOLUME)
 		return (err);
 
 	/*
 	 * For volumes, if the pre-rollback volsize matched the pre-
 	 * rollback reservation and the volsize has changed then set
 	 * the reservation property to the post-rollback volsize.
 	 * Make a new handle since the rollback closed the dataset.
 	 */
 	fresh = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name);
 	if (fresh == NULL)
 		return (err);
 
 	if (restore_resv) {
 		new_volsize = zfs_prop_get_int(fresh, ZFS_PROP_VOLSIZE);
 		if (old_volsize != new_volsize)
 			err = zfs_prop_set_int(fresh, resv_prop,
 			    new_volsize);
 	}
 	zfs_close(fresh);
 
 	return (err);
 }
 
 /*
  * Renames the given dataset.
  */
 int
 zfs_rename(zfs_handle_t *zhp, const char *target, renameflags_t flags)
 {
 	int ret = 0;
 	zfs_cmd_t zc = {"\0"};
 	char *delim;
 	prop_changelist_t *cl = NULL;
 	char parent[ZFS_MAX_DATASET_NAME_LEN];
 	char property[ZFS_MAXPROPLEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[ERRBUFLEN];
 
 	/* if we have the same exact name, just return success */
 	if (strcmp(zhp->zfs_name, target) == 0)
 		return (0);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot rename to '%s'"), target);
 
 	/* make sure source name is valid */
 	if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 	/*
 	 * Make sure the target name is valid
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
 		if ((strchr(target, '@') == NULL) ||
 		    *target == '@') {
 			/*
 			 * Snapshot target name is abbreviated,
 			 * reconstruct full dataset name
 			 */
 			(void) strlcpy(parent, zhp->zfs_name,
 			    sizeof (parent));
 			delim = strchr(parent, '@');
 			if (strchr(target, '@') == NULL)
 				*(++delim) = '\0';
 			else
 				*delim = '\0';
 			(void) strlcat(parent, target, sizeof (parent));
 			target = parent;
 		} else {
 			/*
 			 * Make sure we're renaming within the same dataset.
 			 */
 			delim = strchr(target, '@');
 			if (strncmp(zhp->zfs_name, target, delim - target)
 			    != 0 || zhp->zfs_name[delim - target] != '@') {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "snapshots must be part of same "
 				    "dataset"));
 				return (zfs_error(hdl, EZFS_CROSSTARGET,
 				    errbuf));
 			}
 		}
 
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	} else {
 		if (flags.recursive) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "recursive rename must be a snapshot"));
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 		}
 
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
 		/* validate parents */
 		if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0)
 			return (-1);
 
 		/* make sure we're in the same pool */
 		verify((delim = strchr(target, '/')) != NULL);
 		if (strncmp(zhp->zfs_name, target, delim - target) != 0 ||
 		    zhp->zfs_name[delim - target] != '/') {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "datasets must be within same pool"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 		}
 
 		/* new name cannot be a child of the current dataset name */
 		if (is_descendant(zhp->zfs_name, target)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "New dataset name cannot be a descendant of "
 			    "current dataset name"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
 	}
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name);
 
 	if (getzoneid() == GLOBAL_ZONEID &&
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset is used in a non-global zone"));
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
 	/*
 	 * Avoid unmounting file systems with mountpoint property set to
 	 * 'legacy' or 'none' even if -u option is not given.
 	 */
 	if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
 	    !flags.recursive && !flags.nounmount &&
 	    zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property,
 	    sizeof (property), NULL, NULL, 0, B_FALSE) == 0 &&
 	    (strcmp(property, "legacy") == 0 ||
 	    strcmp(property, "none") == 0)) {
 		flags.nounmount = B_TRUE;
 	}
 	if (flags.recursive) {
 		char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
 		delim = strchr(parentname, '@');
 		*delim = '\0';
 		zfs_handle_t *zhrp = zfs_open(zhp->zfs_hdl, parentname,
 		    ZFS_TYPE_DATASET);
 		free(parentname);
 		if (zhrp == NULL) {
 			ret = -1;
 			goto error;
 		}
 		zfs_close(zhrp);
 	} else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) {
 		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
 		    flags.nounmount ? CL_GATHER_DONT_UNMOUNT :
 		    CL_GATHER_ITER_MOUNTED,
 		    flags.forceunmount ? MS_FORCE : 0)) == NULL)
 			return (-1);
 
 		if (changelist_haszonedchild(cl)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "child dataset with inherited mountpoint is used "
 			    "in a non-global zone"));
 			(void) zfs_error(hdl, EZFS_ZONED, errbuf);
 			ret = -1;
 			goto error;
 		}
 
 		if ((ret = changelist_prefix(cl)) != 0)
 			goto error;
 	}
 
 	if (ZFS_IS_VOLUME(zhp))
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	else
 		zc.zc_objset_type = DMU_OST_ZFS;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
 
 	zc.zc_cookie = !!flags.recursive;
 	zc.zc_cookie |= (!!flags.nounmount) << 1;
 
 	if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) {
 		/*
 		 * if it was recursive, the one that actually failed will
 		 * be in zc.zc_name
 		 */
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot rename '%s'"), zc.zc_name);
 
 		if (flags.recursive && errno == EEXIST) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "a child dataset already has a snapshot "
 			    "with the new name"));
 			(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
 		} else if (errno == EACCES) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "cannot move encrypted child outside of "
 			    "its encryption root"));
 			(void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf);
 		} else {
 			(void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf);
 		}
 
 		/*
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
 		if (cl != NULL)
 			(void) changelist_postfix(cl);
 	} else {
 		if (cl != NULL) {
 			changelist_rename(cl, zfs_get_name(zhp), target);
 			ret = changelist_postfix(cl);
 		}
 		(void) strlcpy(zhp->zfs_name, target, sizeof (zhp->zfs_name));
 	}
 
 error:
 	if (cl != NULL) {
 		changelist_free(cl);
 	}
 	return (ret);
 }
 
 /*
  * Return the property nvlist stored on the handle (zhp->zfs_props).  The
  * pointer is handed back directly; no copy is made.
  */
 nvlist_t *
 zfs_get_all_props(zfs_handle_t *zhp)
 {
 	return (zhp->zfs_props);
 }
 
 /*
  * Return the received-properties nvlist stored on the handle.  When the
  * field is still NULL, get_recvd_props_ioctl() is called first; if that call
  * fails, NULL is returned.
  */
 nvlist_t *
 zfs_get_recvd_props(zfs_handle_t *zhp)
 {
 	boolean_t loaded = (zhp->zfs_recvd_props != NULL);
 
 	if (!loaded && get_recvd_props_ioctl(zhp) != 0)
 		return (NULL);
 
 	return (zhp->zfs_recvd_props);
 }
 
 /*
  * Return the user-property nvlist stored on the handle
  * (zhp->zfs_user_props).  The pointer is handed back directly; no copy is
  * made.
  */
 nvlist_t *
 zfs_get_user_props(zfs_handle_t *zhp)
 {
 	return (zhp->zfs_user_props);
 }
 
 /*
  * Used by 'zfs list' to work out which columns to show and how wide each one
  * has to be.  Two things happen here:
  *
  *      - When the list stands for "all properties", it is expanded to hold
  *        every native property, and any user property present on this
  *        dataset that is not yet in the list is appended to it.
  *
  *      - For every column that is not fixed-width (or for every column when
  *        'literal' is set), the recorded width is grown to fit this
  *        dataset's value.  When 'received' is set, the width of the RECEIVED
  *        column is tracked the same way.
  *
  * Returns 0 on success, or -1 if zprop_expand_list() fails.
  */
 int
 zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received,
     boolean_t literal)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zprop_list_t *pl;
 	nvlist_t *userprops;
 	char buf[ZFS_MAXPROPLEN];
 
 	if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0)
 		return (-1);
 
 	userprops = zfs_get_user_props(zhp);
 
 	pl = *plp;
 	if (pl->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) {
 		zprop_list_t **first_user = plp;
 		nvpair_t *pair = NULL;
 
 		/*
 		 * Advance to the first user-property entry (or to the tail
 		 * link if the list holds native properties only).
 		 */
 		while (*first_user != NULL &&
 		    (*first_user)->pl_prop != ZPROP_USERPROP)
 			first_user = &(*first_user)->pl_next;
 
 		while ((pair = nvlist_next_nvpair(userprops, pair)) != NULL) {
 			const char *uname = nvpair_name(pair);
 			zprop_list_t **slot = first_user;
 
 			/* Search the user-property tail for this name. */
 			while (*slot != NULL &&
 			    strcmp((*slot)->pl_user_prop, uname) != 0)
 				slot = &(*slot)->pl_next;
 
 			if (*slot != NULL)
 				continue;
 
 			/* Not listed yet: hang a new entry off the tail. */
 			pl = zfs_alloc(hdl, sizeof (zprop_list_t));
 			pl->pl_user_prop = zfs_strdup(hdl, uname);
 			pl->pl_prop = ZPROP_USERPROP;
 			pl->pl_width = strlen(uname);
 			pl->pl_all = B_TRUE;
 			*slot = pl;
 		}
 	}
 
 	/*
 	 * Second pass: widen every non-fixed column to fit this dataset.
 	 */
 	for (pl = *plp; pl != NULL; pl = pl->pl_next) {
 		const char *recvd_name;
 		size_t len;
 
 		if (pl->pl_fixed && !literal)
 			continue;
 
 		if (pl->pl_prop != ZPROP_USERPROP) {
 			if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf),
 			    NULL, NULL, 0, literal) == 0) {
 				len = strlen(buf);
 				if (len > pl->pl_width)
 					pl->pl_width = len;
 			}
 		} else {
 			nvlist_t *propval;
 
 			if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop,
 			    &propval) == 0) {
 				const char *strval = fnvlist_lookup_string(
 				    propval, ZPROP_VALUE);
 
 				len = strlen(strval);
 				if (len > pl->pl_width)
 					pl->pl_width = len;
 			}
 		}
 
 		if (!received)
 			continue;
 
 		/* The received value is looked up by property name. */
 		if (pl->pl_prop != ZPROP_USERPROP)
 			recvd_name = zfs_prop_to_name(pl->pl_prop);
 		else
 			recvd_name = pl->pl_user_prop;
 
 		if (zfs_prop_get_recvd(zhp, recvd_name, buf, sizeof (buf),
 		    literal) == 0) {
 			len = strlen(buf);
 			if (len > pl->pl_recvd_width)
 				pl->pl_recvd_width = len;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Record 'props' on the handle as the table to prune against, then remove
  * from zhp->zfs_props every native property whose slot in that table is
  * B_FALSE.
  */
 void
 zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props)
 {
 	nvlist_t *nvl = zhp->zfs_props;
 	nvpair_t *pair, *following;
 
 	/* Keep a reference to the table we are pruning against. */
 	zhp->zfs_props_table = props;
 
 	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
 	    pair = following) {
 		zfs_prop_t prop = zfs_name_to_prop(nvpair_name(pair));
 
 		/* Fetch the successor before this pair can be removed. */
 		following = nvlist_next_nvpair(nvl, pair);
 
 		/*
 		 * Names that do not map to a native property come back as
 		 * ZPROP_USERPROP (an alias for ZPROP_INVAL).  That covers user
 		 * properties and DSL properties this software does not know
 		 * about (for example when running older software).  Only
 		 * standard properties can be pruned, so those stay put.
 		 */
 		if (prop == ZPROP_USERPROP)
 			continue;
 
 		if (props[prop] == B_FALSE)
 			(void) nvlist_remove(nvl, nvpair_name(pair),
 			    nvpair_type(pair));
 	}
 }
 
 /*
  * Issue a ZFS_IOC_SMB_ACL ioctl for 'path' within 'dataset'.
  *
  * 'cmd' selects the operation:
  *   ZFS_SMB_ACL_ADD / ZFS_SMB_ACL_REMOVE  - 'resource1' is copied into
  *       zc_string; 'resource2' is unused.
  *   ZFS_SMB_ACL_RENAME - 'resource1' and 'resource2' are passed as the
  *       ZFS_SMB_ACL_SRC and ZFS_SMB_ACL_TARGET strings of a source nvlist.
  *   ZFS_SMB_ACL_PURGE  - neither resource is used.
  *
  * Returns the ioctl() result, or -1 if 'cmd' is not one of the above or if
  * building the rename nvlist fails (in which case no_memory() has been
  * called on 'hdl').
  */
 static int
 zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
     zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
 {
 	zfs_cmd_t zc = {"\0"};
 	nvlist_t *nvlist = NULL;
 	int error;
 
 	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value));
 	zc.zc_cookie = (uint64_t)cmd;
 
 	switch (cmd) {
 	case ZFS_SMB_ACL_ADD:
 	case ZFS_SMB_ACL_REMOVE:
 		(void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string));
 		break;
 	case ZFS_SMB_ACL_RENAME:
 		/* An allocation failure is an error, not success. */
 		if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) {
 			(void) no_memory(hdl);
 			return (-1);
 		}
 		if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC,
 		    resource1) != 0 ||
 		    nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET,
 		    resource2) != 0) {
 			(void) no_memory(hdl);
 			/* Do not leak the partially built list. */
 			nvlist_free(nvlist);
 			return (-1);
 		}
 		zcmd_write_src_nvlist(hdl, &zc, nvlist);
 		break;
 	case ZFS_SMB_ACL_PURGE:
 		break;
 	default:
 		return (-1);
 	}
 	error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc);
 	/* Release the packed copy made by zcmd_write_src_nvlist(), if any. */
 	zcmd_free_nvlists(&zc);
 	nvlist_free(nvlist);
 	return (error);
 }
 
 /*
  * Thin wrapper: run zfs_smb_acl_mgmt() with ZFS_SMB_ACL_ADD for 'resource'
  * and return its result.
  */
 int
 zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset,
     char *path, char *resource)
 {
 	return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD,
 	    resource, NULL));
 }
 
 /*
  * Thin wrapper: run zfs_smb_acl_mgmt() with ZFS_SMB_ACL_REMOVE for
  * 'resource' and return its result.
  */
 int
 zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset,
     char *path, char *resource)
 {
 	return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE,
 	    resource, NULL));
 }
 
 /*
  * Thin wrapper: run zfs_smb_acl_mgmt() with ZFS_SMB_ACL_PURGE (no resource
  * names are passed) and return its result.
  */
 int
 zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path)
 {
 	return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE,
 	    NULL, NULL));
 }
 
 /*
  * Thin wrapper: run zfs_smb_acl_mgmt() with ZFS_SMB_ACL_RENAME, passing
  * 'oldname' as the source and 'newname' as the target, and return its result.
  */
 int
 zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path,
     char *oldname, char *newname)
 {
 	return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME,
 	    oldname, newname));
 }
 
 /*
  * Iterate over the used/quota records of kind 'type' for zhp's dataset,
  * calling 'func' once per record with 'arg', the record's domain, rid and
  * space, and the zc_guid value from the ioctl.  Records are fetched with
  * ZFS_IOC_USERSPACE_MANY in batches of up to 100.
  *
  * Iteration stops at the first non-zero return from 'func', which is then
  * returned.  An ENOTSUP failure for the object- and project-based types is
  * treated as "no records" and yields 0; any other ioctl failure is reported
  * through zfs_standard_error_fmt().
  */
 int
 zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
     zfs_userspace_cb_t func, void *arg)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = {"\0"};
 	zfs_useracct_t records[100];
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	zc.zc_objset_type = type;
 	zc.zc_nvlist_dst = (uintptr_t)records;
 
 	for (;;) {
 		zfs_useracct_t *rec;
 
 		zc.zc_nvlist_dst_size = sizeof (records);
 		if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
 			if (errno != ENOTSUP ||
 			    !(type == ZFS_PROP_USEROBJUSED ||
 			    type == ZFS_PROP_GROUPOBJUSED ||
 			    type == ZFS_PROP_USEROBJQUOTA ||
 			    type == ZFS_PROP_GROUPOBJQUOTA ||
 			    type == ZFS_PROP_PROJECTOBJUSED ||
 			    type == ZFS_PROP_PROJECTOBJQUOTA ||
 			    type == ZFS_PROP_PROJECTUSED ||
 			    type == ZFS_PROP_PROJECTQUOTA)) {
 				return (zfs_standard_error_fmt(hdl, errno,
 				    dgettext(TEXT_DOMAIN,
 				    "cannot get used/quota for %s"),
 				    zc.zc_name));
 			}
 			break;
 		}
 
 		/* An empty batch marks the end of the records. */
 		if (zc.zc_nvlist_dst_size == 0)
 			break;
 
 		/* Consume the batch one fixed-size record at a time. */
 		for (rec = records; zc.zc_nvlist_dst_size > 0;
 		    rec++, zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t)) {
 			int cbrv = func(arg, rec->zu_domain, rec->zu_rid,
 			    rec->zu_space, zc.zc_guid);
 
 			if (cbrv != 0)
 				return (cbrv);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * State shared between zfs_hold()/zfs_release() and their per-dataset
  * callbacks zfs_hold_one()/zfs_release_one().
  */
 struct holdarg {
 	nvlist_t *nvl;		/* list being built, keyed by "fs@snap" */
 	const char *snapname;	/* snapshot component appended after '@' */
 	const char *tag;	/* hold tag to add or release */
 	boolean_t recursive;	/* also visit child filesystems */
 	int error;		/* set by zfs_release_one() on failure */
 };
 
 /*
  * Per-dataset callback for zfs_hold().  Builds "<zhp name>@<snapname>" and,
  * if lzc_exists() reports that name, records name -> tag in ha->nvl.  When
  * ha->recursive is set, child filesystems are visited with the same callback.
  *
  * This function takes ownership of 'zhp' and closes it on every path,
  * including the name-too-long error path.
  *
  * Returns EINVAL if the snapshot name does not fit in
  * ZFS_MAX_DATASET_NAME_LEN, otherwise 0 or the result of the recursive
  * iteration.
  */
 static int
 zfs_hold_one(zfs_handle_t *zhp, void *arg)
 {
 	struct holdarg *ha = arg;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int rv = 0;
 
 	if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name,
 	    ha->snapname) >= sizeof (name)) {
 		/* The handle is ours to release even when we bail out. */
 		zfs_close(zhp);
 		return (EINVAL);
 	}
 
 	if (lzc_exists(name))
 		fnvlist_add_string(ha->nvl, name, ha->tag);
 
 	if (ha->recursive)
 		rv = zfs_iter_filesystems_v2(zhp, 0, zfs_hold_one, ha);
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Place hold 'tag' on snapshot 'snapname' of zhp's dataset and, when
  * 'recursive' is set, on the same-named snapshot of its child filesystems.
  * The set of existing snapshots is collected by zfs_hold_one() and then
  * passed to zfs_hold_nvl() together with 'cleanup_fd'.
  *
  * Returns ENOENT (after reporting an error) if no matching snapshot was
  * found, otherwise the result of zfs_hold_nvl().
  */
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive, int cleanup_fd)
 {
 	struct holdarg ha = {
 		.nvl = fnvlist_alloc(),
 		.snapname = snapname,
 		.tag = tag,
 		.recursive = recursive,
 	};
 	int ret;
 
 	(void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
 
 	if (!nvlist_empty(ha.nvl)) {
 		ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl);
 		fnvlist_free(ha.nvl);
 		return (ret);
 	}
 
 	/* No snapshot matched: report ENOENT against the requested name. */
 	char errbuf[ERRBUFLEN];
 
 	fnvlist_free(ha.nvl);
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN,
 	    "cannot hold snapshot '%s@%s'"),
 	    zhp->zfs_name, snapname);
 	(void) zfs_standard_error(zhp->zfs_hdl, ENOENT, errbuf);
 	return (ENOENT);
 }
 
 /*
  * Apply the holds described by 'holds' through lzc_hold(), passing
  * 'cleanup_fd' along.  On failure an error is reported on zhp's library
  * handle: one generic "cannot hold" message when lzc_hold() supplied no
  * per-snapshot errors, plus one message for every entry in the returned
  * error list.
  *
  * Returns 0 on success, otherwise the value returned by lzc_hold().
  */
 int
 zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	nvlist_t *errors = NULL;
 	nvpair_t *pair = NULL;
 	char errbuf[ERRBUFLEN];
 	int ret;
 
 	ret = lzc_hold(holds, cleanup_fd, &errors);
 	if (ret == 0) {
 		/* There may be errors even in the success case. */
 		fnvlist_free(errors);
 		return (0);
 	}
 
 	if (nvlist_empty(errors)) {
 		/* no hold-specific errors */
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot hold"));
 		if (ret == ENOTSUP) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 		} else if (ret == EINVAL) {
 			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, ret, errbuf);
 		}
 	}
 
 	while ((pair = nvlist_next_nvpair(errors, pair)) != NULL) {
 		int32_t holderr;
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot hold snapshot '%s'"), nvpair_name(pair));
 
 		holderr = fnvpair_value_int32(pair);
 		if (holderr == E2BIG) {
 			/*
 			 * Temporary tags wind up having the ds object id
 			 * prepended. So even if we passed the length check
 			 * above, it's still possible for the tag to wind
 			 * up being slightly too long.
 			 */
 			(void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
 		} else if (holderr == EINVAL) {
 			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		} else if (holderr == EEXIST) {
 			(void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, holderr, errbuf);
 		}
 	}
 
 	fnvlist_free(errors);
 	return (ret);
 }
 
 /*
  * Per-dataset callback for zfs_release().  Builds "<zhp name>@<snapname>",
  * fetches that snapshot's holds with lzc_get_holds() and, if ha->tag is
  * among them, adds a name -> { tag } entry to ha->nvl.  When the lookup
  * fails or the tag is absent, ha->error is set to ENOENT or ESRCH
  * respectively.  When ha->recursive is set, child filesystems are visited
  * with the same callback.
  *
  * This function takes ownership of 'zhp' and closes it on every path.
  *
  * Returns EINVAL (and sets ha->error to EINVAL) if the snapshot name does not
  * fit in ZFS_MAX_DATASET_NAME_LEN; the truncated name is never looked up.
  * Otherwise returns 0 or the result of the recursive iteration.
  */
 static int
 zfs_release_one(zfs_handle_t *zhp, void *arg)
 {
 	struct holdarg *ha = arg;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int rv = 0;
 	nvlist_t *existing_holds = NULL;
 
 	if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name,
 	    ha->snapname) >= sizeof (name)) {
 		/*
 		 * Stop here: querying a truncated name could match some
 		 * other snapshot and would overwrite the EINVAL below.
 		 */
 		ha->error = EINVAL;
 		zfs_close(zhp);
 		return (EINVAL);
 	}
 
 	if (lzc_get_holds(name, &existing_holds) != 0) {
 		ha->error = ENOENT;
 	} else {
 		if (!nvlist_exists(existing_holds, ha->tag)) {
 			ha->error = ESRCH;
 		} else {
 			nvlist_t *torelease = fnvlist_alloc();
 			fnvlist_add_boolean(torelease, ha->tag);
 			fnvlist_add_nvlist(ha->nvl, name, torelease);
 			fnvlist_free(torelease);
 		}
 		/* The holds list was allocated for us; release it. */
 		nvlist_free(existing_holds);
 	}
 
 	if (ha->recursive)
 		rv = zfs_iter_filesystems_v2(zhp, 0, zfs_release_one, ha);
 	zfs_close(zhp);
 	return (rv);
 }
 
 /*
  * Release hold 'tag' from snapshot 'snapname' of zhp's dataset and, when
  * 'recursive' is set, from the same-named snapshot of its child filesystems.
  * zfs_release_one() collects the snapshots that actually carry the tag; the
  * resulting list is handed to lzc_release().
  *
  * If nothing was collected, the error recorded by zfs_release_one() is
  * reported and returned.  If lzc_release() fails, a generic "cannot release"
  * message is reported when it supplied no per-snapshot errors, and one
  * message is reported for every entry in the returned error list.
  *
  * Returns 0 on success, otherwise the recorded or lzc_release() error.
  */
 int
 zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
     boolean_t recursive)
 {
 	int ret;
 	struct holdarg ha;
 	nvlist_t *errors = NULL;
 	nvpair_t *elem;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[ERRBUFLEN];
 
 	ha.nvl = fnvlist_alloc();
 	ha.snapname = snapname;
 	ha.tag = tag;
 	ha.recursive = recursive;
 	ha.error = 0;
 	(void) zfs_release_one(zfs_handle_dup(zhp), &ha);
 
 	if (nvlist_empty(ha.nvl)) {
 		fnvlist_free(ha.nvl);
 		ret = ha.error;
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot release hold from snapshot '%s@%s'"),
 		    zhp->zfs_name, snapname);
 		if (ret == ESRCH) {
 			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, ret, errbuf);
 		}
 		return (ret);
 	}
 
 	ret = lzc_release(ha.nvl, &errors);
 	fnvlist_free(ha.nvl);
 
 	if (ret == 0) {
 		/* There may be errors even in the success case. */
 		fnvlist_free(errors);
 		return (0);
 	}
 
 	if (nvlist_empty(errors)) {
 		/* no hold-specific errors */
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot release"));
 		/*
 		 * Dispatch on the value lzc_release() returned rather than
 		 * on errno, which the intervening calls may have changed.
 		 */
 		switch (ret) {
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		default:
 			(void) zfs_standard_error(hdl, ret, errbuf);
 		}
 	}
 
 	for (elem = nvlist_next_nvpair(errors, NULL);
 	    elem != NULL;
 	    elem = nvlist_next_nvpair(errors, elem)) {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN,
 		    "cannot release hold from snapshot '%s'"),
 		    nvpair_name(elem));
 		switch (fnvpair_value_int32(elem)) {
 		case ESRCH:
 			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
 			break;
 		case EINVAL:
 			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		default:
 			(void) zfs_standard_error(hdl,
 			    fnvpair_value_int32(elem), errbuf);
 		}
 	}
 
 	fnvlist_free(errors);
 	return (ret);
 }
 
 /*
  * Fetch the permissions nvlist for zhp's dataset with ZFS_IOC_GET_FSACL and
  * unpack it into *nvl.  The dataset must be a volume or a filesystem
  * (asserted).
  *
  * The destination buffer starts at 2048 bytes.  If the ioctl fails with
  * ENOMEM, the buffer is freed and the call is retried with the size reported
  * back in zc.zc_nvlist_dst_size.
  *
  * Returns 0 on success; otherwise the value produced by zfs_error() or
  * zfs_standard_error*() for the failure.
  */
 int
 zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
 {
 	zfs_cmd_t zc = {"\0"};
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	int nvsz = 2048;
 	void *nvbuf;
 	int err = 0;
 	char errbuf[ERRBUFLEN];
 
 	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
 	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 /* Re-entered from the ENOMEM case below with an updated nvsz. */
 tryagain:
 
 	nvbuf = malloc(nvsz);
 	if (nvbuf == NULL) {
 		err = (zfs_error(hdl, EZFS_NOMEM, zfs_strerror(errno)));
 		goto out;
 	}
 
 	zc.zc_nvlist_dst_size = nvsz;
 	zc.zc_nvlist_dst = (uintptr_t)nvbuf;
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) {
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
 		    zc.zc_name);
 		switch (errno) {
 		case ENOMEM:
 			/* Buffer too small: retry at the reported size. */
 			free(nvbuf);
 			nvsz = zc.zc_nvlist_dst_size;
 			goto tryagain;
 
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		case EINVAL:
 			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		case ENOENT:
 			err = zfs_error(hdl, EZFS_NOENT, errbuf);
 			break;
 		default:
 			err = zfs_standard_error(hdl, errno, errbuf);
 			break;
 		}
 	} else {
 		/* success */
 		int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
 		if (rc) {
 			err = zfs_standard_error_fmt(hdl, rc, dgettext(
 			    TEXT_DOMAIN, "cannot get permissions on '%s'"),
 			    zc.zc_name);
 		}
 	}
 
 	/* The packed buffer is no longer needed once unpacked or on error. */
 	free(nvbuf);
 out:
 	return (err);
 }
 
 /*
  * Pack 'nvl' in native encoding and submit it for zhp's dataset with
  * ZFS_IOC_SET_FSACL.  'un' is stored in zc_perm_action.  The dataset must be
  * a volume or a filesystem (asserted).
  *
  * Returns the nvlist_pack() status (asserted to be 0) when the ioctl
  * succeeds; otherwise the value produced by zfs_error() or
  * zfs_standard_error() for the failure.
  */
 int
 zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = {"\0"};
 	char errbuf[ERRBUFLEN];
 	char *packed;
 	size_t packed_len;
 	int err;
 
 	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
 	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
 	err = nvlist_size(nvl, &packed_len, NV_ENCODE_NATIVE);
 	assert(err == 0);
 
 	packed = malloc(packed_len);
 
 	err = nvlist_pack(nvl, &packed, &packed_len, NV_ENCODE_NATIVE, 0);
 	assert(err == 0);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	zc.zc_perm_action = un;
 	zc.zc_nvlist_src = (uintptr_t)packed;
 	zc.zc_nvlist_src_size = packed_len;
 
 	if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) == 0) {
 		free(packed);
 		return (err);
 	}
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
 	    zc.zc_name);
 
 	/* Sampled after the message is formatted, as the dispatch needs. */
 	int ioerr = errno;
 
 	if (ioerr == ENOTSUP) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool must be upgraded"));
 		err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
 	} else if (ioerr == EINVAL) {
 		err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
 	} else if (ioerr == ENOENT) {
 		err = zfs_error(hdl, EZFS_NOENT, errbuf);
 	} else {
 		err = zfs_standard_error(hdl, ioerr, errbuf);
 	}
 
 	free(packed);
 
 	return (err);
 }
 
 /*
  * Fetch the holds on zhp's dataset into *nvl via lzc_get_holds().
  *
  * Returns 0 on success.  On failure an error is reported on zhp's library
  * handle and the value produced by zfs_error() or zfs_standard_error() is
  * returned.
  */
 int
 zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 {
 	int err;
 	char errbuf[ERRBUFLEN];
 
 	err = lzc_get_holds(zhp->zfs_name, nvl);
 
 	if (err != 0) {
 		libzfs_handle_t *hdl = zhp->zfs_hdl;
 
 		(void) snprintf(errbuf, sizeof (errbuf),
 		    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
 		    zhp->zfs_name);
 		switch (err) {
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
 			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
 			break;
 		case EINVAL:
 			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
 			break;
 		case ENOENT:
 			err = zfs_error(hdl, EZFS_NOENT, errbuf);
 			break;
 		default:
 			/*
 			 * Report the code lzc_get_holds() returned, i.e. the
 			 * same value the switch dispatched on, not errno.
 			 */
 			err = zfs_standard_error(hdl, err, errbuf);
 			break;
 		}
 	}
 
 	return (err);
 }
 
 /*
  * The theory of raidz space accounting
  *
  * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
  * will "reference" 128KB, even though it allocates more than that, to store the
  * parity information (and perhaps skip sectors). This concept of the
  * "referenced" (and other DMU space accounting) being lower than the allocated
  * space by a constant factor is called "raidz deflation."
  *
  * As mentioned above, the constant factor for raidz deflation assumes a 128KB
  * block size. However, zvols typically have a much smaller block size (default
  * 8KB). These smaller blocks may require proportionally much more parity
  * information (and perhaps skip sectors). In this case, the change to the
  * "referenced" property may be much more than the logical block size.
  *
  * Suppose a raidz vdev has 5 disks with ashift=12.  A 128k block may be written
  * as follows.
  *
  * +-------+-------+-------+-------+-------+
  * | disk1 | disk2 | disk3 | disk4 | disk5 |
  * +-------+-------+-------+-------+-------+
  * |  P0   |  D0   |  D8   |  D16  |  D24  |
  * |  P1   |  D1   |  D9   |  D17  |  D25  |
  * |  P2   |  D2   |  D10  |  D18  |  D26  |
  * |  P3   |  D3   |  D11  |  D19  |  D27  |
  * |  P4   |  D4   |  D12  |  D20  |  D28  |
  * |  P5   |  D5   |  D13  |  D21  |  D29  |
  * |  P6   |  D6   |  D14  |  D22  |  D30  |
  * |  P7   |  D7   |  D15  |  D23  |  D31  |
  * +-------+-------+-------+-------+-------+
  *
  * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
  * sectors.  The dataset's referenced will increase by 128k and the pool's
  * allocated and free properties will be adjusted by 160k.
  *
  * A 4k block written to the same raidz vdev will require two 4k sectors.  The
  * blank cells represent unallocated space.
  *
  * +-------+-------+-------+-------+-------+
  * | disk1 | disk2 | disk3 | disk4 | disk5 |
  * +-------+-------+-------+-------+-------+
  * |  P0   |  D0   |       |       |       |
  * +-------+-------+-------+-------+-------+
  *
  * Above, notice that the 4k block required one sector for parity and another
- * for data.  vdev_raidz_asize() will return 8k and as such the pool's allocated
- * and free properties will be adjusted by 8k.  The dataset will not be charged
- * 8k.  Rather, it will be charged a value that is scaled according to the
- * overhead of the 128k block on the same vdev.  This 8k allocation will be
- * charged 8k * 128k / 160k.  128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
- * calculated in the 128k block example above.
+ * for data.  vdev_raidz_psize_to_asize() will return 8k and as such the pool's
+ * allocated and free properties will be adjusted by 8k.  The dataset will not
+ * be charged 8k.  Rather, it will be charged a value that is scaled according
+ * to the overhead of the 128k block on the same vdev.  This 8k allocation will
+ * be charged 8k * 128k / 160k.  128k is from SPA_OLD_MAXBLOCKSIZE and 160k is
+ * as calculated in the 128k block example above.
  *
  * Every raidz allocation is sized to be a multiple of nparity+1 sectors.  That
  * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
  * allocations are a multiple of 3 sectors, and raidz3 allocations are a
  * multiple of 4 sectors.  When a block does not fill the required number of
  * sectors, skip blocks (sectors) are used.
  *
  * An 8k block being written to a raidz vdev may be written as follows:
  *
  * +-------+-------+-------+-------+-------+
  * | disk1 | disk2 | disk3 | disk4 | disk5 |
  * +-------+-------+-------+-------+-------+
  * |  P0   |  D0   |  D1   |  S0   |       |
  * +-------+-------+-------+-------+-------+
  *
  * In order to maintain the nparity+1 allocation size, a skip block (S0) was
  * added.  For this 8k block, the pool's allocated and free properties are
  * adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
  * 160k.  Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
  * the 128k block example above.
  *
  * The situation is slightly different for dRAID since the minimum allocation
  * size is the full group width.  The same 8K block above would be written as
  * follows in a dRAID group:
  *
  * +-------+-------+-------+-------+-------+
  * | disk1 | disk2 | disk3 | disk4 | disk5 |
  * +-------+-------+-------+-------+-------+
  * |  P0   |  D0   |  D1   |  S0   |  S1   |
  * +-------+-------+-------+-------+-------+
  *
  * Compression may lead to a variety of block sizes being written for the same
  * volume or file.  There is no clear way to reserve just the amount of space
  * that will be required, so the worst case (no compression) is assumed.
  * Note that metadata blocks will typically be compressed, so the reservation
  * size returned by zvol_volsize_to_reservation() will generally be slightly
  * larger than the maximum that the volume can reference.
  */
 
 /*
  * Derived from function of same name in module/zfs/vdev_raidz.c.  Returns the
  * amount of space (in bytes) that will be allocated for the specified block
  * size. Note that the "referenced" space accounted will be less than this, but
  * not necessarily equal to "blksize", due to RAIDZ deflation.
  */
 static uint64_t
-vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
     uint64_t blksize)
 {
 	uint64_t asize, ndata;
 
 	ASSERT3U(ndisks, >, nparity);
 	/* Disks left for data once the parity disks are set aside. */
 	ndata = ndisks - nparity;
 	/* Data sectors: blksize in units of (1 << ashift), rounded up. */
 	asize = ((blksize - 1) >> ashift) + 1;
 	/* Add nparity sectors per row of up to ndata data sectors. */
 	asize += nparity * ((asize + ndata - 1) / ndata);
 	/* Pad to a multiple of nparity + 1 sectors, then convert to bytes. */
 	asize = roundup(asize, nparity + 1) << ashift;
 
 	return (asize);
 }
 
 /*
  * Derived from function of same name in module/zfs/vdev_draid.c.  Returns the
  * amount of space (in bytes) that will be allocated for the specified block
  * size.
  */
 static uint64_t
-vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
+vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
     uint64_t blksize)
 {
 	ASSERT3U(ndisks, >, nparity);
 	/* Disks left for data once the parity disks are set aside. */
 	uint64_t ndata = ndisks - nparity;
 	/* Rows needed when each row holds ndata sectors of data. */
 	uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1;
 	/* Every row is charged for all ndisks sectors; result in bytes. */
 	uint64_t asize = (rows * ndisks) << ashift;
 
 	return (asize);
 }
 
 /*
  * Determine how much space will be allocated if it lands on the most space-
  * inefficient top-level vdev.  Returns the size in bytes required to store one
  * copy of the volume data.  See theory comment above.
  *
  * 'nblocks' is the number of data blocks and 'blksize' the size of each in
  * bytes.  Only raidz and draid top-level vdevs are examined; if the pool
  * config cannot be read, or no such vdev yields a non-zero estimate, the
  * result is simply nblocks * blksize.
  */
 static uint64_t
 volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
 {
 	nvlist_t *config, *tree, **vdevs;
 	uint_t nvdevs;
 	uint64_t ret = 0;
 
 	config = zpool_get_config(zhp, NULL);
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
 	    nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
 	    &vdevs, &nvdevs) != 0) {
 		return (nblocks * blksize);
 	}
 
 	for (int v = 0; v < nvdevs; v++) {
 		const char *type;
 		uint64_t nparity, ashift, asize, tsize;
 		uint64_t volsize;
 
 		/* Skip any vdev whose config lacks a field we need. */
 		if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
 		    &type) != 0)
 			continue;
 
 		if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 &&
 		    strcmp(type, VDEV_TYPE_DRAID) != 0)
 			continue;
 
 		if (nvlist_lookup_uint64(vdevs[v],
 		    ZPOOL_CONFIG_NPARITY, &nparity) != 0)
 			continue;
 
 		if (nvlist_lookup_uint64(vdevs[v],
 		    ZPOOL_CONFIG_ASHIFT, &ashift) != 0)
 			continue;
 
 		if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
 			nvlist_t **disks;
 			uint_t ndisks;
 
 			/* For raidz the width is the number of children. */
 			if (nvlist_lookup_nvlist_array(vdevs[v],
 			    ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0)
 				continue;
 
 			/* allocation size for the "typical" 128k block */
-			tsize = vdev_raidz_asize(ndisks, nparity, ashift,
-			    SPA_OLD_MAXBLOCKSIZE);
+			tsize = vdev_raidz_psize_to_asize(ndisks, nparity,
+			    ashift, SPA_OLD_MAXBLOCKSIZE);
 
 			/* allocation size for the blksize block */
-			asize = vdev_raidz_asize(ndisks, nparity, ashift,
-			    blksize);
+			asize = vdev_raidz_psize_to_asize(ndisks, nparity,
+			    ashift, blksize);
 		} else {
 			uint64_t ndata;
 
 			/* For draid the width is ndata + nparity. */
 			if (nvlist_lookup_uint64(vdevs[v],
 			    ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0)
 				continue;
 
 			/* allocation size for the "typical" 128k block */
-			tsize = vdev_draid_asize(ndata + nparity, nparity,
-			    ashift, SPA_OLD_MAXBLOCKSIZE);
+			tsize = vdev_draid_psize_to_asize(ndata + nparity,
+			    nparity, ashift, SPA_OLD_MAXBLOCKSIZE);
 
 			/* allocation size for the blksize block */
-			asize = vdev_draid_asize(ndata + nparity, nparity,
-			    ashift, blksize);
+			asize = vdev_draid_psize_to_asize(ndata + nparity,
+			    nparity, ashift, blksize);
 		}
 
 		/*
 		 * Scale this size down as a ratio of 128k / tsize.
 		 * See theory statement above.
 		 *
 		 * Bitshift is to avoid the case of nblocks * asize < tsize
 		 * producing a size of 0.
 		 */
 		volsize = (nblocks * asize) / (tsize >> SPA_MINBLOCKSHIFT);
 		/*
 		 * If we would blow UINT64_MAX with this next multiplication,
 		 * don't.
 		 */
 		if (volsize >
 		    (UINT64_MAX / (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT)))
 			volsize = UINT64_MAX;
 		else
 			volsize *= (SPA_OLD_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 		/* Keep the largest (least space-efficient) estimate. */
 		if (volsize > ret) {
 			ret = volsize;
 		}
 	}
 
 	/* No raidz/draid vdev contributed: fall back to the raw size. */
 	if (ret == 0) {
 		ret = nblocks * blksize;
 	}
 
 	return (ret);
 }
 
 /*
  * Translate a zvol's volume size into the reservation needed to back it.
  * The "copies" and "volblocksize" values are taken from 'props' when present
  * (defaulting to 1 and ZVOL_DEFAULT_BLOCKSIZE).  See theory comment above.
  *
  * Note: If this routine is updated, it is necessary to update the ZFS test
  * suite's shell version in reservation.shlib.
  */
 uint64_t
 zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
     nvlist_t *props)
 {
 	const char *copies_str;
 	uint64_t blksz, datablks, lvlblks, metablks;
 	int ncopies = 1;
 
 	if (nvlist_lookup_string(props,
 	    zfs_prop_to_name(ZFS_PROP_COPIES), &copies_str) == 0)
 		ncopies = atoi(copies_str);
 
 	if (nvlist_lookup_uint64(props,
 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &blksz) != 0)
 		blksz = ZVOL_DEFAULT_BLOCKSIZE;
 
 	datablks = volsize / blksz;
 
 	/*
 	 * Metadata defaults to using 128k blocks, not volblocksize blocks.  For
 	 * this reason, only the data blocks are scaled based on vdev config.
 	 */
 	volsize = volsize_from_vdevs(zph, datablks, blksz);
 
 	/* start with metadnode L0-L6, then add one count per indirect level */
 	metablks = 7;
 	for (lvlblks = datablks; lvlblks > 1; ) {
 		lvlblks = (lvlblks + (DNODES_PER_LEVEL - 1)) /
 		    DNODES_PER_LEVEL;
 		metablks += lvlblks;
 	}
 	metablks *= MIN(SPA_DVAS_PER_BP, ncopies + 1);
 
 	/*
 	 * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't
 	 * compressed, but in practice they compress down to about
 	 * 1100 bytes
 	 */
 	metablks *= 1ULL << DN_MAX_INDBLKSHIFT;
 
 	return (volsize * ncopies + metablks);
 }
 
 /*
  * Wait for 'activity' on zhp's dataset via lzc_wait_fs(); whether any
  * waiting actually took place is reported through 'waited'.
  *
  * A dataset that no longer exists (ENOENT) is not treated as a failure:
  * '*missing' is set and 0 is returned without printing anything.  This suits
  * callers that poll in a loop over a long period (zfs's wait cmd, for
  * example), where a fs being exported or destroyed is a normal event.
  *
  * Any other error is reported on the library handle and returned.
  */
 int
 zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity,
     boolean_t *missing, boolean_t *waited)
 {
 	int error = lzc_wait_fs(zhp->zfs_name, activity, waited);
 
 	if (error == ENOENT) {
 		*missing = B_TRUE;
 		return (0);
 	}
 	*missing = B_FALSE;
 
 	if (error == 0)
 		return (error);
 
 	(void) zfs_standard_error_fmt(zhp->zfs_hdl, error,
 	    dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"),
 	    zhp->zfs_name);
 
 	return (error);
 }
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 7acf37ba9cd7..b75d1ccea685 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -1,1297 +1,1298 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
  */
 
 #include <sys/zfs_context.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/file.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_os.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <vm/vm_page.h>
 #include <geom/geom.h>
 #include <geom/geom_disk.h>
 #include <geom/geom_int.h>
 
 #ifndef g_topology_locked
 #define	g_topology_locked()	sx_xlocked(&topology_lock)
 #endif
 
 /*
  * Virtual device vector for GEOM.
  */
 
 /* GEOM class under which all ZFS vdev geoms and consumers are created. */
 static g_attrchanged_t vdev_geom_attrchanged;
 struct g_class zfs_vdev_class = {
 	.name = "ZFS::VDEV",
 	.version = G_VERSION,
 	.attrchanged = vdev_geom_attrchanged,
 };
 
 /*
  * One entry in a consumer's list of vdevs.  An entry is added for each vdev
  * that opens the consumer and removed again when that vdev closes it.
  */
 struct consumer_vdev_elem {
 	SLIST_ENTRY(consumer_vdev_elem)	elems;
 	vdev_t	*vd;
 };
 
 /*
  * List head type for the entries above.  The code in this file casts
  * &cp->private to a struct consumer_priv_t *, i.e. it uses the consumer's
  * 'private' field itself as the list head, so check at compile time that
  * the field is pointer sized.
  */
 SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
 _Static_assert(
     sizeof (((struct g_consumer *)NULL)->private) ==
     sizeof (struct consumer_priv_t *),
 	"consumer_priv_t* can't be stored in g_consumer.private");
 
 DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
 
 SYSCTL_DECL(_vfs_zfs_vdev);
 /* Don't send BIO_FLUSH. */
 static int vdev_geom_bio_flush_disable;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
 	&vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
 /* Don't send BIO_DELETE. */
 static int vdev_geom_bio_delete_disable;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
 	&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
 
 /* Declare local functions */
 static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
 
 /*
  * Thread local storage used to indicate when a thread is probing geoms
  * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
  * it is looking for a replacement for the vdev_t* that is its value.
  */
 uint_t zfs_geom_probe_vdev_key;
 
 /*
  * Refresh vd->vdev_physpath from the provider's "GEOM::physpath" attribute.
  *
  * If the attribute can be read, the stored path is replaced.  A config
  * update is requested when the new path differs from the previous one; when
  * there was no previous path, an update is requested only if the caller
  * set 'do_null_update'.  If the attribute cannot be read, nothing changes.
  */
 static void
 vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
     boolean_t do_null_update)
 {
 	boolean_t needs_update = B_FALSE;
 	int buflen = MAXPATHLEN;
 	char *buf = g_malloc(buflen, M_WAITOK|M_ZERO);
 	int error;
 
 	error = g_io_getattr("GEOM::physpath", cp, &buflen, buf);
 	if (error == 0) {
 		char *prev;
 
 		/* g_topology lock ensures that vdev has not been closed */
 		g_topology_assert();
 		prev = vd->vdev_physpath;
 		vd->vdev_physpath = spa_strdup(buf);
 
 		if (prev == NULL) {
 			needs_update = do_null_update;
 		} else {
 			needs_update = (strcmp(prev,
 			    vd->vdev_physpath) != 0);
 			spa_strfree(prev);
 		}
 	}
 	g_free(buf);
 
 	/* Only a changed (or newly wanted) path is worth a config update. */
 	if (needs_update)
 		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
 
 }
 
 /*
  * GEOM attribute-change callback.  Only "GEOM::physpath" is acted upon, and
  * only for the first vdev on the consumer's list: the physical path of that
  * vdev is refreshed, requesting a config update even if no path was set
  * before.
  */
 static void
 vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
 {
 	struct consumer_priv_t *priv;
 	struct consumer_vdev_elem *elem;
 
 	priv = (struct consumer_priv_t *)&cp->private;
 	if (SLIST_EMPTY(priv))
 		return;
 
 	/* No other attribute is of interest. */
 	if (strcmp(attr, "GEOM::physpath") != 0)
 		return;
 
 	SLIST_FOREACH(elem, priv, elems) {
 		vdev_geom_set_physpath(elem->vd, cp, /* null_update */B_TRUE);
 		/* The head element is the only one that gets updated. */
 		break;
 	}
 }
 
 /*
  * GEOM resize callback.  Requests an expanding online of every vdev on the
  * consumer's list that is healthy and whose pool has autoexpand enabled.
  */
 static void
 vdev_geom_resize(struct g_consumer *cp)
 {
 	struct consumer_priv_t *priv;
 	struct consumer_vdev_elem *elem;
 
 	priv = (struct consumer_priv_t *)&cp->private;
 
 	/* An empty list simply yields no iterations. */
 	SLIST_FOREACH(elem, priv, elems) {
 		vdev_t *vd = elem->vd;
 
 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 		    vd->vdev_spa->spa_autoexpand) {
 			vdev_online(vd->vdev_spa, vd->vdev_guid,
 			    ZFS_ONLINE_EXPAND, NULL);
 		}
 	}
 }
 
 /*
  * GEOM orphan callback (installed as gp->orphan by vdev_geom_attach()).
  * Marks every vdev on the consumer's list as wanting removal and asks its
  * SPA for an asynchronous remove; the consumer itself is not detached here.
  * Asserts that the GEOM topology lock is held.
  */
 static void
 vdev_geom_orphan(struct g_consumer *cp)
 {
 	struct consumer_priv_t *priv;
 	// cppcheck-suppress uninitvar
 	struct consumer_vdev_elem *elem;
 
 	g_topology_assert();
 
 	priv = (struct consumer_priv_t *)&cp->private;
 	if (SLIST_EMPTY(priv))
 		/* Vdev close in progress.  Ignore the event. */
 		return;
 
 	/*
 	 * Orphan callbacks occur from the GEOM event thread.
 	 * Concurrent with this call, new I/O requests may be
 	 * working their way through GEOM about to find out
 	 * (only once executed by the g_down thread) that we've
 	 * been orphaned from our disk provider.  These I/Os
 	 * must be retired before we can detach our consumer.
 	 * This is most easily achieved by acquiring the
 	 * SPA ZIO configuration lock as a writer, but doing
 	 * so with the GEOM topology lock held would cause
 	 * a lock order reversal.  Instead, rely on the SPA's
 	 * async removal support to invoke a close on this
 	 * vdev once it is safe to do so.
 	 */
 	SLIST_FOREACH(elem, priv, elems) {
 		// cppcheck-suppress uninitvar
 		vdev_t *vd = elem->vd;
 
 		vd->vdev_remove_wanted = B_TRUE;
 		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
 	}
 }
 
 /*
  * Attach the ZFS vdev class to provider 'pp' and take one read and one
  * exclusive access reference on it (g_access(cp, 1, 0, 1)).
  *
  * All consumers hang off a single geom named "zfs::vdev", which is created
  * on first use.  If that geom already has a consumer for 'pp', the consumer
  * is reused and only its access counts are raised.
  *
  * When 'sanity' is set, providers whose sector size exceeds VDEV_PAD_SIZE
  * or is not a power of two, or whose media size is below SPA_MINDEVSIZE,
  * are rejected.  When 'vd' is not NULL the consumer is stored in
  * vd->vdev_tsd.
  *
  * Returns the consumer, or NULL on failure.  Asserts that the topology
  * lock is held.
  */
 static struct g_consumer *
 vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Attaching to %s.", pp->name);
 
 	if (sanity) {
 		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
 			ZFS_LOG(1, "Failing attach of %s. "
 			    "Incompatible sectorsize %d\n",
 			    pp->name, pp->sectorsize);
 			return (NULL);
 		} else if (pp->mediasize < SPA_MINDEVSIZE) {
 			ZFS_LOG(1, "Failing attach of %s. "
 			    "Incompatible mediasize %ju\n",
 			    pp->name, pp->mediasize);
 			return (NULL);
 		}
 	}
 
 	/* Do we have geom already? No? Create one. */
 	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
 		if (gp->flags & G_GEOM_WITHER)
 			continue;
 		if (strcmp(gp->name, "zfs::vdev") != 0)
 			continue;
 		break;
 	}
 	if (gp == NULL) {
 		/* No usable geom yet: create it together with a consumer. */
 		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
 		gp->orphan = vdev_geom_orphan;
 		gp->attrchanged = vdev_geom_attrchanged;
 		gp->resize = vdev_geom_resize;
 		cp = g_new_consumer(gp);
 		error = g_attach(cp, pp);
 		if (error != 0) {
 			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
 			    __LINE__, error);
 			/* No access references were taken, hence B_FALSE. */
 			vdev_geom_detach(cp, B_FALSE);
 			return (NULL);
 		}
 		error = g_access(cp, 1, 0, 1);
 		if (error != 0) {
 			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
 			    __LINE__, error);
 			vdev_geom_detach(cp, B_FALSE);
 			return (NULL);
 		}
 		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
 	} else {
 		/* Check if we are already connected to this provider. */
 		LIST_FOREACH(cp, &gp->consumer, consumer) {
 			if (cp->provider == pp) {
 				ZFS_LOG(1, "Found consumer for %s.", pp->name);
 				break;
 			}
 		}
 		if (cp == NULL) {
 			/* Existing geom, but no consumer for this provider. */
 			cp = g_new_consumer(gp);
 			error = g_attach(cp, pp);
 			if (error != 0) {
 				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
 				    __func__, __LINE__, error);
 				vdev_geom_detach(cp, B_FALSE);
 				return (NULL);
 			}
 			error = g_access(cp, 1, 0, 1);
 			if (error != 0) {
 				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
 				    __func__, __LINE__, error);
 				vdev_geom_detach(cp, B_FALSE);
 				return (NULL);
 			}
 			ZFS_LOG(1, "Created consumer for %s.", pp->name);
 		} else {
 			/*
 			 * Reuse the consumer.  On failure it is left in
 			 * place: no detach is done for a consumer this call
 			 * did not create.
 			 */
 			error = g_access(cp, 1, 0, 1);
 			if (error != 0) {
 				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
 				    __func__, __LINE__, error);
 				return (NULL);
 			}
 			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
 		}
 	}
 
 	if (vd != NULL)
 		vd->vdev_tsd = cp;
 
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	return (cp);
 }
 
 /*
  * Undo a vdev_geom_attach() on consumer 'cp'.
  *
  * If 'open_for_read' is set, one read and one exclusive access reference
  * are dropped first.  Once the consumer has no read or exclusive references
  * left, any write references are dropped as well, the consumer is detached
  * from its provider (if it still has one) and destroyed.  The geom is
  * withered when it has no consumers left.  Asserts that the topology lock
  * is held.
  */
 static void
 vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
 {
 	struct g_geom *gp;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Detaching from %s.",
 	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
 
 	/* Remember the geom now; 'cp' may be destroyed below. */
 	gp = cp->geom;
 	if (open_for_read)
 		g_access(cp, -1, 0, -1);
 	/* Destroy consumer on last close. */
 	if (cp->acr == 0 && cp->ace == 0) {
 		if (cp->acw > 0)
 			g_access(cp, 0, -cp->acw, 0);
 		if (cp->provider != NULL) {
 			ZFS_LOG(1, "Destroying consumer for %s.",
 			    cp->provider->name ? cp->provider->name : "NULL");
 			g_detach(cp);
 		}
 		g_destroy_consumer(cp);
 	}
 	/* Destroy geom if there are no consumers left. */
 	if (LIST_EMPTY(&gp->consumer)) {
 		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
 		g_wither_geom(gp, ENXIO);
 	}
 }
 
 /*
  * Close the consumer used by 'vd'.  Clears vdev_delayed_close and vdev_tsd,
  * removes and frees every list entry on the consumer that refers to 'vd',
  * and releases the consumer through vdev_geom_detach().  Does nothing more
  * than clearing vdev_delayed_close when the vdev has no consumer.  Asserts
  * that the topology lock is held.
  */
 static void
 vdev_geom_close_locked(vdev_t *vd)
 {
 	struct consumer_vdev_elem *elem, *next;
 	struct consumer_priv_t *priv;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = vd->vdev_tsd;
 	vd->vdev_delayed_close = B_FALSE;
 	if (cp == NULL)
 		return;
 
 	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
 	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
 	priv = (struct consumer_priv_t *)&cp->private;
 	vd->vdev_tsd = NULL;
 
 	/* The _SAFE variant is needed because entries are freed in place. */
 	SLIST_FOREACH_SAFE(elem, priv, elems, next) {
 		if (elem->vd != vd)
 			continue;
 		SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
 		g_free(elem);
 	}
 
 	vdev_geom_detach(cp, B_TRUE);
 }
 
 /*
  * Issue one or more bios to the vdev in parallel and wait for all of them.
  *
  * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each
  * IO operation is described by parallel entries from each array.  There may
  * be more bios actually issued than entries in the array, because an
  * operation larger than the per-bio limit is split into several bios.
  *
  * errors[i] is an in/out value: on return it is non-zero if it was non-zero
  * on entry or if any bio belonging to operation i failed.
  */
 static void
 vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
     off_t *sizes, int *errors, int ncmds)
 {
 	struct bio **bios;
 	uint8_t *p;
 	off_t off, maxio, s, end;
 	int i, n_bios, j;
 	size_t bios_size;
 
 	/* Per-bio limit: maxphys rounded down to a sector size multiple. */
 	maxio = maxphys - (maxphys % cp->provider->sectorsize);
 	n_bios = 0;
 
 	/* How many bios are required for all commands ? */
 	for (i = 0; i < ncmds; i++)
 		n_bios += (sizes[i] + maxio - 1) / maxio;
 
 	/* Allocate memory for the bios */
 	bios_size = n_bios * sizeof (struct bio *);
 	bios = kmem_zalloc(bios_size, KM_SLEEP);
 
 	/* Prepare and issue all of the bios */
 	for (i = j = 0; i < ncmds; i++) {
 		off = offsets[i];
 		p = datas[i];
 		s = sizes[i];
 		end = off + s;
 		ASSERT0(off % cp->provider->sectorsize);
 		ASSERT0(s % cp->provider->sectorsize);
 
 		/* 's' tracks the bytes still to be covered by this command. */
 		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
 			bios[j] = g_alloc_bio();
 			bios[j]->bio_cmd = cmds[i];
 			bios[j]->bio_done = NULL;
 			bios[j]->bio_offset = off;
 			bios[j]->bio_length = MIN(s, maxio);
 			bios[j]->bio_data = (caddr_t)p;
 			g_io_request(bios[j], cp);
 		}
 	}
 	ASSERT3S(j, ==, n_bios);
 
 	/*
 	 * Wait for all of the bios to complete, and clean them up.  This
 	 * walk must visit the bios in the same order as the issue loop above
 	 * so that bios[j] is attributed to the right command.
 	 */
 	for (i = j = 0; i < ncmds; i++) {
 		off = offsets[i];
 		s = sizes[i];
 		end = off + s;
 
 		for (; off < end; off += maxio, s -= maxio, j++) {
 			/* Logical OR: errors[i] becomes 0 or 1, not an errno. */
 			errors[i] = biowait(bios[j], "vdev_geom_io") ||
 			    errors[i];
 			g_destroy_bio(bios[j]);
 		}
 	}
 	kmem_free(bios, bios_size);
 }
 
 /*
  * Read the vdev config from a device.  Return the number of valid labels that
  * were found.  The vdev config will be returned in config if and only if at
  * least one valid label was found; otherwise *configp is set to NULL.  When
  * several labels are valid, the config from the last valid one is returned
  * and the others are freed.
  *
  * Must be called without the topology lock held.
  */
 static int
 vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
 {
 	struct g_provider *pp;
 	nvlist_t *config;
 	vdev_phys_t *vdev_lists[VDEV_LABELS];
 	char *buf;
 	size_t buflen;
 	uint64_t psize, state, txg;
 	off_t offsets[VDEV_LABELS];
 	off_t size;
 	off_t sizes[VDEV_LABELS];
 	int cmds[VDEV_LABELS];
 	int errors[VDEV_LABELS];
 	int l, nlabels;
 
 	g_topology_assert_not();
 
 	pp = cp->provider;
 	ZFS_LOG(1, "Reading config from %s...", pp->name);
 
 	/* Label offsets are computed from a label-aligned device size. */
 	psize = pp->mediasize;
 	psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t);
 
 	/* sizeof (vdev_phys_t) rounded up to a multiple of the sector size. */
 	size = sizeof (*vdev_lists[0]) + pp->sectorsize -
 	    ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
 
 	buflen = sizeof (vdev_lists[0]->vp_nvlist);
 
 	/* Create all of the IO requests */
 	for (l = 0; l < VDEV_LABELS; l++) {
 		cmds[l] = BIO_READ;
 		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
 		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
 		sizes[l] = size;
 		errors[l] = 0;
 		ASSERT0(offsets[l] % pp->sectorsize);
 	}
 
 	/* Issue the IO requests */
 	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
 	    VDEV_LABELS);
 
 	/* Parse the labels */
 	config = *configp = NULL;
 	nlabels = 0;
 	for (l = 0; l < VDEV_LABELS; l++) {
 		/* Skip labels whose read failed. */
 		if (errors[l] != 0)
 			continue;
 
 		buf = vdev_lists[l]->vp_nvlist;
 
 		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
 			continue;
 
 		/* A valid label has a pool state within the known range. */
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 		    &state) != 0 || state > POOL_STATE_L2CACHE) {
 			nvlist_free(config);
 			continue;
 		}
 
 		/* Except for spares and L2ARC devices, a non-zero txg is required. */
 		if (state != POOL_STATE_SPARE &&
 		    state != POOL_STATE_L2CACHE &&
 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 		    &txg) != 0 || txg == 0)) {
 			nvlist_free(config);
 			continue;
 		}
 
 		/* Keep only the most recently parsed valid config. */
 		if (*configp != NULL)
 			nvlist_free(*configp);
 		*configp = config;
 		nlabels++;
 	}
 
 	/* Free the label storage */
 	for (l = 0; l < VDEV_LABELS; l++)
 		kmem_free(vdev_lists[l], size);
 
 	return (nlabels);
 }
 
 /*
  * Make sure the '*configs' array has a slot for index 'id'.  If it is too
  * short, a zero-filled array of id + 1 entries replaces it, the existing
  * entries are carried over, the old array is freed, and '*count' is updated.
  */
 static void
 resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
 {
 	nvlist_t **grown;
 	uint64_t i;
 
 	/* Already large enough. */
 	if (id < *count)
 		return;
 
 	grown = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
 	    KM_SLEEP);
 	i = 0;
 	while (i < *count) {
 		grown[i] = (*configs)[i];
 		i++;
 	}
 	if (*configs != NULL)
 		kmem_free(*configs, *count * sizeof (void *));
 
 	*count = id + 1;
 	*configs = grown;
 }
 
 /*
  * Consider label config 'cfg' for inclusion in the '*configs' array, which
  * is indexed by the ZPOOL_CONFIG_ID found in the config's vdev tree.
  *
  * The config is ignored when its pool name differs from 'name', when any of
  * the pool guid, top guid, vdev tree or id is missing, when its pool guid
  * differs from '*known_pool_guid' (which is latched from the first config
  * that gets this far), or when the slot already holds a config whose txg
  * is the same or newer.
  *
  * This function takes ownership of 'cfg': it is either stored in the array
  * or freed.
  */
 static void
 process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
     const char *name, uint64_t *known_pool_guid)
 {
 	nvlist_t *vdev_tree;
 	uint64_t pool_guid;
 	uint64_t vdev_guid;
 	uint64_t id, txg, known_txg;
 	const char *pname;
 
 	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
 	    strcmp(pname, name) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
 		goto ignore;
 
 	/* Only presence is checked; the value itself is not used here. */
 	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
 		goto ignore;
 
 	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
 		goto ignore;
 
 	/*
 	 * NOTE(review): the result of this lookup is not checked, so the txg
 	 * is presumably expected to be present in every config that reaches
 	 * this point -- confirm against the callers.
 	 */
 	txg = fnvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG);
 
 	if (*known_pool_guid != 0) {
 		if (pool_guid != *known_pool_guid)
 			goto ignore;
 	} else
 		*known_pool_guid = pool_guid;
 
 	resize_configs(configs, count, id);
 
 	/* An occupied slot is only replaced by a strictly newer config. */
 	if ((*configs)[id] != NULL) {
 		known_txg = fnvlist_lookup_uint64((*configs)[id],
 		    ZPOOL_CONFIG_POOL_TXG);
 		if (txg <= known_txg)
 			goto ignore;
 		nvlist_free((*configs)[id]);
 	}
 
 	(*configs)[id] = cfg;
 	return;
 
 ignore:
 	nvlist_free(cfg);
 }
 
 /*
  * Scan the providers of every GEOM class except the ZFS vdev class for
  * labels that belong to pool 'name', and collect their configs in
  * '*configs', an array of '*count' entries filled in by
  * process_vdev_config().  Both outputs are reset before the scan.
  *
  * The topology lock is taken here and dropped temporarily while each
  * provider's labels are read.
  *
  * Returns 0 if '*count' ends up non-zero, ENOENT otherwise.
  */
 int
 vdev_geom_read_pool_label(const char *name,
     nvlist_t ***configs, uint64_t *count)
 {
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *zcp;
 	nvlist_t *vdev_cfg;
 	uint64_t pool_guid;
 	int nlabels;
 
 	DROP_GIANT();
 	g_topology_lock();
 
 	*configs = NULL;
 	*count = 0;
 	pool_guid = 0;
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp == &zfs_vdev_class)
 			continue;
 		LIST_FOREACH(gp, &mp->geom, geom) {
 			if (gp->flags & G_GEOM_WITHER)
 				continue;
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				if (pp->flags & G_PF_WITHER)
 					continue;
 				/* Temporary tasting consumer, not tied to a vdev. */
 				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
 				if (zcp == NULL)
 					continue;
 				/* Label reads must run without the topology lock. */
 				g_topology_unlock();
 				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
 				g_topology_lock();
 				vdev_geom_detach(zcp, B_TRUE);
 				if (nlabels == 0)
 					continue;
 				ZFS_LOG(1, "successfully read vdev config");
 
 				/* vdev_cfg is consumed (stored or freed) here. */
 				process_vdev_config(configs, count,
 				    vdev_cfg, name, &pool_guid);
 			}
 		}
 	}
 	g_topology_unlock();
 	PICKUP_GIANT();
 
 	return (*count > 0 ? 0 : ENOENT);
 }
 
 /*
  * Grade of a provider's labels against a vdev, as produced by
  * vdev_attach_ok(); larger values are better matches.  The label-count
  * grades are laid out so that ZERO_MATCH + <number of valid labels> yields
  * the corresponding enumerator.
  */
 enum match {
 	NO_MATCH = 0,			/* No matching labels found */
 	TOPGUID_MATCH = 1,		/* Labels match top guid, not vdev guid */
 	ZERO_MATCH = TOPGUID_MATCH,	/* Should never be returned */
 	ONE_MATCH,			/* 1 label matching the vdev_guid */
 	TWO_MATCH,			/* 2 label matching the vdev_guid */
 	THREE_MATCH,			/* 3 label matching the vdev_guid */
 	FULL_MATCH			/* all labels match the vdev_guid */
 };
 
 /*
  * Taste provider 'pp' and grade how well its label matches vdev 'vd'.
  *
  * Returns NO_MATCH if the provider cannot be attached, has no valid label,
  * carries a non-zero pool guid that differs from vd's pool, or passes
  * neither guid check below.  Returns ZERO_MATCH + <valid label count> when
  * the label's vdev guid equals vd->vdev_guid, or TOPGUID_MATCH when only
  * the label's top-level guid equals it and 'vd' is itself a top-level vdev.
  *
  * Expects the topology lock to be held; the lock is dropped while the
  * labels are read and re-taken afterwards.
  */
 static enum match
 vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
 {
 	nvlist_t *config;
 	uint64_t pool_guid, top_guid, vdev_guid;
 	struct g_consumer *cp;
 	int nlabels;
 
 	/* Temporary tasting consumer; 'vd' is deliberately not passed. */
 	cp = vdev_geom_attach(pp, NULL, B_TRUE);
 	if (cp == NULL) {
 		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
 		    pp->name);
 		return (NO_MATCH);
 	}
 	g_topology_unlock();
 	nlabels = vdev_geom_read_config(cp, &config);
 	g_topology_lock();
 	vdev_geom_detach(cp, B_TRUE);
 	if (nlabels == 0) {
 		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
 		return (NO_MATCH);
 	}
 
 	/* A guid missing from the label is treated as 0. */
 	pool_guid = 0;
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
 	top_guid = 0;
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
 	vdev_guid = 0;
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
 	nvlist_free(config);
 
 	/*
 	 * Check that the label's pool guid matches the desired guid.
 	 * Inactive spares and L2ARCs do not have any pool guid in the label.
 	 */
 	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
 		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
 		    pp->name,
 		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
 		return (NO_MATCH);
 	}
 
 	/*
 	 * Check that the label's vdev guid matches the desired guid.
 	 * The second condition handles possible race on vdev detach, when
 	 * remaining vdev receives GUID of destroyed top level mirror vdev.
 	 */
 	if (vdev_guid == vd->vdev_guid) {
 		ZFS_LOG(1, "guids match for provider %s.", pp->name);
 		/* Grade by the number of valid labels that were read. */
 		return (ZERO_MATCH + nlabels);
 	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
 		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
 		return (TOPGUID_MATCH);
 	}
 	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
 	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
 	return (NO_MATCH);
 }
 
 /*
  * Search the providers of every GEOM class except the ZFS vdev class for
  * the one whose label best matches 'vd' (as graded by vdev_attach_ok()),
  * and attach to it.
  *
  * Among providers with the same grade, one whose name equals the vdev's
  * recorded path (without the "/dev/" prefix) is preferred.  The search
  * stops early at the first FULL_MATCH.
  *
  * Returns the attached consumer, or NULL if nothing matched or the attach
  * failed.  Asserts that the topology lock is held.
  */
 static struct g_consumer *
 vdev_geom_attach_by_guids(vdev_t *vd)
 {
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct g_provider *pp, *best_pp;
 	struct g_consumer *cp;
 	const char *vdpath;
 	enum match match, best_match;
 
 	g_topology_assert();
 
 	/* Provider names carry no "/dev/" prefix; skip it for comparisons. */
 	vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
 	cp = NULL;
 	best_pp = NULL;
 	best_match = NO_MATCH;
 	LIST_FOREACH(mp, &g_classes, class) {
 		if (mp == &zfs_vdev_class)
 			continue;
 		LIST_FOREACH(gp, &mp->geom, geom) {
 			if (gp->flags & G_GEOM_WITHER)
 				continue;
 			LIST_FOREACH(pp, &gp->provider, provider) {
 				match = vdev_attach_ok(vd, pp);
 				if (match > best_match) {
 					best_match = match;
 					best_pp = pp;
 				} else if (match == best_match) {
 					/* Tie: prefer the recorded path. */
 					if (strcmp(pp->name, vdpath) == 0) {
 						best_pp = pp;
 					}
 				}
 				if (match == FULL_MATCH)
 					goto out;
 			}
 		}
 	}
 
 out:
 	if (best_pp) {
 		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
 		if (cp == NULL) {
 			printf("ZFS WARNING: Unable to attach to %s.\n",
 			    best_pp->name);
 		}
 	}
 	return (cp);
 }
 
 /*
  * Locate and attach the provider for 'vd' by its guids.  On success the
  * vdev's path is replaced with "/dev/<provider name>" and the consumer is
  * returned; otherwise NULL is returned and the path is left alone.  Asserts
  * that the topology lock is held.
  */
 static struct g_consumer *
 vdev_geom_open_by_guids(vdev_t *vd)
 {
 	struct g_consumer *cp;
 	char *path;
 	size_t pathlen;
 
 	g_topology_assert();
 
 	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
 	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
 
 	cp = vdev_geom_attach_by_guids(vd);
 	if (cp == NULL) {
 		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
 		    (uintmax_t)spa_guid(vd->vdev_spa),
 		    (uintmax_t)vd->vdev_guid);
 		return (cp);
 	}
 
 	/* Room for the prefix, the provider name and the terminating NUL. */
 	pathlen = strlen("/dev/") + strlen(cp->provider->name) + 1;
 	path = kmem_alloc(pathlen, KM_SLEEP);
 	snprintf(path, pathlen, "/dev/%s", cp->provider->name);
 
 	spa_strfree(vd->vdev_path);
 	vd->vdev_path = path;
 
 	ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
 	    (uintmax_t)spa_guid(vd->vdev_spa),
 	    (uintmax_t)vd->vdev_guid, cp->provider->name);
 
 	return (cp);
 }
 
 /*
  * Attach to the provider named by the vdev's recorded path (with the
  * "/dev/" prefix skipped).  When 'check_guid' is non-zero the provider is
  * only accepted if vdev_attach_ok() grades it FULL_MATCH.  Returns the
  * consumer, or NULL if there is no such provider, the guid check fails, or
  * the attach fails.  Asserts that the topology lock is held.
  */
 static struct g_consumer *
 vdev_geom_open_by_path(vdev_t *vd, int check_guid)
 {
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
 	if (pp == NULL)
 		return (NULL);
 
 	ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
 	if (check_guid && vdev_attach_ok(vd, pp) != FULL_MATCH)
 		return (NULL);
 
 	return (vdev_geom_attach(pp, vd, B_FALSE));
 }
 
 /*
  * vdev_op_open handler: locate and open the GEOM provider backing 'vd' and
  * report its geometry.
  *
  * On success returns 0 and fills in *psize and *max_psize (the provider's
  * media size), *logical_ashift (from the sector size, but no smaller than
  * SPA_MINBLOCKSIZE) and *physical_ashift (from the stripe size when that is
  * a power of two larger than the logical block size and the stripe offset
  * is 0; otherwise 0).  It also refreshes vdev_nonrot, vdev_has_trim and
  * vdev_has_securetrim, and clears vdev_nowritecache.
  *
  * Returns EINVAL for a missing or non-"/dev/" path or an unsupported sector
  * size, ENOENT when no provider could be attached, or the g_access() error
  * when the provider could not be opened for writing.
  *
  * NOTE(review): the probe TLS key set at the top is cleared only on the
  * full open path; the early EINVAL return and the reopen (skip_open) path
  * leave it set -- confirm that this is intended.
  */
 static int
 vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	int error, has_trim;
 	uint16_t rate;
 
 	/*
 	 * Set the TLS to indicate downstack that we
 	 * should not access zvols
 	 */
 	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, vd));
 
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (EINVAL);
 	}
 
 	/*
 	 * Reopen the device if it's not currently open. Otherwise,
 	 * just update the physical size of the device.
 	 */
 	if ((cp = vd->vdev_tsd) != NULL) {
 		ASSERT(vd->vdev_reopening);
 		goto skip_open;
 	}
 
 	DROP_GIANT();
 	g_topology_lock();
 	error = 0;
 
 	if (vd->vdev_spa->spa_is_splitting ||
 	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
 	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
 	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
 		/*
 		 * We are dealing with a vdev that hasn't been previously
 		 * opened (since boot), and we are not loading an
 		 * existing pool configuration.  This looks like a
 		 * vdev add operation to a new or existing pool.
 		 * Assume the user really wants to do this, and find
 		 * GEOM provider by its name, ignoring GUID mismatches.
 		 *
 		 * XXPOLICY: It would be safer to only allow a device
 		 *           that is unlabeled or labeled but missing
 		 *           GUID information to be opened in this fashion,
 		 *           unless we are doing a split, in which case we
 		 *           should allow any guid.
 		 */
 		cp = vdev_geom_open_by_path(vd, 0);
 	} else {
 		/*
 		 * Try using the recorded path for this device, but only
 		 * accept it if its label data contains the expected GUIDs.
 		 */
 		cp = vdev_geom_open_by_path(vd, 1);
 		if (cp == NULL) {
 			/*
 			 * The device at vd->vdev_path doesn't have the
 			 * expected GUIDs. The disks might have merely
 			 * moved around so try all other GEOM providers
 			 * to find one with the right GUIDs.
 			 */
 			cp = vdev_geom_open_by_guids(vd);
 		}
 	}
 
 	/* Clear the TLS now that tasting is done */
 	VERIFY0(tsd_set(zfs_geom_probe_vdev_key, NULL));
 
 	if (cp == NULL) {
 		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
 		error = ENOENT;
 	} else {
 		struct consumer_priv_t *priv;
 		struct consumer_vdev_elem *elem;
 		int spamode;
 
 		/* Record this vdev on the consumer's list of users. */
 		priv = (struct consumer_priv_t *)&cp->private;
 		if (cp->private == NULL)
 			SLIST_INIT(priv);
 		elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
 		elem->vd = vd;
 		SLIST_INSERT_HEAD(priv, elem, elems);
 
 		spamode = spa_mode(vd->vdev_spa);
 		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
 		    !ISP2(cp->provider->sectorsize)) {
 			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
 			    cp->provider->name);
 
 			vdev_geom_close_locked(vd);
 			error = EINVAL;
 			cp = NULL;
 		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
 			int i;
 
 			/*
 			 * The pool is writable and the consumer has no write
 			 * reference yet.  Try up to five times to get one,
 			 * sleeping hz / 2 ticks without the topology lock
 			 * between attempts.
 			 */
 			for (i = 0; i < 5; i++) {
 				error = g_access(cp, 0, 1, 0);
 				if (error == 0)
 					break;
 				g_topology_unlock();
 				tsleep(vd, 0, "vdev", hz / 2);
 				g_topology_lock();
 			}
 			if (error != 0) {
 				printf("ZFS WARNING: Unable to open %s for "
 				    "writing (error=%d).\n",
 				    cp->provider->name, error);
 				vdev_geom_close_locked(vd);
 				cp = NULL;
 			}
 		}
 	}
 
 	/* Fetch initial physical path information for this device. */
 	if (cp != NULL) {
 		vdev_geom_attrchanged(cp, "GEOM::physpath");
 
 		/* Set other GEOM characteristics */
 		vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
 	}
 
 	g_topology_unlock();
 	PICKUP_GIANT();
 	if (cp == NULL) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
 		    error);
 		return (error);
 	}
 skip_open:
 	pp = cp->provider;
 
 	/*
 	 * Determine the actual size of the device.
 	 */
 	*max_psize = *psize = pp->mediasize;
 
 	/*
 	 * Determine the device's minimum transfer size and preferred
 	 * transfer size.
 	 */
 	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
 	*physical_ashift = 0;
 	if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
 	    ISP2(pp->stripesize) && pp->stripeoffset == 0)
 		*physical_ashift = highbit(pp->stripesize) - 1;
 
 	/*
 	 * Clear the nowritecache settings, so that on a vdev_reopen()
 	 * we will try again.
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
 	/* Inform the ZIO pipeline that we are non-rotational. */
 	error = g_getattr("GEOM::rotation_rate", cp, &rate);
 	if (error == 0 && rate == DISK_RR_NON_ROTATING)
 		vd->vdev_nonrot = B_TRUE;
 	else
 		vd->vdev_nonrot = B_FALSE;
 
 	/* Set when device reports it supports TRIM. */
 	error = g_getattr("GEOM::candelete", cp, &has_trim);
 	vd->vdev_has_trim = (error == 0 && has_trim);
 
 	/* Set when device reports it supports secure TRIM. */
 	/* unavailable on FreeBSD */
 	vd->vdev_has_securetrim = B_FALSE;
 
 	return (0);
 }
 
 /*
  * vdev_op_close handler.  Closes the vdev's consumer unless the vdev is
  * merely being reopened; during a reopen the consumer is closed only if it
  * has been orphaned or its provider reports an error.  Takes the topology
  * lock if it is not already held.
  */
 static void
 vdev_geom_close(vdev_t *vd)
 {
 	struct g_consumer *cp = vd->vdev_tsd;
 	boolean_t had_lock, do_close;
 
 	DROP_GIANT();
 	had_lock = g_topology_locked();
 	if (!had_lock)
 		g_topology_lock();
 
 	do_close = !vd->vdev_reopening;
 	if (!do_close && cp != NULL) {
 		/* Reopening: only a dead consumer needs to be closed. */
 		do_close = (cp->flags & G_CF_ORPHAN) != 0 ||
 		    (cp->provider != NULL && cp->provider->error != 0);
 	}
 	if (do_close)
 		vdev_geom_close_locked(vd);
 
 	if (!had_lock)
 		g_topology_unlock();
 	PICKUP_GIANT();
 }
 
 /*
  * bio completion callback for bios issued by vdev_geom_io_start().  Copies
  * the bio's status into the zio (a short transfer without an error becomes
  * EIO), reacts to ENXIO, and hands the zio to zio_delay_interrupt().
  */
 static void
 vdev_geom_io_intr(struct bio *bp)
 {
 	zio_t *zio = bp->bio_caller1;
 	vdev_t *vd = zio->io_vd;
 
 	zio->io_error = bp->bio_error;
 	if (zio->io_error == 0 && bp->bio_resid != 0)
 		zio->io_error = SET_ERROR(EIO);
 
 	if (zio->io_error == ENXIO && !vd->vdev_remove_wanted) {
 		/*
 		 * If provider's error is set we assume it is being
 		 * removed.
 		 */
 		if (bp->bio_to->error != 0) {
 			vd->vdev_remove_wanted = B_TRUE;
 			spa_async_request(zio->io_spa,
 			    SPA_ASYNC_REMOVE);
 		} else if (!vd->vdev_delayed_close) {
 			vd->vdev_delayed_close = B_TRUE;
 		}
 	}
 
 	/*
 	 * We have to split bio freeing into two parts, because the ABD code
 	 * cannot be called in this context and vdev_op_io_done is not called
 	 * for ZIO_TYPE_FLUSH zio-s.
 	 */
 	if (!(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE)) {
 		g_destroy_bio(bp);
 		zio->io_bio = NULL;
 	}
 	zio_delay_interrupt(zio);
 }
 
 /* Running state for vdev_geom_check_unmapped_cb(). */
 struct vdev_geom_check_unmapped_cb_state {
 	int	pages;	/* pages counted over the segments seen so far */
 	uint_t	end;	/* in-page offset at which the last segment ended */
 };
 
 /*
  * ABD iteration callback that validates one segment's size/alignment and
  * adds its pages to the running count.
  *
  * GEOM requires the data buffer to look virtually contiguous: only the
  * first page may start, and only the last page may end, off a page
  * boundary; every other physical page must be full.  Returns 1 as soon as
  * a segment violates that, 0 otherwise.
  */
 static int
 vdev_geom_check_unmapped_cb(void *buf, size_t len, void *priv)
 {
 	struct vdev_geom_check_unmapped_cb_state *s = priv;
 	vm_offset_t off = (vm_offset_t)buf & PAGE_MASK;
 
 	/*
 	 * Reject a non-first segment that starts mid-page, and any segment
 	 * that follows one which ended mid-page.
 	 */
 	if ((s->pages != 0 && off != 0) || s->end != 0)
 		return (1);
 
 	s->end = (off + len) & PAGE_MASK;
 	s->pages += (off + len + PAGE_MASK) >> PAGE_SHIFT;
 	return (0);
 }
 
 /*
  * Decide whether this ZIO can be sent to the device as unmapped I/O, which
  * avoids copying a scattered and/or gang ABD buffer into a linear one.
  *
  * Returns the number of pages the buffer spans when unmapped I/O can be
  * used, or 0 when it cannot.
  */
 static int
 vdev_geom_check_unmapped(zio_t *zio, struct g_consumer *cp)
 {
 	struct vdev_geom_check_unmapped_cb_state s;
 
 	/*
 	 * Nothing to do if unmapped I/O is administratively disabled, if the
 	 * buffer is already linear, or if the GEOM provider does not accept
 	 * unmapped I/O (in which case the data has to be copied).
 	 */
 	if (!unmapped_buf_allowed || abd_is_linear(zio->io_abd) ||
 	    (cp->provider->flags & G_PF_ACCEPT_UNMAPPED) == 0)
 		return (0);
 
 	/* Check the buffer chunks sizes/alignments and count pages. */
 	s.pages = 0;
 	s.end = 0;
 	if (abd_iterate_func(zio->io_abd, 0, zio->io_size,
 	    vdev_geom_check_unmapped_cb, &s) != 0)
 		return (0);
 
 	return (s.pages);
 }
 
 /*
  * ABD iteration callback that appends the physical pages backing one
  * segment to the bio's page array ('priv' is the bio).  Always returns 0.
  */
 static int
 vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv)
 {
 	struct bio *bp = priv;
 	vm_offset_t addr = (vm_offset_t)buf;
 	vm_offset_t end = addr + len;
 
 	if (bp->bio_ma_n != 0) {
 		/* Later segments are expected to start on a page boundary. */
 		ASSERT0(P2PHASE(addr, PAGE_SIZE));
 	} else {
 		/* First segment: record its in-page offset, then align down. */
 		bp->bio_ma_offset = addr & PAGE_MASK;
 		addr &= ~PAGE_MASK;
 	}
 
 	/* At least one page is added, then one per page up to 'end'. */
 	do {
 		bp->bio_ma[bp->bio_ma_n] =
 		    PHYS_TO_VM_PAGE(pmap_kextract(addr));
 		bp->bio_ma_n++;
 		addr += PAGE_SIZE;
 	} while (addr < end);
 
 	return (0);
 }
 
 /*
  * vdev_op_io_start handler: translate a zio into a GEOM bio and issue it.
  *
  * Flush and TRIM zios may be completed here without touching the device,
  * depending on the tunables and the vdev state checked below.  A zio for a
  * vdev that has no consumer fails with ENXIO.  Otherwise the bio is sent
  * with vdev_geom_io_intr() as its completion callback and is remembered in
  * zio->io_bio.
  */
 static void
 vdev_geom_io_start(zio_t *zio)
 {
 	vdev_t *vd;
 	struct g_consumer *cp;
 	struct bio *bp;
 
 	vd = zio->io_vd;
 
 	if (zio->io_type == ZIO_TYPE_FLUSH) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
 		}
 
 		/* Flushes are disabled: continue the zio with no error set. */
 		if (zfs_nocacheflush || vdev_geom_bio_flush_disable) {
 			zio_execute(zio);
 			return;
 		}
 
 		if (vd->vdev_nowritecache) {
 			zio->io_error = SET_ERROR(ENOTSUP);
 			zio_execute(zio);
 			return;
 		}
 	} else if (zio->io_type == ZIO_TYPE_TRIM) {
 		/* BIO_DELETE is disabled: continue the zio with no error set. */
 		if (vdev_geom_bio_delete_disable) {
 			zio_execute(zio);
 			return;
 		}
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM ||
 	    zio->io_type == ZIO_TYPE_FLUSH);
 
 	cp = vd->vdev_tsd;
 	if (cp == NULL) {
 		zio->io_error = SET_ERROR(ENXIO);
 		zio_interrupt(zio);
 		return;
 	}
 	bp = g_alloc_bio();
 	/* Lets vdev_geom_io_intr() find the zio again. */
 	bp->bio_caller1 = zio;
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
 		zio->io_target_timestamp = zio_handle_io_delay(zio);
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		if (zio->io_type == ZIO_TYPE_READ)
 			bp->bio_cmd = BIO_READ;
 		else
 			bp->bio_cmd = BIO_WRITE;
 
 		/*
 		 * If possible, represent scattered and/or gang ABD buffer to
 		 * GEOM as an array of physical pages.  It allows to satisfy
 		 * requirement of virtually contiguous buffer without copying.
 		 */
 		int pgs = vdev_geom_check_unmapped(zio, cp);
 		if (pgs > 0) {
 			/* The page array is freed in vdev_geom_io_done(). */
 			bp->bio_ma = malloc(sizeof (struct vm_page *) * pgs,
 			    M_DEVBUF, M_WAITOK);
 			bp->bio_ma_n = 0;
 			bp->bio_ma_offset = 0;
 			abd_iterate_func(zio->io_abd, 0, zio->io_size,
 			    vdev_geom_fill_unmap_cb, bp);
 			bp->bio_data = unmapped_buf;
 			bp->bio_flags |= BIO_UNMAPPED;
 		} else {
 			/*
 			 * Mapped I/O: borrow a buffer from the ABD.  It is
 			 * handed back in vdev_geom_io_done().
 			 */
 			if (zio->io_type == ZIO_TYPE_READ) {
 				bp->bio_data = abd_borrow_buf(zio->io_abd,
 				    zio->io_size);
 			} else {
 				bp->bio_data = abd_borrow_buf_copy(zio->io_abd,
 				    zio->io_size);
 			}
 		}
 		break;
 	case ZIO_TYPE_TRIM:
 		bp->bio_cmd = BIO_DELETE;
 		bp->bio_data = NULL;
 		bp->bio_offset = zio->io_offset;
 		bp->bio_length = zio->io_size;
 		break;
 	case ZIO_TYPE_FLUSH:
 		bp->bio_cmd = BIO_FLUSH;
 		bp->bio_data = NULL;
 		bp->bio_offset = cp->provider->mediasize;
 		bp->bio_length = 0;
 		break;
 	default:
 		panic("invalid zio->io_type: %d\n", zio->io_type);
 	}
 	bp->bio_done = vdev_geom_io_intr;
 	zio->io_bio = bp;
 
 	g_io_request(bp, cp);
 }
 
 /*
  * vdev_op_io_done handler.  For read and write zios that still have a bio
  * attached, releases what vdev_geom_io_start() set up -- either the page
  * array used for unmapped I/O or the buffer borrowed from the ABD -- and
  * then destroys the bio.  Other zio types are expected to have no bio left.
  */
 static void
 vdev_geom_io_done(zio_t *zio)
 {
 	struct bio *bp = zio->io_bio;
 
 	if (!(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE)) {
 		ASSERT3P(bp, ==, NULL);
 		return;
 	}
 
 	/* A read/write zio without a bio is expected to have failed ENXIO. */
 	if (bp == NULL) {
 		ASSERT3S(zio->io_error, ==, ENXIO);
 		return;
 	}
 
 	if (bp->bio_ma != NULL)
 		free(bp->bio_ma, M_DEVBUF);
 	else if (zio->io_type == ZIO_TYPE_READ)
 		abd_return_buf_copy(zio->io_abd, bp->bio_data,
 		    zio->io_size);
 	else
 		abd_return_buf(zio->io_abd, bp->bio_data,
 		    zio->io_size);
 
 	g_destroy_bio(bp);
 	zio->io_bio = NULL;
 }
 
 /* vdev_op_hold handler; a no-op in this implementation. */
 static void
 vdev_geom_hold(vdev_t *vd)
 {
 }
 
 /* vdev_op_rele handler; a no-op in this implementation. */
 static void
 vdev_geom_rele(vdev_t *vd)
 {
 }
 
 /*
  * Operations vector for VDEV_TYPE_DISK leaf vdevs.  Open, close, I/O and
  * hold/rele are provided by the vdev_geom_* functions in this file; the
  * size and translation callbacks use the vdev_default_* implementations.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_geom_open,
 	.vdev_op_close = vdev_geom_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_geom_io_start,
 	.vdev_op_io_done = vdev_geom_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_geom_hold,
 	.vdev_op_rele = vdev_geom_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 29e54b39aa1a..face4611d66c 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1,1652 +1,1653 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <linux/blkpg.h>
 #include <linux/msdos_fs.h>
 #include <linux/vfs_compat.h>
 #include <linux/blk-cgroup.h>
 
 /*
  * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying
  * block_device. Since it carries the block_device inside, it's convenient to
  * just use the handle as a proxy.
  *
  * Linux 6.9.x uses a file for the same purpose.
  *
  * For pre-6.8, we just emulate this with a cast, since we don't need any of
  * the other fields inside the handle.
  */
 #if defined(HAVE_BDEV_OPEN_BY_PATH)
 typedef struct bdev_handle zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((bdh)->bdev)
 #define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
 typedef struct file zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		(file_bdev(bdh))
 #define	BDH_IS_ERR(bdh)		(IS_ERR(bdh))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(bdh))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #else
 typedef void zfs_bdev_handle_t;
 #define	BDH_BDEV(bdh)		((struct block_device *)bdh)
 #define	BDH_IS_ERR(bdh)		(IS_ERR(BDH_BDEV(bdh)))
 #define	BDH_PTR_ERR(bdh)	(PTR_ERR(BDH_BDEV(bdh)))
 #define	BDH_ERR_PTR(err)	(ERR_PTR(err))
 #endif
 
 /*
  * Per-vdev private state for a disk vdev.  vdev_disk_open() stores a pointer
  * to this in the vdev's vdev_tsd and vdev_disk_close() frees it.
  */
 typedef struct vdev_disk {
 	/* Handle for the open block device; NULL when no device is open. */
 	zfs_bdev_handle_t		*vd_bdh;
 	/* Held as writer by vdev_disk_open() while vd_bdh is replaced. */
 	krwlock_t			vd_lock;
 } vdev_disk_t;
 
 /*
  * Maximum number of segments to add to a bio (min 4). If this is higher than
  * the maximum allowed by the device queue or the kernel itself, it will be
  * clamped. Setting it to zero will cause the kernel's ideal size to be used.
  */
 uint_t zfs_vdev_disk_max_segs = 0;
 
 /*
  * Unique identifier for the exclusive vdev holder.
  */
 static void *zfs_vdev_holder = VDEV_HOLDER;
 
 /*
  * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
  * device is missing. The missing path may be transient since the links
  * can be briefly removed and recreated in response to udev events.
  */
 static uint_t zfs_vdev_open_timeout_ms = 1000;
 
 /*
  * Size of the "reserved" partition, in blocks.
  */
 #define	EFI_MIN_RESV_SIZE	(16 * 1024)
 
 /*
  * BIO request failfast mask.
  */
 
 static unsigned int zfs_vdev_failfast_mask = 1;
 
 /*
  * Convert SPA mode flags into bdev open mode flags.
  */
 #ifdef HAVE_BLK_MODE_T
 typedef blk_mode_t vdev_bdev_mode_t;
 #define	VDEV_BDEV_MODE_READ	BLK_OPEN_READ
 #define	VDEV_BDEV_MODE_WRITE	BLK_OPEN_WRITE
 #define	VDEV_BDEV_MODE_EXCL	BLK_OPEN_EXCL
 #define	VDEV_BDEV_MODE_MASK	(BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
 #else
 typedef fmode_t vdev_bdev_mode_t;
 #define	VDEV_BDEV_MODE_READ	FMODE_READ
 #define	VDEV_BDEV_MODE_WRITE	FMODE_WRITE
 #define	VDEV_BDEV_MODE_EXCL	FMODE_EXCL
 #define	VDEV_BDEV_MODE_MASK	(FMODE_READ|FMODE_WRITE|FMODE_EXCL)
 #endif
 
 /*
  * Translate SPA open-mode bits into the block layer's open-mode bits.  The
  * result always requests exclusive access; read and write are added to
  * mirror the corresponding SPA_MODE_* bits in smode.
  */
 static vdev_bdev_mode_t
 vdev_bdev_mode(spa_mode_t smode)
 {
 	ASSERT3U(smode, !=, SPA_MODE_UNINIT);
 	ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));
 
 	vdev_bdev_mode_t result = VDEV_BDEV_MODE_EXCL |
 	    ((smode & SPA_MODE_READ) ? VDEV_BDEV_MODE_READ : 0) |
 	    ((smode & SPA_MODE_WRITE) ? VDEV_BDEV_MODE_WRITE : 0);
 
 	/* Something must be set, and nothing outside the known mask. */
 	ASSERT(result & VDEV_BDEV_MODE_MASK);
 	ASSERT0(result & ~VDEV_BDEV_MODE_MASK);
 
 	return (result);
 }
 
 /*
  * Returns the usable capacity (in bytes) for the partition or disk.
  */
 static uint64_t
 bdev_capacity(struct block_device *bdev)
 {
 	uint64_t nbytes;
 
 	/* Use bdev_nr_bytes() where available, else the bdev inode size. */
 #ifdef HAVE_BDEV_NR_BYTES
 	nbytes = bdev_nr_bytes(bdev);
 #else
 	nbytes = i_size_read(bdev->bd_inode);
 #endif
 
 	return (nbytes);
 }
 
 #if !defined(HAVE_BDEV_WHOLE)
 /*
  * Compatibility fallback used when HAVE_BDEV_WHOLE is not defined: returns
  * the bd_contains pointer of bdev.  Callers in this file compare the result
  * against bdev itself to tell whether bdev is a partition.
  */
 static inline struct block_device *
 bdev_whole(struct block_device *bdev)
 {
 	return (bdev->bd_contains);
 }
 #endif
 
 #if defined(HAVE_BDEVNAME)
 #define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
 #else
 /*
  * Fallback used when HAVE_BDEVNAME is not defined: formats bdev with the
  * "%pg" specifier into name, writing at most BDEVNAME_SIZE bytes.  The
  * caller must supply a buffer of at least that size.
  */
 static inline void
 vdev_bdevname(struct block_device *bdev, char *name)
 {
 	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
 }
 #endif
 
 /*
  * Returns the maximum expansion capacity of the block device (in bytes).
  *
  * It is possible to expand a vdev when it has been created as a wholedisk
  * and the containing block device has increased in capacity.  Or when the
  * partition containing the pool has been manually increased in size.
  *
  * This function is only responsible for calculating the potential expansion
  * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
  * responsible for verifying the expected partition layout in the wholedisk
  * case, and updating the partition table if appropriate.  Once the partition
  * size has been increased the additional capacity will be visible using
  * bdev_capacity().
  *
  * The returned maximum expansion capacity is always expected to be larger, or
  * at the very least equal, to its usable capacity to prevent overestimating
  * the pool expandsize.
  */
 static uint64_t
 bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
 {
 	const uint64_t psize = bdev_capacity(bdev);
 
 	/*
 	 * Only a wholedisk vdev that sits on a partition can grow beyond its
 	 * current size; everything else reports the usable capacity.
 	 */
 	if (!wholedisk || bdev == bdev_whole(bdev))
 		return (psize);
 
 	/*
 	 * When reporting maximum expansion capacity for a wholedisk
 	 * deduct any capacity which is expected to be lost due to
 	 * alignment restrictions.  Over reporting this value isn't
 	 * harmful and would only result in slightly less capacity
 	 * than expected post expansion.
 	 * The estimated available space may be slightly smaller than
 	 * bdev_capacity() for devices where the number of sectors is
 	 * not a multiple of the alignment size and the partition layout
 	 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
 	 * "reserved" EFI partition: in such cases return the device
 	 * usable capacity.
 	 */
 	const uint64_t whole = bdev_capacity(bdev_whole(bdev));
 	const uint64_t reserved = (uint64_t)(EFI_MIN_RESV_SIZE +
 	    NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS;
 
 	/*
 	 * Saturate at zero.  A signed difference fed to MAX() alongside an
 	 * unsigned operand would be converted to a huge unsigned value when
 	 * the whole device is smaller than the reserved area, and that bogus
 	 * value would win the comparison.
 	 */
 	const uint64_t available = (whole > reserved) ? whole - reserved : 0;
 
 	return (MAX(available, psize));
 }
 
 /*
  * Log a one-line warning describing a failed zio: pool, vdev path, error,
  * I/O type, offset, size and flags.
  */
 static void
 vdev_disk_error(zio_t *zio)
 {
 	const char *pool = spa_name(zio->io_spa);
 	const char *path = zio->io_vd->vdev_path;
 
 	/*
 	 * printk() is used because this may run in interrupt context (for
 	 * example while servicing IRQs from a misbehaving disk), and printk()
 	 * is safe to call from any context.
 	 */
 	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
 	    "offset=%llu size=%llu flags=%llu\n", pool, path,
 	    zio->io_error, zio->io_type,
 	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
 	    zio->io_flags);
 }
 
 /*
  * Signal a kobject event for the block device backing vdev v.  If the vdev
  * has no private state or no open handle, only a debug message is logged.
  */
 static void
 vdev_disk_kobj_evt_post(vdev_t *v)
 {
 	vdev_disk_t *disk = v->vdev_tsd;
 
 	if (disk == NULL || disk->vd_bdh == NULL) {
 		vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n",
 		    v->vdev_path);
 		return;
 	}
 
 	spl_signal_kobj_evt(BDH_BDEV(disk->vd_bdh));
 }
 
 /*
  * Open the block device at path on behalf of holder, using whichever
  * open-by-path interface the build detected.  smode is converted with
  * vdev_bdev_mode(), so the open is always exclusive.
  *
  * Returns the handle from the underlying call; callers in this file test it
  * with BDH_IS_ERR() / BDH_PTR_ERR() rather than comparing against NULL.
  */
 static zfs_bdev_handle_t *
 vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
 {
 	vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
 
 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
 	return (bdev_file_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BDEV_OPEN_BY_PATH)
 	return (bdev_open_by_path(path, bmode, holder, NULL));
 #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
 	return (blkdev_get_by_path(path, bmode, holder, NULL));
 #else
 	return (blkdev_get_by_path(path, bmode, holder));
 #endif
 }
 
 /*
  * Release a handle obtained from vdev_blkdev_get_by_path(), using whichever
  * release interface the build detected.  Depending on that interface, holder
  * and/or smode may be unused.
  */
 static void
 vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
 {
 	/*
 	 * Plain calls rather than "return (expr);": a return statement with
 	 * an expression is not permitted in a function returning void.
 	 */
 #if defined(HAVE_BDEV_RELEASE)
 	bdev_release(bdh);
 #elif defined(HAVE_BLKDEV_PUT_HOLDER)
 	blkdev_put(BDH_BDEV(bdh), holder);
 #elif defined(HAVE_BLKDEV_PUT)
 	blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode));
 #else
 	fput(bdh);
 #endif
 }
 
 /*
  * Open (or reopen) the block device at v->vdev_path and report its geometry.
  *
  * On success returns 0 and fills in *psize (current capacity in bytes),
  * *max_psize (capacity including possible expansion space) and the logical
  * and physical ashift derived from the device block sizes, each treated as
  * at least SPA_MINBLOCKSIZE.  The vdev's write-cache, TRIM, secure-TRIM and
  * non-rotational flags are refreshed from the device as well.
  *
  * On failure returns a positive errno: EINVAL when the path is missing or
  * not absolute, otherwise the negated error from the last open attempt.  In
  * the latter case the vdev_disk_t stays attached to v->vdev_tsd with
  * vd_bdh == NULL.
  */
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	zfs_bdev_handle_t *bdh;
 	spa_mode_t smode = spa_mode(v->vdev_spa);
 	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
 	vdev_disk_t *vd;
 
 	/* Must have a pathname and it must be absolute. */
 	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
 		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		vdev_dbgmsg(v, "invalid vdev_path");
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it is currently open.  When expanding a
 	 * partition force re-scanning the partition table if userland
 	 * did not take care of this already. We need to do this while closed
 	 * in order to get an accurate updated block device size.  Then
 	 * since udev may need to recreate the device links increase the
 	 * open retry timeout before reporting the device as unavailable.
 	 */
 	vd = v->vdev_tsd;
 	if (vd) {
 		/* "/dev/" prefix; the whole-disk name is written after it. */
 		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
 		boolean_t reread_part = B_FALSE;
 
 		/* vd_lock stays held as writer until the open attempt ends. */
 		rw_enter(&vd->vd_lock, RW_WRITER);
 		bdh = vd->vd_bdh;
 		vd->vd_bdh = NULL;
 
 		if (bdh) {
 			struct block_device *bdev = BDH_BDEV(bdh);
 			if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
 				vdev_bdevname(bdev_whole(bdev), disk_name + 5);
 				/*
 				 * If userland has BLKPG_RESIZE_PARTITION,
 				 * then it should have updated the partition
 				 * table already. We can detect this by
 				 * comparing our current physical size
 				 * with that of the device. If they are
 				 * the same, then we must not have
 				 * BLKPG_RESIZE_PARTITION or it failed to
 				 * update the partition table online. We
 				 * fallback to rescanning the partition
 				 * table from the kernel below. However,
 				 * if the capacity already reflects the
 				 * updated partition, then we skip
 				 * rescanning the partition table here.
 				 */
 				if (v->vdev_psize == bdev_capacity(bdev))
 					reread_part = B_TRUE;
 			}
 
 			vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 		}
 
 		if (reread_part) {
 			/* Briefly open the whole disk to rescan partitions. */
 			bdh = vdev_blkdev_get_by_path(disk_name, smode,
 			    zfs_vdev_holder);
 			if (!BDH_IS_ERR(bdh)) {
 				int error =
 				    vdev_bdev_reread_part(BDH_BDEV(bdh));
 				vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
 				if (error == 0) {
 					timeout = MSEC2NSEC(
 					    zfs_vdev_open_timeout_ms * 2);
 				}
 			}
 		}
 	} else {
 		/* First open: allocate the private state and take its lock. */
 		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
 
 		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
 		rw_enter(&vd->vd_lock, RW_WRITER);
 	}
 
 	/*
 	 * Devices are always opened by the path provided at configuration
 	 * time.  This means that if the provided path is a udev by-id path
 	 * then drives may be re-cabled without an issue.  If the provided
 	 * path is a udev by-path path, then the physical location information
 	 * will be preserved.  This can be critical for more complicated
 	 * configurations where drives are located in specific physical
 	 * locations to maximize the systems tolerance to component failure.
 	 *
 	 * Alternatively, you can provide your own udev rule to flexibly map
 	 * the drives as you see fit.  It is not advised that you use the
 	 * /dev/[hd]d devices which may be reordered due to probing order.
 	 * Devices in the wrong locations will be detected by the higher
 	 * level vdev validation.
 	 *
 	 * The specified paths may be briefly removed and recreated in
 	 * response to udev events.  This should be exceptionally unlikely
 	 * because the zpool command makes every effort to verify these paths
 	 * have already settled prior to reaching this point.  Therefore,
 	 * a ENOENT failure at this point is highly likely to be transient
 	 * and it is reasonable to sleep and retry before giving up.  In
 	 * practice delays have been observed to be on the order of 100ms.
 	 *
 	 * When ERESTARTSYS is returned it indicates the block device is
 	 * a zvol which could not be opened due to the deadlock detection
 	 * logic in zvol_open().  Extend the timeout and retry the open;
 	 * subsequent attempts are expected to eventually succeed.
 	 */
 	hrtime_t start = gethrtime();
 	bdh = BDH_ERR_PTR(-ENXIO);
 	while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
 		bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
 		    zfs_vdev_holder);
 		if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
 			/*
 			 * There is no point of waiting since device is removed
 			 * explicitly
 			 */
 			if (v->vdev_removed)
 				break;
 
 			schedule_timeout_interruptible(MSEC_TO_TICK(10));
 		} else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) {
 			timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10);
 			continue;
 		} else if (BDH_IS_ERR(bdh)) {
 			/* Any other error is not retried. */
 			break;
 		}
 	}
 
 	if (BDH_IS_ERR(bdh)) {
 		/* BDH_PTR_ERR() is negative; negate it to a positive errno. */
 		int error = -BDH_PTR_ERR(bdh);
 		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
 		    (u_longlong_t)(gethrtime() - start),
 		    (u_longlong_t)timeout);
 		vd->vd_bdh = NULL;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
 		return (SET_ERROR(error));
 	} else {
 		vd->vd_bdh = bdh;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
 	}
 
 	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 
 	/*  Determine the physical block size */
 	int physical_block_size = bdev_physical_block_size(bdev);
 
 	/*  Determine the logical block size */
 	int logical_block_size = bdev_logical_block_size(bdev);
 
 	/*
 	 * If the device has a write cache, clear the nowritecache flag,
 	 * so that we start issuing flush requests again.
 	 */
 	v->vdev_nowritecache = !zfs_bdev_has_write_cache(bdev);
 
 	/* Set when device reports it supports TRIM. */
 	v->vdev_has_trim = bdev_discard_supported(bdev);
 
 	/* Set when device reports it supports secure TRIM. */
 	v->vdev_has_securetrim = bdev_secure_discard_supported(bdev);
 
 	/* Inform the ZIO pipeline that we are non-rotational */
 	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
 
 	/* Physical volume size in bytes for the partition */
 	*psize = bdev_capacity(bdev);
 
 	/* Physical volume size in bytes including possible expansion space */
 	*max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk);
 
 	/* Based on the minimum sector size set the block size */
 	*physical_ashift = highbit64(MAX(physical_block_size,
 	    SPA_MINBLOCKSIZE)) - 1;
 
 	*logical_ashift = highbit64(MAX(logical_block_size,
 	    SPA_MINBLOCKSIZE)) - 1;
 
 	return (0);
 }
 
 /*
  * Release the block device and free the per-vdev state.  Does nothing while
  * the vdev is being reopened, or when there is no state to release.
  */
 static void
 vdev_disk_close(vdev_t *v)
 {
 	vdev_disk_t *disk = v->vdev_tsd;
 
 	if (disk == NULL || v->vdev_reopening)
 		return;
 
 	zfs_bdev_handle_t *handle = disk->vd_bdh;
 	if (handle != NULL) {
 		vdev_blkdev_put(handle, spa_mode(v->vdev_spa),
 		    zfs_vdev_holder);
 	}
 
 	rw_destroy(&disk->vd_lock);
 	kmem_free(disk, sizeof (vdev_disk_t));
 	v->vdev_tsd = NULL;
 }
 
 /*
  * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
  * replace it with preempt_schedule under the following condition:
  */
 #if defined(CONFIG_ARM64) && \
     defined(CONFIG_PREEMPTION) && \
     defined(CONFIG_BLK_CGROUP)
 #define	preempt_schedule_notrace(x) preempt_schedule(x)
 #endif
 
 /*
  * As of the Linux 5.18 kernel bio_alloc() expects a block_device struct
  * as an argument removing the need to set it with bio_set_dev().  This
  * removes the need for all of the following compatibility code.
  */
 #if !defined(HAVE_BIO_ALLOC_4ARG)
 
 #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
 /*
  * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
  * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
  * As a side effect the function was converted to GPL-only.  Define our
  * own version when needed which uses rcu_read_lock_sched().
  *
  * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
  * part, moving blkg_tryget into the private one. Define our own version.
  */
 #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
 /*
  * Try to take a reference on blkg's refcnt.  Returns true if a reference
  * was taken, false otherwise.
  */
 static inline bool
 vdev_blkg_tryget(struct blkcg_gq *blkg)
 {
 	struct percpu_ref *ref = &blkg->refcnt;
 	unsigned long __percpu *count;
 	bool rc;
 
 	rcu_read_lock_sched();
 
 	if (__ref_is_percpu(ref, &count)) {
 		/* Per-cpu mode: bump this CPU's counter; always succeeds. */
 		this_cpu_inc(*count);
 		rc = true;
 	} else {
 		/* Otherwise increment the shared count unless it is zero. */
 #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
 		rc = atomic_long_inc_not_zero(&ref->data->count);
 #else
 		rc = atomic_long_inc_not_zero(&ref->count);
 #endif
 	}
 
 	rcu_read_unlock_sched();
 
 	return (rc);
 }
 #else
 #define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
 #endif
 #ifdef HAVE_BIO_SET_DEV_MACRO
 /*
  * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
  * GPL-only bio_associate_blkg() symbol thus inadvertently converting
  * the entire macro.  Provide a minimal version which always assigns the
  * request queue's root_blkg to the bio.
  */
 /*
  * Associate bio with the root_blkg of its request queue, if the queue has
  * one and a reference on it can be taken.  The bio must not already have a
  * blkg assigned.
  */
 static inline void
 vdev_bio_associate_blkg(struct bio *bio)
 {
 	/* Locate the request queue of the disk the bio targets. */
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 #else
 	struct request_queue *q = bio->bi_disk->queue;
 #endif
 
 	ASSERT3P(q, !=, NULL);
 	ASSERT3P(bio->bi_blkg, ==, NULL);
 
 	/* Assign only when the reference was actually obtained. */
 	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
 		bio->bi_blkg = q->root_blkg;
 }
 
 #define	bio_associate_blkg vdev_bio_associate_blkg
 #else
 /*
  * Replacement for bio_set_dev() (see the #define that follows): points bio
  * at bdev, clears BIO_REMAPPED (and BIO_THROTTLED when the device changes),
  * and assigns the request queue's root_blkg when a reference on it can be
  * taken.
  *
  * NOTE(review): without HAVE_BIO_BDEV_DISK the queue is read from
  * bio->bi_disk rather than from bdev -- confirm that is intended.
  */
 static inline void
 vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
 {
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *q = bdev->bd_disk->queue;
 #else
 	struct request_queue *q = bio->bi_disk->queue;
 #endif
 	bio_clear_flag(bio, BIO_REMAPPED);
 	if (bio->bi_bdev != bdev)
 		bio_clear_flag(bio, BIO_THROTTLED);
 	bio->bi_bdev = bdev;
 
 	ASSERT3P(q, !=, NULL);
 	ASSERT3P(bio->bi_blkg, ==, NULL);
 
 	/* Assign only when the reference was actually obtained. */
 	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
 		bio->bi_blkg = q->root_blkg;
 }
 #define	bio_set_dev		vdev_bio_set_dev
 #endif
 #endif
 #endif /* !HAVE_BIO_ALLOC_4ARG */
 
 /*
  * Submit bio with the current task's bio_list temporarily set to NULL; the
  * saved list is restored once submit_bio() returns.
  *
  * NOTE(review): presumably this makes submit_bio() dispatch the bio
  * directly instead of deferring it onto the task's pending list -- confirm
  * against the kernel's submit_bio() implementation.
  */
 static inline void
 vdev_submit_bio(struct bio *bio)
 {
 	struct bio_list *bio_list = current->bio_list;
 	current->bio_list = NULL;
 	(void) submit_bio(bio);
 	current->bio_list = bio_list;
 }
 
 /*
  * Allocate a bio with room for nr_vecs segments, targeted at bdev.  With the
  * four-argument bio_alloc() the device is passed at allocation time;
  * otherwise it is set afterwards with bio_set_dev().
  *
  * Returns whatever bio_alloc() returned; the older path explicitly allows
  * for NULL.
  */
 static inline struct bio *
 vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
     unsigned short nr_vecs)
 {
 	struct bio *bio;
 
 #ifdef HAVE_BIO_ALLOC_4ARG
 	bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask);
 #else
 	bio = bio_alloc(gfp_mask, nr_vecs);
 	if (likely(bio != NULL))
 		bio_set_dev(bio, bdev);
 #endif
 
 	return (bio);
 }
 
 /*
  * Number of segments to allocate per bio for bdev: the device queue's
  * limit, lowered by the zfs_vdev_disk_max_segs tunable when that is set,
  * then capped by the kernel's own per-bio maximum.
  */
 static inline uint_t
 vdev_bio_max_segs(struct block_device *bdev)
 {
 	uint_t segs = queue_max_segments(bdev_get_queue(bdev));
 
 	/*
 	 * A non-zero tunable can only lower the device limit, and is itself
 	 * treated as at least 4 so there's room to finish split pages if
 	 * they come up.
 	 */
 	if (zfs_vdev_disk_max_segs > 0) {
 		const uint_t tuned = MAX(4, zfs_vdev_disk_max_segs);
 		segs = MIN(tuned, segs);
 	}
 
 #ifdef HAVE_BIO_MAX_SEGS
 	return (bio_max_segs(segs));
 #else
 	return (MIN(segs, BIO_MAX_PAGES));
 #endif
 }
 
 /*
  * Largest bio payload for bdev, in bytes: the queue's max-sectors limit
  * multiplied by 512 (<< 9).
  */
 static inline uint_t
 vdev_bio_max_bytes(struct block_device *bdev)
 {
 	return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
 }
 
 
 /*
  * Virtual block IO object (VBIO)
  *
  * Linux block IO (BIO) objects have a limit on how many data segments (pages)
  * they can hold. Depending on how they're allocated and structured, a large
  * ZIO can require more than one BIO to be submitted to the kernel, which then
  * all have to complete before we can return the completed ZIO back to ZFS.
  *
  * A VBIO is a wrapper around multiple BIOs, carrying everything needed to
  * translate a ZIO down into the kernel block layer and back again.
  *
  * Note that these are only used for data ZIOs (read/write). Meta-operations
  * (flush/trim) don't need multiple BIOs and so can just make the call
  * directly.
  */
 typedef struct {
 	zio_t		*vbio_zio;	/* parent zio */
 
 	struct block_device *vbio_bdev;	/* blockdev to submit bios to */
 
 	abd_t		*vbio_abd;	/* abd carrying borrowed linear buf */
 
 	uint_t		vbio_max_segs;	/* max segs per bio */
 
 	uint_t		vbio_max_bytes;	/* max bytes per bio */
 	uint_t		vbio_lbs_mask;	/* logical block size mask */
 
 	uint64_t	vbio_offset;	/* start offset of next bio */
 
 	struct bio	*vbio_bio;	/* pointer to the current bio */
 	int		vbio_flags;	/* bio flags */
 } vbio_t;
 
 /*
  * Allocate a vbio for zio targeting bdev.  The per-bio limits are sampled
  * from the device here; no bio is created and no ABD is attached yet.
  */
 static vbio_t *
 vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
 {
 	vbio_t *vb = kmem_zalloc(sizeof (*vb), KM_SLEEP);
 
 	/* What this vbio is for and where it goes. */
 	vb->vbio_zio = zio;
 	vb->vbio_bdev = bdev;
 	vb->vbio_flags = flags;
 	vb->vbio_offset = zio->io_offset;
 
 	/* Device limits applied to each bio. */
 	vb->vbio_max_segs = vdev_bio_max_segs(bdev);
 	vb->vbio_max_bytes = vdev_bio_max_bytes(bdev);
 	vb->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
 
 	/* Nothing attached yet. */
 	vb->vbio_abd = NULL;
 	vb->vbio_bio = NULL;
 
 	return (vb);
 }
 
 static void vbio_completion(struct bio *bio);
 
 /*
  * Append size bytes of page, starting at offset within it, to the vbio.
  * Data goes into the current bio until that bio is full; a new bio is then
  * allocated and the full one is chained to it and submitted.  The most
  * recent bio is left unsubmitted in vbio->vbio_bio.
  *
  * Always returns 0.
  */
 static int
 vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
 {
 	struct bio *bio = vbio->vbio_bio;
 	uint_t ssize;
 
 	while (size > 0) {
 		if (bio == NULL) {
 			/* New BIO, allocate and set up */
 			bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
 			    vbio->vbio_max_segs);
 			VERIFY(bio);
 
 			/* vbio_offset is in bytes; >> 9 divides by 512. */
 			BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
 			bio_set_op_attrs(bio,
 			    vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
 			    WRITE : READ, vbio->vbio_flags);
 
 			/*
 			 * Chain the previous (full) bio to the new one and
 			 * submit it; the new bio becomes the current one.
 			 */
 			if (vbio->vbio_bio) {
 				bio_chain(vbio->vbio_bio, bio);
 				vdev_submit_bio(vbio->vbio_bio);
 			}
 			vbio->vbio_bio = bio;
 		}
 
 		/*
 		 * Only load as much of the current page data as will fit in
 		 * the space left in the BIO, respecting lbs alignment. Older
 		 * kernels will error if we try to overfill the BIO, while
 		 * newer ones will accept it and split the BIO. This ensures
 		 * everything works on older kernels, and avoids an additional
 		 * overhead on the new.
 		 */
 		ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
 		    vbio->vbio_lbs_mask);
 		if (ssize > 0 &&
 		    bio_add_page(bio, page, ssize, offset) == ssize) {
 			/* Accepted, adjust and load any remaining. */
 			size -= ssize;
 			offset += ssize;
 			continue;
 		}
 
 		/* No room, set up for a new BIO and loop */
 		vbio->vbio_offset += BIO_BI_SIZE(bio);
 
 		/* Signal new BIO allocation wanted */
 		bio = NULL;
 	}
 
 	return (0);
 }
 
 /*
  * Page-iterator callback: priv is the vbio being filled; forwards the page
  * span to vbio_add_page() and returns its result.
  */
 static int
 vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
 {
 	return (vbio_add_page((vbio_t *)priv, page, len, off));
 }
 
 /*
  * Create some BIOs, fill them with data and submit them.
  *
  * Walks the first size bytes of abd page by page, feeding each span to the
  * vbio via vbio_fill_cb(), then installs vbio_completion() on the last bio
  * and submits it.  Ownership of vbio passes to that bio on submission.
  */
 static void
 vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
 {
 	/*
 	 * We plug so we can submit the BIOs as we go and only unplug them when
 	 * they are fully created and submitted. This is important; if we don't
 	 * plug, then the kernel may start executing earlier BIOs while we're
 	 * still creating and executing later ones, and if the device goes
 	 * away while that's happening, older kernels can get confused and
 	 * trample memory.
 	 */
 	struct blk_plug plug;
 	blk_start_plug(&plug);
 
 	/* Result ignored: vbio_add_page() always returns 0. */
 	(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
 	ASSERT(vbio->vbio_bio);
 
 	/* Only the last bio carries the completion callback and the vbio. */
 	vbio->vbio_bio->bi_end_io = vbio_completion;
 	vbio->vbio_bio->bi_private = vbio;
 
 	/*
 	 * Once submitted, vbio_bio now owns vbio (through bi_private) and we
 	 * can't touch it again. The bio may complete and vbio_completion() be
 	 * called and free the vbio before this task is run again, so we must
 	 * consider it invalid from this point.
 	 */
 	vdev_submit_bio(vbio->vbio_bio);
 
 	blk_finish_plug(&plug);
 }
 
 /*
  * IO completion callback, installed by vbio_submit() as bi_end_io on the
  * last bio of a vbio.
  *
  * Records the bio status on the zio as an errno, logs it if non-zero,
  * drops the bio, parks the vbio on zio->io_bio and hands the zio to
  * zio_delay_interrupt().
  */
 static void
 vbio_completion(struct bio *bio)
 {
 	vbio_t *vbio = bio->bi_private;
 	zio_t *zio = vbio->vbio_zio;
 
 	ASSERT(zio);
 
 	/* Capture and log any errors */
 	zio->io_error = bi_status_to_errno(bio->bi_status);
 	/*
 	 * Signed comparison: an unsigned ">= 0" check can never fail, so it
 	 * would not catch a negative errno leaking through here.
 	 */
 	ASSERT3S(zio->io_error, >=, 0);
 
 	if (zio->io_error)
 		vdev_disk_error(zio);
 
 	/* Return the BIO to the kernel */
 	bio_put(bio);
 
 	/*
 	 * We're likely in an interrupt context so we can't do ABD/memory work
 	 * here; instead we stash vbio on the zio and take care of it in the
 	 * done callback.
 	 */
 	ASSERT3P(zio->io_bio, ==, NULL);
 	zio->io_bio = vbio;
 
 	zio_delay_interrupt(zio);
 }
 
 /*
  * Iterator callback to count ABD pages and check their size & alignment.
  *
  * On Linux, each BIO segment can take a page pointer, and an offset+length of
  * the data within that page. A page can be arbitrarily large ("compound"
  * pages) but we still have to ensure the data portion is correctly sized and
  * aligned to the logical block size, to ensure that if the kernel wants to
  * split the BIO, the two halves will still be properly aligned.
  *
  * NOTE: if you change this function, change the copy in
  * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
  * data there to validate the change you're making.
  */
 typedef struct {
 	size_t	blocksize;
 	int	seen_first;
 	int	seen_last;
 } vdev_disk_check_alignment_t;
 
 /*
  * Called once per page span of the ABD, in order.  priv is the
  * vdev_disk_check_alignment_t carrying the block size and the state kept
  * between calls.  Returns 0 if the span is acceptable, 1 to reject the ABD.
  */
 static int
 vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
     void *priv)
 {
 	(void) page;
 	vdev_disk_check_alignment_t *s = priv;
 
 	/*
 	 * The cardinal rule: a single on-disk block must never cross a
 	 * physical (order-0) page boundary, as the kernel expects to be able
 	 * to split at both LBS and page boundaries.
 	 *
 	 * This implies various alignment rules for the blocks in this
 	 * (possibly compound) page, which we can check for.
 	 */
 
 	/*
 	 * If the previous page did not end on a page boundary, then we
 	 * can't proceed without creating a hole.
 	 */
 	if (s->seen_last)
 		return (1);
 
 	/* This page must contain only whole LBS-sized blocks. */
 	if (!IS_P2ALIGNED(len, s->blocksize))
 		return (1);
 
 	/*
 	 * If this is not the first page in the ABD, then the data must start
 	 * on a page-aligned boundary (so the kernel can split on page
 	 * boundaries without having to deal with a hole). If it is, then
 	 * it can start on LBS-alignment.
 	 */
 	if (s->seen_first) {
 		if (!IS_P2ALIGNED(off, PAGESIZE))
 			return (1);
 	} else {
 		if (!IS_P2ALIGNED(off, s->blocksize))
 			return (1);
 		s->seen_first = 1;
 	}
 
 	/*
 	 * If this data does not end on a page-aligned boundary, then this
 	 * must be the last page in the ABD, for the same reason.
 	 */
 	s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);
 
 	return (0);
 }
 
 /*
  * Check if we can submit the pages in this ABD to the kernel as-is. Returns
  * B_TRUE if the first size bytes of abd satisfy the alignment rules for
  * bdev's logical block size, or B_FALSE if it can't be submitted like this.
  */
 static boolean_t
 vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
 {
 	/* seen_first and seen_last start out zero. */
 	vdev_disk_check_alignment_t s = {
 	    .blocksize = bdev_logical_block_size(bdev),
 	};
 
 	/* A non-zero result means the callback rejected some page span. */
 	if (abd_iterate_page_func(abd, 0, size,
 	    vdev_disk_check_alignment_cb, &s))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Issue a read or write zio to the block device through a vbio.
  *
  * Returns EIO (without submitting anything) if the request extends past the
  * end of the device, otherwise 0 once the BIOs have been submitted.
  * Completion is reported asynchronously through vbio_completion().
  */
 static int
 vdev_disk_io_rw(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 	int flags = 0;
 
 	/*
 	 * Accessing outside the block device is never allowed.
 	 */
 	if (zio->io_offset + zio->io_size > bdev_capacity(bdev)) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
 		    (u_longlong_t)zio->io_offset,
 		    (u_longlong_t)zio->io_size,
 		    (u_longlong_t)bdev_capacity(bdev));
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * Failfast flags are only requested for first attempts (neither
 	 * IO_RETRY nor TRYHARD set) on vdevs with failfast enabled; bits 0-2
 	 * of zfs_vdev_failfast_mask select which ones.
 	 */
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 	    v->vdev_failfast == B_TRUE) {
 		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
 		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
 	}
 
 	/*
 	 * Check alignment of the incoming ABD. If any part of it would require
 	 * submitting a page that is not aligned to both the logical block size
 	 * and the page size, then we take a copy into a new memory region with
 	 * correct alignment.  This should be impossible on a 512b LBS. On
 	 * larger blocks, this can happen at least when a small number of
 	 * blocks (usually 1) are allocated from a shared slab, or when
 	 * abnormally-small data regions (eg gang headers) are mixed into the
 	 * same ABD as larger allocations (eg aggregations).
 	 */
 	abd_t *abd = zio->io_abd;
 	if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
 		/* Allocate a new memory region with guaranteed alignment */
 		abd = abd_alloc_for_io(zio->io_size,
 		    zio->io_abd->abd_flags & ABD_FLAG_META);
 
 		/* If we're writing copy our data into it */
 		if (zio->io_type == ZIO_TYPE_WRITE)
 			abd_copy(abd, zio->io_abd, zio->io_size);
 
 		/*
 		 * False here would mean the new allocation has an invalid
 		 * alignment too, which would mean that abd_alloc() is not
 		 * guaranteeing this, or our logic in
 		 * vdev_disk_check_alignment() is wrong. In either case,
 		 * something is seriously wrong and it's not safe to continue.
 		 */
 		VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
 	}
 
 	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
 	vbio_t *vbio = vbio_alloc(zio, bdev, flags);
 	if (abd != zio->io_abd)
 		vbio->vbio_abd = abd;
 
 	/* Fill it with data pages and submit it to the kernel */
 	vbio_submit(vbio, abd, zio->io_size);
 	return (0);
 }
 
 /* ========== */
 
 /*
  * This is the classic, battle-tested BIO submission code. Until we're totally
  * sure that the new code is safe and correct in all cases, this will remain
  * available and can be enabled by setting zfs_vdev_disk_classic=1 at module
  * load time.
  *
  * These functions have been renamed to vdev_classic_* to make it clear what
  * they belong to, but their implementations are unchanged.
  */
 
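 /*
  * Per-ZIO request state for the classic submission path: the parent ZIO,
  * a reference count, the first error seen, and the BIOs issued for it.
  */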
 typedef struct dio_request {
 	zio_t			*dr_zio;	/* Parent ZIO */
 	atomic_t		dr_ref;		/* References */
 	int			dr_error;	/* Bio error */
 	int			dr_bio_count;	/* Count of bio's */
 	struct bio		*dr_bio[];	/* Attached bio's */
 } dio_request_t;
 
 /*
  * Allocate a dio_request with room for bio_count bio pointers.  The
  * reference count and error start at zero and every bio slot is NULL.
  */
 static dio_request_t *
 vdev_classic_dio_alloc(int bio_count)
 {
 	dio_request_t *req;
 
 	req = kmem_zalloc(sizeof (dio_request_t) +
 	    sizeof (struct bio *) * bio_count, KM_SLEEP);
 
 	req->dr_error = 0;
 	req->dr_bio_count = bio_count;
 	atomic_set(&req->dr_ref, 0);
 
 	for (int slot = 0; slot < req->dr_bio_count; slot++) {
 		req->dr_bio[slot] = NULL;
 	}
 
 	return (req);
 }
 
 /*
  * Drop every bio still attached to dr, then free dr itself using the same
  * size computation as vdev_classic_dio_alloc().
  */
 static void
 vdev_classic_dio_free(dio_request_t *dr)
 {
 	for (int slot = 0; slot < dr->dr_bio_count; slot++) {
 		struct bio *attached = dr->dr_bio[slot];
 
 		if (attached)
 			bio_put(attached);
 	}
 
 	kmem_free(dr, sizeof (dio_request_t) +
 	    sizeof (struct bio *) * dr->dr_bio_count);
 }
 
 /*
  * Take a reference on dr.  Each reference is released with
  * vdev_classic_dio_put().
  */
 static void
 vdev_classic_dio_get(dio_request_t *dr)
 {
 	atomic_inc(&dr->dr_ref);
 }
 
 /*
  * Drop a reference on dr.  When the last reference goes away the request
  * is freed and, if it has a parent zio, the recorded error is stored on
  * that zio and zio_delay_interrupt() is called for it -- exactly once.
  */
 static void
 vdev_classic_dio_put(dio_request_t *dr)
 {
 	/* Not the last reference: nothing more to do. */
 	if (atomic_dec_return(&dr->dr_ref) != 0)
 		return;
 
 	/* Copy out what is still needed before dr is freed. */
 	zio_t *parent = dr->dr_zio;
 	int status = dr->dr_error;
 
 	vdev_classic_dio_free(dr);
 
 	if (parent == NULL)
 		return;
 
 	parent->io_error = status;
 	ASSERT3S(parent->io_error, >=, 0);
 	if (parent->io_error)
 		vdev_disk_error(parent);
 
 	zio_delay_interrupt(parent);
 }
 
 /*
  * bi_end_io callback for bios built by vdev_classic_physio(). Records the
  * bio's status on the owning request unless an earlier error is already
  * stored, then drops the reference taken when the bio was set up.
  */
 static void
 vdev_classic_physio_completion(struct bio *bio)
 {
 	dio_request_t *owner = bio->bi_private;
 
 	/* First error wins; later results never overwrite it */
 	if (owner->dr_error == 0)
 		owner->dr_error = bi_status_to_errno(bio->bi_status);
 
 	/* Drop reference acquired by vdev_classic_physio */
 	vdev_classic_dio_put(owner);
 }
 
 /*
  * Number of segments to request for the next bio: the page count of the
  * remaining bio_size bytes of the zio's ABD starting at abd_offset, capped
  * by bio_max_segs() where available and by BIO_MAX_PAGES otherwise.
  */
 static inline unsigned int
 vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
 {
 	unsigned long pages_left;
 
 	pages_left = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset);
 
 #ifdef HAVE_BIO_MAX_SEGS
 	return (bio_max_segs(pages_left));
 #else
 	return (MIN(pages_left, BIO_MAX_PAGES));
 #endif
 }
 
 /*
  * Classic submission path for a read or write zio: map the zio's ABD onto
  * one or more bios, submit them all, and track them with a refcounted
  * dio_request_t.
  *
  * Returns EIO if the request extends past the end of the block device and
  * ENOMEM if a bio cannot be allocated; in both cases nothing has been
  * submitted. Otherwise returns 0, and the zio is completed from
  * vdev_classic_dio_put() when the last reference is dropped.
  */
 static int
 vdev_classic_physio(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
 	size_t io_size = zio->io_size;
 	uint64_t io_offset = zio->io_offset;
 	int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
 	int flags = 0;
 
 	dio_request_t *dr;
 	uint64_t abd_offset;
 	uint64_t bio_offset;
 	int bio_size;
 	int bio_count = 16;
 	int error = 0;
 	struct blk_plug plug;
 	unsigned short nr_vecs;
 
 	/*
 	 * Accessing outside the block device is never allowed.
 	 */
 	if (io_offset + io_size > bdev_capacity(bdev)) {
 		vdev_dbgmsg(zio->io_vd,
 		    "Illegal access %llu size %llu, device size %llu",
 		    (u_longlong_t)io_offset,
 		    (u_longlong_t)io_size,
 		    (u_longlong_t)bdev_capacity(bdev));
 		return (SET_ERROR(EIO));
 	}
 
 	/* Re-entered with a doubled bio_count when the array was too small */
 retry:
 	dr = vdev_classic_dio_alloc(bio_count);
 
 	/* Failfast is only requested for first-attempt, non-tryhard I/O */
 	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
 	    zio->io_vd->vdev_failfast == B_TRUE) {
 		bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
 		    zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
 	}
 
 	dr->dr_zio = zio;
 
 	/*
 	 * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
 	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
 	 * can cover at least 128KB and at most 1MB.  When the required number
 	 * of iovec's exceeds this, we are forced to break the IO in multiple
 	 * bio's and wait for them all to complete.  This is likely if the
 	 * recordsize property is increased beyond 1MB.  The default
 	 * bio_count=16 should typically accommodate the maximum-size zio of
 	 * 16MB.
 	 */
 
 	abd_offset = 0;
 	bio_offset = io_offset;
 	bio_size = io_size;
 	/*
 	 * The "<=" bound is deliberate: the extra iteration with
 	 * i == dr_bio_count is what detects that data remains but every
 	 * slot has been used, which triggers the retry below.
 	 */
 	for (int i = 0; i <= dr->dr_bio_count; i++) {
 
 		/* Finished constructing bio's for given buffer */
 		if (bio_size <= 0)
 			break;
 
 		/*
 		 * If additional bio's are required, we have to retry, but
 		 * this should be rare - see the comment above.
 		 */
 		if (dr->dr_bio_count == i) {
 			vdev_classic_dio_free(dr);
 			bio_count *= 2;
 			goto retry;
 		}
 
 		nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
 		dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
 		if (unlikely(dr->dr_bio[i] == NULL)) {
 			/* Nothing submitted yet; free releases earlier bios */
 			vdev_classic_dio_free(dr);
 			return (SET_ERROR(ENOMEM));
 		}
 
 		/* Matching put called by vdev_classic_physio_completion */
 		vdev_classic_dio_get(dr);
 
 		/* Byte offset shifted right by 9 to give the bio's sector */
 		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
 		dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
 		dr->dr_bio[i]->bi_private = dr;
 		bio_set_op_attrs(dr->dr_bio[i], rw, flags);
 
 		/* Remaining size is returned to become the new size */
 		bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
 		    bio_size, abd_offset);
 
 		/* Advance in buffer and construct another bio if needed */
 		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
 		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
 	}
 
 	/* Extra reference to protect dio_request during vdev_submit_bio */
 	vdev_classic_dio_get(dr);
 
 	/* Plugging is keyed on the array size, not the number of bios built */
 	if (dr->dr_bio_count > 1)
 		blk_start_plug(&plug);
 
 	/* Submit all bio's associated with this dio */
 	for (int i = 0; i < dr->dr_bio_count; i++) {
 		if (dr->dr_bio[i])
 			vdev_submit_bio(dr->dr_bio[i]);
 	}
 
 	if (dr->dr_bio_count > 1)
 		blk_finish_plug(&plug);
 
 	/* Drop the submission reference; may complete the zio right here */
 	vdev_classic_dio_put(dr);
 
 	/* error is never assigned after its initialisation, so this is 0 */
 	return (error);
 }
 
 /* ========== */
 
 /*
  * bi_end_io callback for the flush bio. Stores the bio's status on the
  * zio (EOPNOTSUPP and ENOTTY are both reported as ENOTSUP), releases the
  * bio and completes the zio.
  */
 static void
 vdev_disk_io_flush_completion(struct bio *bio)
 {
 	zio_t *zio = bio->bi_private;
 
 	zio->io_error = bi_status_to_errno(bio->bi_status);
 	switch (zio->io_error) {
 	case EOPNOTSUPP:
 	case ENOTTY:
 		zio->io_error = SET_ERROR(ENOTSUP);
 		break;
 	default:
 		break;
 	}
 
 	bio_put(bio);
 	ASSERT3S(zio->io_error, >=, 0);
 	if (zio->io_error != 0)
 		vdev_disk_error(zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Build and submit a flush bio for bdev on behalf of zio.
  *
  * Returns ENXIO if the device has no request queue and ENOMEM if the bio
  * cannot be allocated. On success returns 0; the zio is then completed by
  * vdev_disk_io_flush_completion().
  */
 static int
 vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 {
 	struct bio *flush_bio;
 
 	if (bdev_get_queue(bdev) == NULL)
 		return (SET_ERROR(ENXIO));
 
 	flush_bio = vdev_bio_alloc(bdev, GFP_NOIO, 0);
 	if (unlikely(flush_bio == NULL))
 		return (SET_ERROR(ENOMEM));
 
 	flush_bio->bi_private = zio;
 	flush_bio->bi_end_io = vdev_disk_io_flush_completion;
 	bio_set_flush(flush_bio);
 	vdev_submit_bio(flush_bio);
 	invalidate_bdev(bdev);
 
 	return (0);
 }
 
 /*
  * bi_end_io callback for asynchronous discard/secure-erase bios. Stores
  * the bio's status on the zio, releases the bio and completes the zio.
  */
 static void
 vdev_disk_discard_end_io(struct bio *bio)
 {
 	zio_t *owner = bio->bi_private;
 
 	owner->io_error = bi_status_to_errno(bio->bi_status);
 	bio_put(bio);
 
 	if (owner->io_error != 0)
 		vdev_disk_error(owner);
 
 	zio_interrupt(owner);
 }
 
 /*
  * Wrappers for the different secure erase and discard APIs. We use async
  * when available; in this case, *biop is set to the last bio in the chain.
  */
 static int
 vdev_bdev_issue_secure_erase(zfs_bdev_handle_t *bdh, sector_t sector,
     sector_t nsect, struct bio **biop)
 {
 	int rc;
 
 	/* Stays NULL unless the async variant below hands back a bio */
 	*biop = NULL;
 
 #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE)
 	rc = blkdev_issue_secure_erase(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
 	rc = __blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE, biop);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
 	rc = blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, BLKDEV_DISCARD_SECURE);
 #else
 #error "unsupported kernel"
 #endif
 
 	return (rc);
 }
 
 static int
 vdev_bdev_issue_discard(zfs_bdev_handle_t *bdh, sector_t sector,
     sector_t nsect, struct bio **biop)
 {
 	int rc;
 
 	/* Stays NULL unless one of the async variants hands back a bio */
 	*biop = NULL;
 
 #if defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS)
 	rc = __blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, 0, biop);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS)
 	rc = __blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, biop);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS)
 	rc = blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS, 0);
 #elif defined(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS)
 	rc = blkdev_issue_discard(BDH_BDEV(bdh),
 	    sector, nsect, GFP_NOFS);
 #else
 #error "unsupported kernel"
 #endif
 
 	return (rc);
 }
 
 /*
  * Entry point for TRIM ops. This calls the right wrapper for secure erase or
  * discard, and then does the appropriate finishing work for error vs success
  * and async vs sync.
  */
 static int
 vdev_disk_io_trim(zio_t *zio)
 {
 	vdev_disk_t *vd = zio->io_vd->vdev_tsd;
 	zfs_bdev_handle_t *bdh = vd->vd_bdh;
 	sector_t first_sect = zio->io_offset >> 9;
 	sector_t nr_sects = zio->io_size >> 9;
 	struct bio *chain;
 	int rc;
 
 	if ((zio->io_trim_flags & ZIO_TRIM_SECURE) != 0) {
 		rc = vdev_bdev_issue_secure_erase(bdh, first_sect, nr_sects,
 		    &chain);
 	} else {
 		rc = vdev_bdev_issue_discard(bdh, first_sect, nr_sects,
 		    &chain);
 	}
 
 	/* Sign is flipped here; presumably the wrappers return -errno */
 	if (rc != 0)
 		return (SET_ERROR(-rc));
 
 	/*
 	 * No bio handed back: the op was synchronous and has already
 	 * succeeded, so the zio can go straight back to ZFS.
 	 */
 	if (chain == NULL) {
 		zio_interrupt(zio);
 		return (0);
 	}
 
 	/*
 	 * Asynchronous op: attach our completion callback to the returned
 	 * bio and issue it.
 	 */
 	chain->bi_private = zio;
 	chain->bi_end_io = vdev_disk_discard_end_io;
 	vdev_submit_bio(chain);
 
 	return (0);
 }
 
 /*
  * Read/write submission function used by vdev_disk_io_start(). Starts out
  * NULL and is pointed at either vdev_classic_physio or vdev_disk_io_rw by
  * vdev_disk_param_set_classic() or, failing that, by vdev_disk_init().
  */
 int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
 
 /*
  * Start a zio on a disk vdev, dispatching on zio->io_type (flush, trim,
  * read/write). vd_lock is held as reader while the request is issued and
  * is always released before this function returns.
  *
  * When the request was issued successfully the zio is completed later by
  * the relevant completion path. When it could not be issued, io_error is
  * set and the zio is handed back here, via zio_execute() for flush and
  * trim and via zio_interrupt() for every other case.
  */
 static void
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 	int error;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
 	 * Nothing to be done here but return failure.
 	 */
 	if (vd == NULL) {
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	rw_enter(&vd->vd_lock, RW_READER);
 
 	/*
 	 * If the vdev is closed, it's likely due to a failed reopen and is
 	 * in the UNAVAIL state.  Nothing to be done here but return failure.
 	 */
 	if (vd->vd_bdh == NULL) {
 		rw_exit(&vd->vd_lock);
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_FLUSH:
 
 		if (!vdev_readable(v)) {
 			/* Drive not there, can't flush */
 			error = SET_ERROR(ENXIO);
 		} else if (zfs_nocacheflush) {
 			/* Flushing disabled by operator, declare success */
 			error = 0;
 		} else if (v->vdev_nowritecache) {
 			/* This vdev not capable of flushing */
 			error = SET_ERROR(ENOTSUP);
 		} else {
 			/*
 			 * Issue the flush. If successful, the response will
 			 * be handled in the completion callback, so we're done.
 			 */
 			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
 			if (error == 0) {
 				rw_exit(&vd->vd_lock);
 				return;
 			}
 		}
 
 		/*
 		 * No flush bio is in flight, so set the result and return it.
 		 * This path is also taken with error == 0 when flushing is
 		 * disabled by zfs_nocacheflush.
 		 */
 		rw_exit(&vd->vd_lock);
 		zio->io_error = error;
 		zio_execute(zio);
 		return;
 
 	case ZIO_TYPE_TRIM:
 		/* On success, vdev_disk_io_trim() arranges zio completion */
 		error = vdev_disk_io_trim(zio);
 		rw_exit(&vd->vd_lock);
 		if (error) {
 			zio->io_error = error;
 			zio_execute(zio);
 		}
 		return;
 
 	case ZIO_TYPE_READ:
 	case ZIO_TYPE_WRITE:
 		zio->io_target_timestamp = zio_handle_io_delay(zio);
 		/* Submission function selected by zfs_vdev_disk_classic */
 		error = vdev_disk_io_rw_fn(zio);
 		rw_exit(&vd->vd_lock);
 		if (error) {
 			zio->io_error = error;
 			zio_interrupt(zio);
 		}
 		return;
 
 	default:
 		/*
 		 * Getting here means our parent vdev has made a very strange
 		 * request of us, and shouldn't happen. Assert here to force a
 		 * crash in dev builds, but in production return the IO
 		 * unhandled. The pool will likely suspend anyway but that's
 		 * nicer than crashing the kernel.
 		 */
 		ASSERT3S(zio->io_type, ==, -1);
 
 		rw_exit(&vd->vd_lock);
 		zio->io_error = SET_ERROR(ENOTSUP);
 		zio_interrupt(zio);
 		return;
 	}
 
 	/* Every switch arm above returns */
 	__builtin_unreachable();
 }
 
 /*
  * Post-completion cleanup for a disk zio: tear down any vbio left on the
  * zio and, if the I/O failed with EIO, check whether the disk is still
  * there.
  */
 static void
 vdev_disk_io_done(zio_t *zio)
 {
 	vbio_t *vbio = zio->io_bio;
 
 	/* If this was a read or write, we need to clean up the vbio */
 	if (vbio != NULL) {
 		zio->io_bio = NULL;
 
 		/*
 		 * If we copied the ABD before issuing it, clean up and return
 		 * the copy to the ABD, with changes if appropriate.
 		 */
 		if (vbio->vbio_abd != NULL) {
 			if (zio->io_type == ZIO_TYPE_READ) {
 				abd_copy(zio->io_abd, vbio->vbio_abd,
 				    zio->io_size);
 			}
 
 			abd_free(vbio->vbio_abd);
 			vbio->vbio_abd = NULL;
 		}
 
 		/* Final cleanup */
 		kmem_free(vbio, sizeof (vbio_t));
 	}
 
 	/* Only an EIO result needs the media check below */
 	if (zio->io_error != EIO)
 		return;
 
 	/*
 	 * The device returned EIO, so revalidate the media.  If it is
 	 * determined the media has changed this triggers the asynchronous
 	 * removal of the device from the configuration.
 	 */
 	vdev_t *v = zio->io_vd;
 	vdev_disk_t *vd = v->vdev_tsd;
 
 	if (zfs_check_disk_status(BDH_BDEV(vd->vd_bdh)))
 		return;
 
 	invalidate_bdev(BDH_BDEV(vd->vd_bdh));
 	v->vdev_remove_wanted = B_TRUE;
 	spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
 }
 
 /*
  * Hold hook for disk vdevs. Apart from validating its preconditions this
  * currently does no work.
  */
 static void
 vdev_disk_hold(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 
 	/* We must have a pathname, and it must be absolute. */
 	if (vd->vdev_path == NULL)
 		return;
 	if (vd->vdev_path[0] != '/')
 		return;
 
 	/*
 	 * Only prefetch path and devid info if the device has
 	 * never been opened.
 	 */
 	if (vd->vdev_tsd != NULL)
 		return;
 }
 
 /*
  * Release hook for disk vdevs. Currently a placeholder: it only asserts
  * that the caller holds SCL_STATE as writer.
  */
 static void
 vdev_disk_rele(vdev_t *vd)
 {
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
 
 	/* XXX: Implement me as a vnode rele for the device */
 }
 
 /*
  * BIO submission method. See comment above about vdev_classic.
  * Set zfs_vdev_disk_classic=0 for new, =1 for classic
  */
 static uint_t zfs_vdev_disk_classic = 0;	/* default new */
 
 /*
  * Setter for the zfs_vdev_disk_classic module parameter: store the new
  * value, then select the matching submission function and log the choice.
  */
 static int
 vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
 {
 	const char *method;
 	int rc;
 
 	rc = param_set_uint(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (zfs_vdev_disk_classic) {
 		vdev_disk_io_rw_fn = vdev_classic_physio;
 		method = "classic";
 	} else {
 		vdev_disk_io_rw_fn = vdev_disk_io_rw;
 		method = "new";
 	}
 
 	printk(KERN_INFO "ZFS: forcing %s BIO submission\n", method);
 
 	return (0);
 }
 
 /*
  * At first vdev use, set the submission function from the default value if
  * it hasn't been set already.
  */
 static int
 vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	(void) spa;
 	(void) nv;
 	(void) tsd;
 
 	/* Already chosen, e.g. by the module parameter setter */
 	if (vdev_disk_io_rw_fn != NULL)
 		return (0);
 
 	if (zfs_vdev_disk_classic)
 		vdev_disk_io_rw_fn = vdev_classic_physio;
 	else
 		vdev_disk_io_rw_fn = vdev_disk_io_rw;
 
 	return (0);
 }
 
 /*
  * Operations vector for disk vdevs (VDEV_TYPE_DISK, a leaf vdev type).
  * Hooks set to NULL are not provided by this vdev type.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = vdev_disk_init,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_disk_open,
 	.vdev_op_close = vdev_disk_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_disk_io_start,
 	.vdev_op_io_done = vdev_disk_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_disk_hold,
 	.vdev_op_rele = vdev_disk_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
 	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
 };
 
 /*
  * The zfs_vdev_scheduler module option has been deprecated. Setting this
  * value no longer has any effect.  It has not yet been entirely removed
  * to allow the module to be loaded if this option is specified in the
  * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
  */
 static int
 param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
 {
 	int rc = param_set_charp(val, kp);
 
 	/* Failed to store the value: report that and skip the notice */
 	if (rc != 0)
 		return (rc);
 
 	printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
 	    "is not supported.\n");
 
 	return (0);
 }
 
 static const char *zfs_vdev_scheduler = "unused";
 module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
     param_get_charp, &zfs_vdev_scheduler, 0644);
 MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
 
 /*
  * Setter for the minimum auto-ashift parameter. The new value must parse
  * as an unsigned integer and lie within
  * [ASHIFT_MIN, zfs_vdev_max_auto_ashift]; otherwise it is rejected and
  * nothing is stored.
  */
 int
 param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 {
 	uint_t ashift;
 	int rc;
 
 	rc = kstrtouint(buf, 0, &ashift);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (!(ashift >= ASHIFT_MIN && ashift <= zfs_vdev_max_auto_ashift))
 		return (SET_ERROR(-EINVAL));
 
 	rc = param_set_uint(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 /*
  * Setter for the maximum auto-ashift parameter. The new value must parse
  * as an unsigned integer and lie within
  * [zfs_vdev_min_auto_ashift, ASHIFT_MAX]; otherwise it is rejected and
  * nothing is stored.
  */
 int
 param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
 {
 	uint_t ashift;
 	int rc;
 
 	rc = kstrtouint(buf, 0, &ashift);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	if (!(ashift <= ASHIFT_MAX && ashift >= zfs_vdev_min_auto_ashift))
 		return (SET_ERROR(-EINVAL));
 
 	rc = param_set_uint(buf, kp);
 	if (rc < 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
 	"Timeout before determining that a device is missing");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
 	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
 
 ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
 	"Maximum number of data segments to add to an IO request (min 4)");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
     vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
 	"Use classic BIO submission method");
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 8e7138984e2e..58f0975e166f 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1,6285 +1,6395 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/brt.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/zap.h>
 #include <sys/btree.h>
 
 /*
  * Non-zero when flags has either of the gang allocation bits
  * (METASLAB_GANG_CHILD or METASLAB_GANG_HEADER) set.
  */
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
 /*
  * Metaslab group's per child vdev granularity, in bytes.  This is roughly
  * similar to what would be referred to as the "stripe size" in traditional
  * RAID arrays. In normal operation, we will try to write this amount of
  * data to each disk before moving on to the next top-level vdev.
  */
 static uint64_t metaslab_aliquot = 2 * 1024 * 1024;
 
 /*
  * For testing, make some blocks above a certain size be gang blocks.
  */
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
 
 /*
  * Of blocks of size >= metaslab_force_ganging, actually gang them this often.
  */
 uint_t metaslab_force_ganging_pct = 3;
 
 /*
  * In pools where the log space map feature is not enabled we touch
  * multiple metaslabs (and their respective space maps) with each
  * transaction group. Thus, we benefit from having a small space map
  * block size since it allows us to issue more I/O operations scattered
  * around the disk. So a sane default for the space map block size
  * is 8~16K.
  */
 int zfs_metaslab_sm_blksz_no_log = (1 << 14);
 
 /*
  * When the log space map feature is enabled, we accumulate a lot of
  * changes per metaslab that are flushed once in a while so we benefit
  * from a bigger block size like 128K for the metaslab space maps.
  */
 int zfs_metaslab_sm_blksz_with_log = (1 << 17);
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 uint_t zfs_condense_pct = 200;
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
  * same number of blocks after condensing. Since the goal of condensing is to
  * reduce the number of IOPs required to read the space map, we only want to
  * condense when we can be sure we will reduce the number of blocks used by the
  * space map. Unfortunately, we cannot precisely compute whether or not this is
  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  * we apply the following heuristic: do not condense a spacemap unless the
  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  * blocks.
  */
 static const int zfs_metaslab_condense_block_threshold = 4;
 
 /*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * free space. Metaslab groups that have more free space than
  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  * a metaslab group's free space is less than or equal to the
  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  * groups are allowed to accept allocations. Gang blocks are always
  * eligible to allocate on any metaslab group. The default value of 0 means
  * no metaslab group will be excluded based on this criterion.
  */
 static uint_t zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab groups are considered eligible for allocations if their
  * fragmentation metric (measured as a percentage) is less than or
  * equal to zfs_mg_fragmentation_threshold. If a metaslab group
  * exceeds this threshold then it will be skipped unless all metaslab
  * groups within the metaslab class have also crossed this threshold.
  *
  * This tunable was introduced to avoid edge cases where we continue
  * allocating from very fragmented disks in our pool while other, less
  * fragmented disks exist. On the other hand, if all disks in the
  * pool are uniformly approaching the threshold, the threshold can
  * be a speed bump in performance, where we keep switching the disks
  * that we allocate from (e.g. we allocate some segments from disk A,
  * pushing it past the threshold, while freeing segments from disk
  * B, bringing its fragmentation below the threshold).
  *
  * Empirically, we've seen that our vdev selection for allocations is
  * good enough that fragmentation increases uniformly across all vdevs
  * the majority of the time. Thus we set the threshold percentage high
  * enough to avoid hitting the speed bump on pools that are being pushed
  * to the edge.
  */
 static uint_t zfs_mg_fragmentation_threshold = 95;
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation
  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  * active metaslab that exceeds this threshold will no longer keep its active
  * status allowing better metaslabs to be selected.
  */
 static uint_t zfs_metaslab_fragmentation_threshold = 77;
 
 /*
  * When set will load all metaslabs when pool is first opened.
  */
 int metaslab_debug_load = B_FALSE;
 
 /*
  * When set will prevent metaslabs from being unloaded.
  */
 static int metaslab_debug_unload = B_FALSE;
 
 /*
  * Minimum size which forces the dynamic allocator to change
  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using a more
  * aggressive strategy (i.e. search by size rather than offset).
  */
 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 uint_t metaslab_df_free_pct = 4;
 
 /*
  * Maximum distance to search forward from the last offset. Without this
  * limit, fragmented pools can see >100,000 iterations and
  * metaslab_block_picker() becomes the performance limiting factor on
  * high-performance storage.
  *
  * With the default setting of 16MB, we typically see less than 500
  * iterations, even with very fragmented, ashift=9 pools. The maximum number
  * of iterations possible is:
  *     metaslab_df_max_search / (2 * (1<<ashift))
  * With the default setting of 16MB this is 16*1024 (with ashift=9) or
  * 2048 (with ashift=12).
  */
 static uint_t metaslab_df_max_search = 16 * 1024 * 1024;
 
 /*
  * Forces the metaslab_block_picker function to search for at least this many
  * segments forwards until giving up on finding a segment that the allocation
  * will fit into.
  */
 static const uint32_t metaslab_min_search_count = 100;
 
 /*
  * If we are not searching forward (due to metaslab_df_max_search,
  * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
  * controls what segment is used.  If it is set, we will use the largest free
  * segment.  If it is not set, we will use a segment of exactly the requested
  * size (or larger).
  */
 static int metaslab_df_use_largest_segment = B_FALSE;
 
 /*
  * These tunables control how long a metaslab will remain loaded after the
  * last allocation from it.  A metaslab can't be unloaded until at least
  * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
  * have elapsed.  However, zfs_metaslab_mem_limit may cause it to be
  * unloaded sooner.  These settings are intended to be generous -- to keep
  * metaslabs loaded for a long time, reducing the rate of metaslab loading.
  */
 static uint_t metaslab_unload_delay = 32;
 static uint_t metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
 
 /*
  * Max number of metaslabs per group to preload.
  */
 uint_t metaslab_preload_limit = 10;
 
 /*
  * Enable/disable preloading of metaslab.
  */
 static int metaslab_preload_enabled = B_TRUE;
 
 /*
  * Enable/disable fragmentation weighting on metaslabs.
  */
 static int metaslab_fragmentation_factor_enabled = B_TRUE;
 
 /*
  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  */
 static int metaslab_lba_weighting_enabled = B_TRUE;
 
 /*
  * Enable/disable space-based metaslab group biasing.
  */
 static int metaslab_bias_enabled = B_TRUE;
 
 /*
  * Control performance-based metaslab group biasing.
  */
 static int metaslab_perf_bias = 1;
 
 /*
  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  */
 static const boolean_t zfs_remap_blkptr_enable = B_TRUE;
 
 /*
  * Enable/disable segment-based metaslab selection.
  */
 static int zfs_metaslab_segment_weight_enabled = B_TRUE;
 
 /*
  * When using segment-based metaslab selection, we will continue
  * allocating from the active metaslab until we have exhausted
  * zfs_metaslab_switch_threshold of its buckets.
  */
 static int zfs_metaslab_switch_threshold = 2;
 
 /*
  * Internal switch to enable/disable the metaslab allocation tracing
  * facility.
  */
 static const boolean_t metaslab_trace_enabled = B_FALSE;
 
 /*
  * Maximum entries that the metaslab allocation tracing facility will keep
  * in a given list when running in non-debug mode. We limit the number
  * of entries in non-debug mode to prevent us from using up too much memory.
  * The limit should be sufficiently large that we don't expect any allocation
  * to ever exceed this value. In debug mode, the system will panic if this
  * limit is ever reached allowing for further investigation.
  */
 static const uint64_t metaslab_trace_max_entries = 5000;
 
 /*
  * Maximum number of metaslabs per group that can be disabled
  * simultaneously.
  */
 static const int max_disabled_ms = 3;
 
 /*
  * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
  * To avoid 64-bit overflow, don't set above UINT32_MAX.
  */
 static uint64_t zfs_metaslab_max_size_cache_sec = 1 * 60 * 60; /* 1 hour */
 
 /*
  * Maximum percentage of memory to use on storing loaded metaslabs. If loading
  * a metaslab would take it over this percentage, the oldest selected metaslab
  * is automatically unloaded.
  */
 static uint_t zfs_metaslab_mem_limit = 25;
 
 /*
  * Force the per-metaslab range trees to use 64-bit integers to store
  * segments. Used for debugging purposes.
  */
 static const boolean_t zfs_metaslab_force_large_segs = B_FALSE;
 
 /*
  * By default we only store segments over a certain size in the size-sorted
  * metaslab trees (ms_allocatable_by_size and
  * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
  * improves load and unload times at the cost of causing us to use slightly
  * larger segments than we would otherwise in some cases.
  */
 static const uint32_t metaslab_by_size_min_shift = 14;
 
 /*
  * If not set, we will first try normal allocation.  If that fails then
  * we will do a gang allocation.  If that fails then we will do a "try hard"
  * gang allocation.  If that fails then we will have a multi-layer gang
  * block.
  *
  * If set, we will first try normal allocation.  If that fails then
  * we will do a "try hard" allocation.  If that fails we will do a gang
  * allocation.  If that fails we will do a "try hard" gang allocation.  If
  * that fails then we will have a multi-layer gang block.
  */
 static int zfs_metaslab_try_hard_before_gang = B_FALSE;
 
 /*
  * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
  * metaslabs.  This improves performance, especially when there are many
  * metaslabs per vdev and the allocation can't actually be satisfied (so we
  * would otherwise iterate all the metaslabs).  If there is a metaslab with a
  * worse weight but it can actually satisfy the allocation, we won't find it
  * until trying hard.  This may happen if the worse metaslab is not loaded
  * (and the true weight is better than we have calculated), or due to weight
  * bucketization.  E.g. we are looking for a 60K segment, and the best
  * metaslabs all have free segments in the 32-63K bucket, but the best
  * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
  * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
  * bucket, and therefore a lower weight).
  */
 static uint_t zfs_metaslab_find_max_tries = 100;
 
 static uint64_t metaslab_weight(metaslab_t *, boolean_t);
 static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
 static unsigned int metaslab_idx_func(multilist_t *, void *);
 static void metaslab_evict(metaslab_t *, uint64_t);
 static void metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs,
     void *arg);
 kmem_cache_t *metaslab_alloc_trace_cache;
 
 /*
  * Named 64-bit counters published through the "metaslab_stats" kstat
  * (see metaslab_stat_init()). The initializer below must list the entries
  * in the same order as the struct members.
  */
 typedef struct metaslab_stats {
 	kstat_named_t metaslabstat_trace_over_limit;
 	kstat_named_t metaslabstat_reload_tree;
 	kstat_named_t metaslabstat_too_many_tries;
 	kstat_named_t metaslabstat_try_hard;
 } metaslab_stats_t;
 
 static metaslab_stats_t metaslab_stats = {
 	{ "trace_over_limit",		KSTAT_DATA_UINT64 },
 	{ "reload_tree",		KSTAT_DATA_UINT64 },
 	{ "too_many_tries",		KSTAT_DATA_UINT64 },
 	{ "try_hard",			KSTAT_DATA_UINT64 },
 };
 
 /*
  * Atomically increment one metaslab_stats counter; stat is the member
  * name. NOTE(review): the expansion already ends in a semicolon, so an
  * unbraced "if (...) METASLABSTAT_BUMP(x); else ..." would not compile.
  */
 #define	METASLABSTAT_BUMP(stat) \
 	atomic_inc_64(&metaslab_stats.stat.value.ui64);
 
 
 /* kstat handle for metaslab_stats; NULL when it has not been created */
 static kstat_t *metaslab_ksp;
 
 void
 metaslab_stat_init(void)
 {
 	ASSERT(metaslab_alloc_trace_cache == NULL);
 	metaslab_alloc_trace_cache = kmem_cache_create(
 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 	    0, NULL, NULL, NULL, NULL, NULL, 0);
 	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
 	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	if (metaslab_ksp != NULL) {
 		metaslab_ksp->ks_data = &metaslab_stats;
 		kstat_install(metaslab_ksp);
 	}
 }
 
 void
 metaslab_stat_fini(void)
 {
 	if (metaslab_ksp != NULL) {
 		kstat_delete(metaslab_ksp);
 		metaslab_ksp = NULL;
 	}
 
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
 
 /*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 /*
  * Allocate and initialize a metaslab class for `spa` that allocates with
  * `ops`.  The structure is sized to carry one metaslab_class_allocator_t
  * per spa allocator (spa_alloc_count).  The returned class must be released
  * with metaslab_class_destroy().
  */
 metaslab_class_t *
 metaslab_class_create(spa_t *spa, const metaslab_ops_t *ops, boolean_t is_log)
 {
 	const size_t mc_size = offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]);
 	metaslab_class_t *mc = kmem_zalloc(mc_size, KM_SLEEP);
 
 	mc->mc_spa = spa;
 	mc->mc_ops = ops;
 	mc->mc_is_log = is_log;
 	mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE;
 	mc->mc_alloc_max = UINT64_MAX;
 
 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 	multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t),
 	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
 
 	/* Per-allocator state: lock, zio tree, empty rotor, no reservation. */
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		metaslab_class_allocator_t *mca = &mc->mc_allocator[a];
 
 		mutex_init(&mca->mca_lock, NULL, MUTEX_DEFAULT, NULL);
 		avl_create(&mca->mca_tree, zio_bookmark_compare,
 		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 		mca->mca_reserved = 0;
 		mca->mca_rotor = NULL;
 	}
 
 	return (mc);
 }
 
 /*
  * Tear down a class created by metaslab_class_create().  The class must
  * already be empty: no space accounted and no rotor or reservation left in
  * any allocator.
  */
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	const size_t mc_size = offsetof(metaslab_class_t,
 	    mc_allocator[spa->spa_alloc_count]);
 
 	ASSERT(mc->mc_alloc == 0);
 	ASSERT(mc->mc_deferred == 0);
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		metaslab_class_allocator_t *mca = &mc->mc_allocator[a];
 
 		avl_destroy(&mca->mca_tree);
 		mutex_destroy(&mca->mca_lock);
 		ASSERT(mca->mca_rotor == NULL);
 		ASSERT0(mca->mca_reserved);
 	}
 
 	mutex_destroy(&mc->mc_lock);
 	multilist_destroy(&mc->mc_metaslab_txg_list);
 	kmem_free(mc, mc_size);
 }
 
 /*
  * Debug-only consistency check of a class (compiled out without ZFS_DEBUG).
  * For every allocator it asserts that nothing is queued or reserved, then
  * walks the rotor ring checking that each group belongs to this class, sits
  * on a non-hole top-level vdev and has a zero queue depth.
  */
 void
 metaslab_class_validate(metaslab_class_t *mc)
 {
 #ifdef ZFS_DEBUG
 	spa_t *spa = mc->mc_spa;
 
 	/*
 	 * Must hold one of the spa_config locks.
 	 */
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(spa, SCL_ALL, RW_WRITER));
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
 
 		ASSERT0(avl_numnodes(&mca->mca_tree));
 		ASSERT0(mca->mca_reserved);
 
 		metaslab_group_t *rotor = mca->mca_rotor;
 		if (rotor == NULL)
 			continue;
 
 		/* The groups form a ring; stop once we are back at the rotor. */
 		metaslab_group_t *mg = rotor;
 		do {
 			metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 			vdev_t *vd = mg->mg_vd;
 
 			ASSERT3P(vd->vdev_top, ==, vd);
 			ASSERT(vd->vdev_mg == mg || vd->vdev_log_mg == mg);
 			ASSERT3P(mg->mg_class, ==, mc);
 			ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 			ASSERT0(zfs_refcount_count(&mga->mga_queue_depth));
 
 			mg = mg->mg_next;
 		} while (mg != rotor);
 	}
 #endif
 }
 
 /*
  * For each metaslab group in a class pre-calculate allocation quota
  * (mg_aliquot) and target queue depth (mg_queue_target) to balance their
  * space usage and write performance.  Based on those pre-calculate the class
  * allocation throttle threshold (mc_alloc_max) for optimal saturation.
  * onsync is true once per TXG to enable/disable allocation throttling and
  * update moving average of maximum I/O size.
  *
  * A class with no initialized groups (mc_groups == 0) gets an unlimited
  * mc_alloc_max and, when onsync is set, has throttling disabled.
  */
 void
 metaslab_class_balance(metaslab_class_t *mc, boolean_t onsync)
 {
 	metaslab_group_t *mg, *first;
 
 	/*
 	 * Must hold one of the spa_config locks.
 	 */
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
 	if (onsync)
 		metaslab_class_validate(mc);
 
 	if (mc->mc_groups == 0) {
 		if (onsync)
 			mc->mc_alloc_throttle_enabled = B_FALSE;
 		mc->mc_alloc_max = UINT64_MAX;
 		return;
 	}
 
 	if (onsync) {
 		/*
 		 * Moving average of maximum allocation size, in absence of
 		 * large allocations shrinking to 1/8 of metaslab_aliquot.
 		 */
 		mc->mc_alloc_io_size = (3 * mc->mc_alloc_io_size +
 		    metaslab_aliquot / 8) / 4;
 		/* Log classes are never throttled. */
 		mc->mc_alloc_throttle_enabled = mc->mc_is_log ? 0 :
 		    zio_dva_throttle_enabled;
 	}
 
 	/*
 	 * First pass over the ring: total number of data (non-parity)
 	 * children of all groups.  The loop exits with mg == first again,
 	 * so the second pass below starts from the same group.
 	 */
 	mg = first = mc->mc_allocator[0].mca_rotor;
 	uint64_t children = 0;
 	do {
 		children += vdev_get_ndisks(mg->mg_vd) -
 		    vdev_get_nparity(mg->mg_vd);
 	} while ((mg = mg->mg_next) != first);
 
 	/* Second pass: compute per-group quota and queue target. */
 	uint64_t sum_aliquot = 0;
 	do {
 		vdev_stat_t *vs = &mg->mg_vd->vdev_stat;
 		uint_t ratio;
 
 		/*
 		 * Scale allocations per iteration with average number of
 		 * children.  Wider vdevs need more sequential allocations
 		 * to keep decent per-child I/O size.
 		 */
 		uint64_t mg_aliquot = MAX(metaslab_aliquot * children /
 		    mc->mc_groups, mc->mc_alloc_io_size * 4);
 
 		/*
 		 * Scale allocations per iteration with the vdev capacity,
 		 * relative to average.  Bigger vdevs should get more to
 		 * fill up at the same time as smaller ones.
 		 * Here ratio is 8 bit fixed-point (256 == average capacity);
 		 * the +1 in the divisor avoids a division by zero.
 		 */
 		if (mc->mc_space > 0 && vs->vs_space > 0) {
 			ratio = vs->vs_space / (mc->mc_space / (mc->mc_groups *
 			    256) + 1);
 			mg_aliquot = mg_aliquot * ratio / 256;
 		}
 
 		/*
 		 * Scale allocations per iteration with the vdev's free space
 		 * fraction, relative to average. Despite the above, vdevs free
 		 * space fractions may get imbalanced, for example due to new
 		 * vdev addition or different performance.  We want free space
 		 * fractions to be similar to postpone fragmentation.
 		 *
 		 * But same time we don't want to throttle vdevs still having
 		 * plenty of free space, that appear faster than others, even
 		 * if that cause temporary imbalance.  Allow them to allocate
 		 * more by keeping their allocation queue depth equivalent to
 		 * 2.5 full iteration, even if they repeatedly drain it. Later
 		 * with the free space reduction gradually reduce the target
 		 * queue depth, stronger enforcing the free space balance.
 		 */
 		if (metaslab_bias_enabled &&
 		    mc->mc_space > 0 && vs->vs_space > 0) {
 			/* Clamp at zero if alloc ever exceeds space. */
 			uint64_t vs_free = vs->vs_space > vs->vs_alloc ?
 			    vs->vs_space - vs->vs_alloc : 0;
 			uint64_t mc_free = mc->mc_space > mc->mc_alloc ?
 			    mc->mc_space - mc->mc_alloc : 0;
 			/*
 			 * vs_fr is 16 bit fixed-point free space fraction.
 			 * mc_fr is 8 bit fixed-point free space fraction.
 			 * ratio as their quotient is 8 bit fixed-point.
 			 */
 			uint_t vs_fr = vs_free / (vs->vs_space / 65536 + 1);
 			uint_t mc_fr = mc_free / (mc->mc_space / 256 + 1);
 			ratio = vs_fr / (mc_fr + 1);
 			mg->mg_aliquot = mg_aliquot * ratio / 256;
 			/* From 2.5x at 25% full to 1x at 75%. */
 			ratio = MIN(163840, vs_fr * 3 + 16384);
 			/* Never let the queue target drop below the quota. */
 			mg->mg_queue_target = MAX(mg->mg_aliquot,
 			    mg->mg_aliquot * ratio / 65536);
 		} else {
 			/* No free-space bias: fixed target of two quotas. */
 			mg->mg_aliquot = mg_aliquot;
 			mg->mg_queue_target = mg->mg_aliquot * 2;
 		}
 		sum_aliquot += mg->mg_aliquot;
 	} while ((mg = mg->mg_next) != first);
 
 	/*
 	 * Set per-class allocation throttle threshold to 4 iterations through
 	 * all the vdevs.  This should keep all vdevs busy even if some are
 	 * allocating more than we planned for them due to bigger blocks or
 	 * better performance.
 	 */
 	mc->mc_alloc_max = sum_aliquot * 4;
 }
 
 /*
  * Decide, after an allocation attempt of psize bytes from group mg by the
  * given allocator, whether that allocator's rotor should advance to the
  * next group in the ring.  success tells whether the allocation succeeded.
  * When the rotor advances, the allocator's running aliquot counter is reset.
  */
 static void
 metaslab_class_rotate(metaslab_group_t *mg, int allocator, uint64_t psize,
     boolean_t success)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 
 	/*
 	 * Exit fast if there is nothing to rotate, we are not following
 	 * the rotor (copies, gangs, etc) or somebody already rotated it.
 	 */
 	if (mc->mc_groups < 2 || mca->mca_rotor != mg)
 		return;
 
 	/*
 	 * Always rotate in case of allocation error or a log class.
 	 */
 	if (!success || mc->mc_is_log)
 		goto rotate;
 
 	/*
 	 * Allocate from this group if we expect next I/O of the same size to
 	 * mostly fit within the allocation quota.  Rotate if we expect it to
 	 * mostly go over the target queue depth.  Meanwhile, to stripe between
 	 * groups in configured amounts per child even if we can't reach the
 	 * target queue depth, i.e. can't saturate the group write performance,
 	 * always rotate after allocating the queue target bytes.
 	 */
 	/* naq: bytes accounted so far plus half of a next same-sized I/O. */
 	uint64_t naq = atomic_add_64_nv(&mca->mca_aliquot, psize) + psize / 2;
 	if (naq < mg->mg_aliquot)
 		return;
 	if (naq >= mg->mg_queue_target)
 		goto rotate;
 	if (zfs_refcount_count(&mga->mga_queue_depth) + psize + psize / 2 >=
 	    mg->mg_queue_target)
 		goto rotate;
 
 	/*
 	 * When the pool is not too busy, prefer restoring the vdev free space
 	 * balance instead of getting maximum speed we might not need, so that
 	 * we could have more flexibility during more busy times later.
 	 */
 	/* metaslab_perf_bias <= 0: always rotate at this point. */
 	if (metaslab_perf_bias <= 0)
 		goto rotate;
 	/* metaslab_perf_bias >= 2: never rotate at this point. */
 	if (metaslab_perf_bias >= 2)
 		return;
 	spa_t *spa = mc->mc_spa;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	if (dp == NULL)
 		return;
 	/*
 	 * "Busy" means dirty data above the midpoint between the async write
 	 * min and max dirty percentages (their sum / 200 of the dirty max),
 	 * or a sync task pending.  A busy pool stays on the current group.
 	 */
 	uint64_t busy_thresh = zfs_dirty_data_max *
 	    (zfs_vdev_async_write_active_min_dirty_percent +
 	    zfs_vdev_async_write_active_max_dirty_percent) / 200;
 	if (dp->dp_dirty_total > busy_thresh || spa_has_pending_synctask(spa))
 		return;
 
 rotate:
 	mca->mca_rotor = mg->mg_next;
 	mca->mca_aliquot = 0;
 }
 
 /*
  * Apply signed deltas to the class-wide space counters.  Each counter is
  * updated with its own atomic add; the four updates are not atomic as a
  * group.
  */
 static void
 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
 	/* Allocated and deferred bytes. */
 	atomic_add_64(&mc->mc_alloc, alloc_delta);
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 
 	/* Total and deflated total space. */
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
 /* Return the class's current mc_alloc counter (read without locking). */
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
 	uint64_t allocated = mc->mc_alloc;
 
 	return (allocated);
 }
 
 /* Return the class's current mc_deferred counter (read without locking). */
 uint64_t
 metaslab_class_get_deferred(metaslab_class_t *mc)
 {
 	uint64_t deferred = mc->mc_deferred;
 
 	return (deferred);
 }
 
 /* Return the class's current mc_space counter (read without locking). */
 uint64_t
 metaslab_class_get_space(metaslab_class_t *mc)
 {
 	uint64_t space = mc->mc_space;
 
 	return (space);
 }
 
 /*
  * Return mc_dspace when spa_deflate() is true for the pool, otherwise the
  * plain mc_space.
  */
 uint64_t
 metaslab_class_get_dspace(metaslab_class_t *mc)
 {
 	if (spa_deflate(mc->mc_spa))
 		return (mc->mc_dspace);
 
 	return (mc->mc_space);
 }
 
 /*
  * When ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags, recompute the class
  * histogram by summing the histograms of every group that belongs to this
  * class and VERIFY that it matches mc_histogram.  Otherwise do nothing.
  */
 void
 metaslab_class_histogram_verify(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	const size_t hist_bytes =
 	    sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE;
 	uint64_t *mc_hist;
 	int i;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	mc_hist = kmem_zalloc(hist_bytes, KM_SLEEP);
 
 	mutex_enter(&mc->mc_lock);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = vdev_get_mg(tvd, mc);
 
 		/*
 		 * Only concrete, initialized top-level vdevs whose group is
 		 * in this metaslab class contribute; holes and everything
 		 * else are skipped.
 		 */
 		boolean_t counted = vdev_is_concrete(tvd) &&
 		    tvd->vdev_ms_shift != 0 && mg->mg_class == mc;
 		if (!counted)
 			continue;
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
 			mc_hist[i] += mg->mg_histogram[i];
 	}
 
 	/* The recomputed buckets must match the incrementally kept ones. */
 	for (i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 	mutex_exit(&mc->mc_lock);
 
 	kmem_free(mc_hist, hist_bytes);
 }
 
 /*
  * Calculate the metaslab class's fragmentation metric. The metric
  * is weighted based on the space contribution of each metaslab group.
  * The return value will be a number between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
  * zfs_frag_table for more information about the metric.
  *
  * ZFS_FRAG_INVALID is also returned when the class currently has no space
  * at all, since there is nothing to weight the metric by (and dividing by
  * the class space would be a division by zero).
  *
  * Takes and releases SCL_VDEV as reader around the vdev walk.
  */
 uint64_t
 metaslab_class_fragmentation(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t fragmentation = 0;
 	uint64_t space;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Skip any holes, uninitialized top-levels,
 		 * or vdevs that are not in this metaslab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * If a metaslab group does not contain a fragmentation
 		 * metric then just bail out.
 		 */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			fragmentation = ZFS_FRAG_INVALID;
 			goto out;
 		}
 
 		/*
 		 * Determine how much this metaslab_group is contributing
 		 * to the overall pool fragmentation metric.
 		 */
 		fragmentation += mg->mg_fragmentation *
 		    metaslab_group_get_space(mg);
 	}
 
 	/*
 	 * An empty class has no meaningful metric; report it as unset
 	 * instead of dividing by zero.
 	 */
 	space = metaslab_class_get_space(mc);
 	if (space == 0) {
 		fragmentation = ZFS_FRAG_INVALID;
 		goto out;
 	}
 	fragmentation /= space;
 
 	ASSERT3U(fragmentation, <=, 100);
 out:
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 	return (fragmentation);
 }
 
 /*
  * Sum, over the top-level vdevs whose group is in this metaslab class, the
  * space each vdev could still grow by (vdev_max_asize - vdev_asize),
  * rounded down to a whole number of metaslabs since that is the unit of
  * expansion.  Takes and releases SCL_VDEV as reader around the walk.
  */
 uint64_t
 metaslab_class_expandable_space(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t space = 0;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/* Only concrete, initialized vdevs of this class count. */
 		boolean_t counted = vdev_is_concrete(tvd) &&
 		    tvd->vdev_ms_shift != 0 && mg->mg_class == mc;
 		if (!counted)
 			continue;
 
 		/* Room to grow, truncated to a multiple of the ms size. */
 		uint64_t ms_size = 1ULL << tvd->vdev_ms_shift;
 		uint64_t growth = tvd->vdev_max_asize - tvd->vdev_asize;
 		space += P2ALIGN_TYPED(growth, ms_size, uint64_t);
 	}
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 
 	return (space);
 }
 
 /*
  * Walk every sublist of the class's mc_metaslab_txg_list from its head and
  * evict metaslabs that have not been selected recently.  A metaslab is
  * evicted when all of the following hold:
  *   - txg is more than metaslab_unload_delay past its ms_selected_txg,
  *   - more than metaslab_unload_delay_ms (rounded up to whole seconds) has
  *     passed since its ms_selected_time,
  *   - it is not assigned to an allocator (ms_allocator == -1), or metaslab
  *     preloading is disabled.
  * Processing of a sublist stops at the first metaslab that fails the test.
  */
 void
 metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
 	multilist_t *ml = &mc->mc_metaslab_txg_list;
 	uint64_t now = gethrestime_sec();
 	/* Round delay up to next second. */
 	uint_t delay = (metaslab_unload_delay_ms + 999) / 1000;
 	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
 		/*
 		 * The sublist lock is held only long enough to fetch the
 		 * head; it is dropped before ms_lock is taken below.
 		 */
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 
 			/*
 			 * If the metaslab has been removed from the list
 			 * (which could happen if we were at the memory limit
 			 * and it was evicted during this loop), then we can't
 			 * proceed and we should restart the sublist.
 			 */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				mutex_exit(&msp->ms_lock);
 				/* Undo the for loop's i++ to redo sublist i. */
 				i--;
 				break;
 			}
 			/* Fetch the successor with ms_lock still held. */
 			mls = multilist_sublist_lock_idx(ml, i);
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			if (txg >
 			    msp->ms_selected_txg + metaslab_unload_delay &&
 			    now > msp->ms_selected_time + delay &&
 			    (msp->ms_allocator == -1 ||
 			    !metaslab_preload_enabled)) {
 				metaslab_evict(msp, txg);
 			} else {
 				/*
 				 * Once we've hit a metaslab selected too
 				 * recently to evict, we're done evicting for
 				 * now.
 				 */
 				mutex_exit(&msp->ms_lock);
 				break;
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 		}
 	}
 }
 
 /*
  * Ordering function for a group's mg_metaslab_tree.  Metaslabs are ordered
  * by activation state first, then by descending weight, then by ascending
  * start offset.
  */
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
 	const metaslab_t *m1 = (const metaslab_t *)x1;
 	const metaslab_t *m2 = (const metaslab_t *)x2;
 
 	/*
 	 * Sort inactive metaslabs first, then primaries, then secondaries. When
 	 * selecting a metaslab to allocate from, an allocator first tries its
 	 * primary, then secondary active metaslab. If it doesn't have active
 	 * metaslabs, or can't allocate from them, it searches for an inactive
 	 * metaslab to activate. If it can't find a suitable one, it will steal
 	 * a primary or secondary metaslab from another allocator.
 	 *
 	 * Rank: 0 = inactive (no allocator), 1 = primary, 2 = secondary.
 	 */
 	int sort1 = (m1->ms_allocator == -1) ? 0 : (m1->ms_primary ? 1 : 2);
 	int sort2 = (m2->ms_allocator == -1) ? 0 : (m2->ms_primary ? 1 : 2);
 
 	if (sort1 != sort2)
 		return (sort1 < sort2 ? -1 : 1);
 
 	/* Arguments are swapped so that heavier metaslabs sort first. */
 	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
 	if (likely(cmp))
 		return (cmp);
 
 	/* Two distinct metaslabs never share a start offset. */
 	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 
 	return (TREE_CMP(m1->ms_start, m2->ms_start));
 }
 
 /*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
  */
 /*
  * Update the allocatable flag and the metaslab group's capacity.
  * The allocatable flag is set to true if the group is activated, its free
  * capacity is above the zfs_mg_noalloc_threshold, and its fragmentation
  * value is either unknown (ZFS_FRAG_INVALID) or not greater than
  * zfs_mg_fragmentation_threshold. If a metaslab group
  * transitions from allocatable to non-allocatable or vice versa then the
  * metaslab group's class is updated to reflect the transition.
  *
  * The same call also refreshes mg_initialized and keeps the class's
  * mc_groups count of initialized groups in step with it.
  */
 static void
 metaslab_group_alloc_update(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	boolean_t was_allocatable;
 	boolean_t was_initialized;
 
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 	    SCL_ALLOC);
 
 	mutex_enter(&mg->mg_lock);
 	was_allocatable = mg->mg_allocatable;
 	was_initialized = mg->mg_initialized;
 
 	/*
 	 * Free space as a percentage of the vdev's space; the +1 in the
 	 * divisor keeps this defined when vs_space is 0.
 	 */
 	uint64_t free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 	    (vs->vs_space + 1);
 
 	/* mc_lock is taken inside mg_lock. */
 	mutex_enter(&mc->mc_lock);
 
 	/*
 	 * If the metaslab group was just added then it won't
 	 * have any space until we finish syncing out this txg.
 	 * At that point we will consider it initialized and available
 	 * for allocations.  We also don't consider non-activated
 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
 	 * to be initialized, because they can't be used for allocation.
 	 */
 	mg->mg_initialized = metaslab_group_initialized(mg);
 	if (!was_initialized && mg->mg_initialized) {
 		mc->mc_groups++;
 	} else if (was_initialized && !mg->mg_initialized) {
 		ASSERT3U(mc->mc_groups, >, 0);
 		mc->mc_groups--;
 	}
 	if (mg->mg_initialized)
 		mg->mg_no_free_space = B_FALSE;
 
 	/*
 	 * A metaslab group is considered allocatable if it has plenty
 	 * of free space and is not heavily fragmented. We only take
 	 * fragmentation into account if the metaslab group has a valid
 	 * fragmentation metric (i.e. a value between 0 and 100).
 	 */
 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 	    free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 
 	/*
 	 * The mc_alloc_groups maintains a count of the number of
 	 * groups in this metaslab class that are still above the
 	 * zfs_mg_noalloc_threshold. This is used by the allocating
 	 * threads to determine if they should avoid allocations to
 	 * a given group. The allocator will avoid allocations to a group
 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
 	 * and there are still other groups that are above the threshold.
 	 * When a group transitions from allocatable to non-allocatable or
 	 * vice versa we update the metaslab class to reflect that change.
 	 * When the mc_alloc_groups value drops to 0 that means that all
 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
 	 * eligible for allocations. This effectively means that all devices
 	 * are balanced again.
 	 */
 	if (was_allocatable && !mg->mg_allocatable)
 		mc->mc_alloc_groups--;
 	else if (!was_allocatable && mg->mg_allocatable)
 		mc->mc_alloc_groups++;
 	mutex_exit(&mc->mc_lock);
 
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Comparator ordering metaslabs by ascending ms_unflushed_txg, with ties
  * broken first by the id of the owning vdev and then by the metaslab id.
  */
 int
 metaslab_sort_by_flushed(const void *va, const void *vb)
 {
 	const metaslab_t *lhs = va;
 	const metaslab_t *rhs = vb;
 
 	int order = TREE_CMP(lhs->ms_unflushed_txg, rhs->ms_unflushed_txg);
 	if (likely(order))
 		return (order);
 
 	uint64_t lhs_vdev = lhs->ms_group->mg_vd->vdev_id;
 	uint64_t rhs_vdev = rhs->ms_group->mg_vd->vdev_id;
 	order = TREE_CMP(lhs_vdev, rhs_vdev);
 	if (order != 0)
 		return (order);
 
 	return (TREE_CMP(lhs->ms_id, rhs->ms_id));
 }
 
 /*
  * Allocate and initialize a metaslab group for top-level vdev `vd` in class
  * `mc`.  The group starts out not activated, not initialized and flagged
  * as having no free space.  It carries one metaslab_group_allocator_t per
  * spa allocator.  Release it with metaslab_group_destroy().
  */
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
 	spa_t *spa = mc->mc_spa;
 	const size_t mg_size = offsetof(metaslab_group_t,
 	    mg_allocator[spa->spa_alloc_count]);
 	metaslab_group_t *mg = kmem_zalloc(mg_size, KM_SLEEP);
 
 	/* Synchronization primitives and the weight-sorted metaslab tree. */
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
 
 	mg->mg_vd = vd;
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
 	mg->mg_initialized = B_FALSE;
 	mg->mg_no_free_space = B_TRUE;
 
 	for (int a = 0; a < spa->spa_alloc_count; a++)
 		zfs_refcount_create_tracked(&mg->mg_allocator[a].mga_queue_depth);
 
 	return (mg);
 }
 
 /*
  * Free a group created by metaslab_group_create().  The group must already
  * be off the class's ring (mg_prev and mg_next NULL) and not activated.
  */
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_class->mc_spa;
 	const size_t mg_size = offsetof(metaslab_group_t,
 	    mg_allocator[spa->spa_alloc_count]);
 
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	/*
 	 * We may have gone below zero with the activation count
 	 * either because we never activated in the first place or
 	 * because we're done, and possibly removing the vdev.
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
 	mutex_destroy(&mg->mg_ms_disabled_lock);
 	cv_destroy(&mg->mg_ms_disabled_cv);
 
 	for (int a = 0; a < spa->spa_alloc_count; a++)
 		zfs_refcount_destroy(&mg->mg_allocator[a].mga_queue_depth);
 
 	kmem_free(mg, mg_size);
 }
 
 /*
  * Bump the group's activation count and, once it becomes positive, link
  * the group into the class's circular list of groups, point the allocator
  * rotors at it and rebalance the class.  Requires SCL_ALLOC held as writer.
  */
 void
 metaslab_group_activate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
 
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	ASSERT(mg->mg_activation_count <= 0);
 
 	/* Still not positive: the group stays off the ring. */
 	mg->mg_activation_count++;
 	if (mg->mg_activation_count <= 0)
 		return;
 
 	metaslab_group_alloc_update(mg);
 
 	metaslab_group_t *mgprev = mc->mc_allocator[0].mca_rotor;
 	if (mgprev != NULL) {
 		/* Splice in right after allocator 0's current rotor. */
 		metaslab_group_t *mgnext = mgprev->mg_next;
 
 		mg->mg_prev = mgprev;
 		mg->mg_next = mgnext;
 		mgprev->mg_next = mg;
 		mgnext->mg_prev = mg;
 	} else {
 		/* First group in the class: a ring of one. */
 		mg->mg_prev = mg;
 		mg->mg_next = mg;
 	}
 
 	/*
 	 * Allocator 0 starts at the new group; each further allocator starts
 	 * one group further along the ring.
 	 */
 	metaslab_group_t *cursor = mg;
 	for (int a = 0; a < spa->spa_alloc_count; a++) {
 		mc->mc_allocator[a].mca_rotor = cursor;
 		cursor = cursor->mg_next;
 	}
 
 	metaslab_class_balance(mc, B_FALSE);
 }
 
 /*
  * Passivate a metaslab group and remove it from the allocation rotor.
  * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
  * a metaslab group. This function will momentarily drop spa_config_locks
  * that are lower than the SCL_ALLOC lock (see comment below).
  *
  * The activation count is decremented first; the group is only unlinked
  * when the count reaches exactly zero.  Otherwise the group was never on
  * the ring and the function just asserts that and returns.
  */
 void
 metaslab_group_passivate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	metaslab_group_t *mgprev, *mgnext;
 	/* Locks held as writer on entry; used to drop and retake below. */
 	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 	    (SCL_ALLOC | SCL_ZIO));
 
 	if (--mg->mg_activation_count != 0) {
 		for (int i = 0; i < spa->spa_alloc_count; i++)
 			ASSERT(mc->mc_allocator[i].mca_rotor != mg);
 		ASSERT(mg->mg_prev == NULL);
 		ASSERT(mg->mg_next == NULL);
 		ASSERT(mg->mg_activation_count < 0);
 		return;
 	}
 
 	/*
 	 * The spa_config_lock is an array of rwlocks, ordered as
 	 * follows (from highest to lowest):
 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
 	 * (For more information about the spa_config_lock see spa_misc.c)
 	 * The higher the lock, the broader its coverage. When we passivate
 	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
 	 * config locks. However, the metaslab group's taskq might be trying
 	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
 	 * lower locks to allow the I/O to complete. At a minimum,
 	 * we continue to hold the SCL_ALLOC lock, which prevents any future
 	 * allocations from taking place and any changes to the vdev tree.
 	 */
 	/*
 	 * NOTE(review): ~(SCL_ZIO - 1) clears every bit below SCL_ZIO's bit,
 	 * which presumably selects SCL_ZIO and the locks ordered below it;
 	 * confirm against the SCL_* definitions.
 	 */
 	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
 	taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
 	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
 	metaslab_group_alloc_update(mg);
 	/* Passivate each allocator's primary and secondary metaslab. */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
 		metaslab_t *msp = mga->mga_primary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 		msp = mga->mga_secondary;
 		if (msp != NULL) {
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
 	/* Unlink from the ring; a ring of one leaves the rotors NULL. */
 	if (mg == mgnext) {
 		mgnext = NULL;
 	} else {
 		mgprev->mg_next = mgnext;
 		mgnext->mg_prev = mgprev;
 	}
 	/* Move any rotor still pointing at this group to its successor. */
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		if (mc->mc_allocator[i].mca_rotor == mg)
 			mc->mc_allocator[i].mca_rotor = mgnext;
 	}
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
 	metaslab_class_balance(mc, B_FALSE);
 }
 
 /*
  * A group counts as initialized once its vdev reports non-zero space and
  * the group has a positive activation count.
  */
 boolean_t
 metaslab_group_initialized(metaslab_group_t *mg)
 {
 	const vdev_stat_t *vs = &mg->mg_vd->vdev_stat;
 
 	if (vs->vs_space == 0)
 		return (B_FALSE);
 
 	return (mg->mg_activation_count > 0);
 }
 
 /*
  * Return the space covered by the group: the metaslab size of its vdev
  * multiplied by the number of metaslabs currently in mg_metaslab_tree.
  */
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
 	uint64_t ms_count;
 
 	/*
 	 * Note that the number of nodes in mg_metaslab_tree may be one less
 	 * than vdev_ms_count, due to the embedded log metaslab.
 	 */
 	mutex_enter(&mg->mg_lock);
 	ms_count = avl_numnodes(&mg->mg_metaslab_tree);
 	mutex_exit(&mg->mg_lock);
 
 	uint64_t ms_size = 1ULL << mg->mg_vd->vdev_ms_shift;
 	return (ms_size * ms_count);
 }
 
 /*
  * When ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags, rebuild the group
  * histogram from the space map histograms of the group's metaslabs and
  * VERIFY that it matches mg_histogram.  Otherwise do nothing.
  */
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 	const size_t hist_bytes =
 	    sizeof (uint64_t) * ZFS_RANGE_TREE_HISTOGRAM_SIZE;
 	uint64_t *mg_hist;
 
 	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
 		return;
 
 	mg_hist = kmem_zalloc(hist_bytes, KM_SLEEP);
 
 	ASSERT3U(ZFS_RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
 
 	mutex_enter(&mg->mg_lock);
 	for (metaslab_t *msp = avl_first(t);
 	    msp != NULL; msp = AVL_NEXT(t, msp)) {
 		VERIFY3P(msp->ms_group, ==, mg);
 
 		/* Metaslabs without a space map contribute nothing. */
 		if (msp->ms_sm == NULL)
 			continue;
 
 		/* Space map bucket i lands in group bucket i + ashift. */
 		for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 			mg_hist[b + ashift] +=
 			    msp->ms_sm->sm_phys->smp_histogram[b];
 		}
 	}
 
 	for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++)
 		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
 	mutex_exit(&mg->mg_lock);
 
 	kmem_free(mg_hist, hist_bytes);
 }
 
 /*
  * Add a metaslab's space map histogram to both its group's and its class's
  * histogram, shifting the bucket index by the vdev's ashift.  The caller
  * holds msp->ms_lock.  Nothing is done for a metaslab without a space map.
  */
 static void
 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	/* Group lock first, then class lock. */
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[i];
 
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 		mg->mg_histogram[i + ashift] += count;
 		mc->mc_histogram[i + ashift] += count;
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Subtract a metaslab's space map histogram from both its group's and its
  * class's histogram (the inverse of metaslab_group_histogram_add()).  The
  * caller holds msp->ms_lock.  Nothing is done for a metaslab without a
  * space map.
  */
 void
 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	/* Group lock first, then class lock. */
 	mutex_enter(&mg->mg_lock);
 	mutex_enter(&mc->mc_lock);
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		/* Neither bucket may go negative. */
 		ASSERT3U(mg->mg_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		ASSERT3U(mc->mc_histogram[i + ashift], >=,
 		    msp->ms_sm->sm_phys->smp_histogram[i]);
 		IMPLY(mg == mg->mg_vd->vdev_log_mg,
 		    mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
 
 		uint64_t count = msp->ms_sm->sm_phys->smp_histogram[i];
 		mg->mg_histogram[i + ashift] -= count;
 		mc->mc_histogram[i + ashift] -= count;
 	}
 	mutex_exit(&mc->mc_lock);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Attach a metaslab that belongs to no group to `mg`: it enters the group's
  * tree with weight 0 and its space map histogram is then added to the
  * group and class histograms.
  */
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(msp->ms_group == NULL);
 
 	/* Insert into the sorted tree under the group lock. */
 	mutex_enter(&mg->mg_lock);
 	msp->ms_group = mg;
 	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 
 	/* Histogram accounting requires the metaslab lock instead. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_add(mg, msp);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Detach a metaslab from `mg`: its histogram is subtracted from the group
  * and class, it is removed from the group's tree and, if linked, from the
  * class's mc_metaslab_txg_list, and its ms_group is cleared.
  */
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_remove(mg, msp);
 	mutex_exit(&msp->ms_lock);
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 
 	/* Drop it from the class-wide txg list if it is still on it. */
 	multilist_t *txg_list =
 	    &msp->ms_group->mg_class->mc_metaslab_txg_list;
 	multilist_sublist_t *mls = multilist_sublist_lock_obj(txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(mls, msp);
 	multilist_sublist_unlock(mls);
 
 	msp->ms_group = NULL;
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Give `msp` a new weight and reposition it in the group's tree.  The
  * caller holds both msp->ms_lock and mg->mg_lock.
  */
 static void
 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(MUTEX_HELD(&mg->mg_lock));
 	ASSERT(msp->ms_group == mg);
 
 	/*
 	 * The weight is part of the tree's sort key, so the node must be out
 	 * of the tree while the weight changes.
 	 */
 	avl_remove(tree, msp);
 	msp->ms_weight = weight;
 	avl_add(tree, msp);
 }
 
 /*
  * Locking wrapper around metaslab_group_sort_impl(): takes mg->mg_lock for
  * the duration of the re-sort.  The caller holds msp->ms_lock.
  */
 static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * Weights are either 0 or at least SPA_MINBLOCKSIZE; values in
 	 * between ([1, 511]) are not used in practice.
 	 */
 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	kmutex_t *lock = &mg->mg_lock;
 
 	mutex_enter(lock);
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(lock);
 }
 
 /*
  * Compute the fragmentation of a metaslab group as the average of its
  * metaslabs' fragmentation, weighted by each metaslab's free space.  The
  * result is between 0 and 100 (inclusive).  ZFS_FRAG_INVALID is returned
  * when fewer than half of the group's metaslabs have a valid metric, or
  * when the metaslabs that do have one hold no free space.
  */
 uint64_t
 metaslab_group_fragmentation(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t fragmentation = 0;
 	uint64_t total_free = 0;
 	uint64_t total_ms = 0;
 	uint64_t valid_ms = 0;
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		/* Metaslabs of the vdev's other groups are not ours. */
 		if (msp->ms_group != mg)
 			continue;
 		total_ms++;
 
 		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
 			continue;
 		valid_ms++;
 
 		/* Count free space in minimum-size blocks to avoid overflow. */
 		uint64_t ms_free =
 		    (msp->ms_size - metaslab_allocated_space(msp)) /
 		    SPA_MINBLOCKSIZE;
 		total_free += ms_free;
 		fragmentation += msp->ms_fragmentation * ms_free;
 	}
 
 	if (total_free == 0 || valid_ms < (total_ms + 1) / 2)
 		return (ZFS_FRAG_INVALID);
 
 	fragmentation /= total_free;
 	ASSERT3U(fragmentation, <=, 100);
 	return (fragmentation);
 }
 
 /*
  * ==========================================================================
  * Range tree callbacks
  * ==========================================================================
  */
 
 /*
  * Ordering function for the private size-sorted tree when segments use
  * 32-bit offsets.  Smaller segments sort first (largest at the end of the
  * tree); segments of equal size are ordered by start offset.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize32_compare(const void *x1, const void *x2)
 {
 	const zfs_range_seg32_t *seg_a = x1;
 	const zfs_range_seg32_t *seg_b = x2;
 
 	uint64_t len_a = seg_a->rs_end - seg_a->rs_start;
 	uint64_t len_b = seg_b->rs_end - seg_b->rs_start;
 
 	int by_size = TREE_CMP(len_a, len_b);
 	int by_start = TREE_CMP(seg_a->rs_start, seg_b->rs_start);
 
 	/* Branch-free: by_start only matters when the sizes are equal. */
 	return (by_size + !by_size * by_start);
 }
 
 /*
  * Ordering function for the private size-sorted tree when segments use
  * 64-bit offsets.  Smaller segments sort first (largest at the end of the
  * tree); segments of equal size are ordered by start offset.
  */
 __attribute__((always_inline)) inline
 static int
 metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
 	const zfs_range_seg64_t *seg_a = x1;
 	const zfs_range_seg64_t *seg_b = x2;
 
 	uint64_t len_a = seg_a->rs_end - seg_a->rs_start;
 	uint64_t len_b = seg_b->rs_end - seg_b->rs_start;
 
 	int by_size = TREE_CMP(len_a, len_b);
 	int by_start = TREE_CMP(seg_a->rs_start, seg_b->rs_start);
 
 	/* Branch-free: by_start only matters when the sizes are equal. */
 	return (by_size + !by_size * by_start);
 }
 
 /*
  * Argument handed to the metaslab_rt_* range tree callbacks.
  */
 typedef struct metaslab_rt_arg {
 	/* Size-sorted B-tree that the callbacks add to and remove from. */
 	zfs_btree_t *mra_bt;
 	/* Segments shorter than (1 << mra_floor_shift) are not tracked. */
 	uint32_t mra_floor_shift;
 } metaslab_rt_arg_t;
 
 /*
  * Context passed through zfs_range_tree_walk() to
  * metaslab_size_sorted_add().
  */
 struct mssa_arg {
 	/* Range tree the walked segments belong to. */
 	zfs_range_tree_t *rt;
 	/* Callback argument forwarded to metaslab_rt_add(). */
 	metaslab_rt_arg_t *mra;
 };
 
 /*
  * zfs_range_tree_walk() callback: offer the range [start, start + size)
  * to the size-sorted tree through metaslab_rt_add().
  */
 static void
 metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
 {
 	struct mssa_arg *walk = arg;
 	zfs_range_seg_max_t tmp_seg = {0};
 
 	/* Describe the range as a temporary, zero-initialized segment. */
 	zfs_rs_set_start(&tmp_seg, walk->rt, start);
 	zfs_rs_set_end(&tmp_seg, walk->rt, start + size);
 
 	metaslab_rt_add(walk->rt, &tmp_seg, walk->mra);
 }
 
 /*
  * Populate the (empty) size-sorted tree attached to rt with the segments
  * of rt, after dropping the size floor to zero.
  */
 static void
 metaslab_size_tree_full_load(zfs_range_tree_t *rt)
 {
 	metaslab_rt_arg_t *mrap = rt->rt_arg;
 	struct mssa_arg walk_arg = {
 		.rt = rt,
 		.mra = mrap,
 	};
 
 	METASLABSTAT_BUMP(metaslabstat_reload_tree);
 	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
 
 	/* With a shift of 0 metaslab_rt_add() only skips empty segments. */
 	mrap->mra_floor_shift = 0;
 	zfs_range_tree_walk(rt, metaslab_size_sorted_add, &walk_arg);
 }
 
 
 /*
  * Invoke ZFS_BTREE_FIND_IN_BUF_FUNC once per segment width, pairing each
  * segment type with its size comparator.  The first argument of each
  * invocation is the name that metaslab_rt_create() later passes to
  * zfs_btree_create() as the find-in-buffer function.
  * NOTE(review): presumably the macro defines a function of that name;
  * confirm against the macro definition.
  */
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
     zfs_range_seg32_t, metaslab_rangesize32_compare)
 
 ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
     zfs_range_seg64_t, metaslab_rangesize64_compare)
 
 /*
  * Create any block allocator specific components. The current allocators
  * rely on using both a size-ordered zfs_range_tree_t and an array of
  * uint64_t's.
  */
 static void
 metaslab_rt_create(zfs_range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	int (*cmp_fn) (const void *, const void *);
 	bt_find_in_buf_f find_fn;
 	size_t elem_size;
 
 	/* Element size, comparator and finder all follow the segment type. */
 	if (rt->rt_type == ZFS_RANGE_SEG32) {
 		elem_size = sizeof (zfs_range_seg32_t);
 		cmp_fn = metaslab_rangesize32_compare;
 		find_fn = metaslab_rt_find_rangesize32_in_buf;
 	} else if (rt->rt_type == ZFS_RANGE_SEG64) {
 		elem_size = sizeof (zfs_range_seg64_t);
 		cmp_fn = metaslab_rangesize64_compare;
 		find_fn = metaslab_rt_find_rangesize64_in_buf;
 	} else {
 		panic("Invalid range seg type %d", rt->rt_type);
 	}
 
 	zfs_btree_create(mrap->mra_bt, cmp_fn, find_fn, elem_size);
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 }
 
 /*
  * Range tree destroy callback: tear down the size-sorted tree and free
  * the callback argument itself.
  */
 static void
 metaslab_rt_destroy(zfs_range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 
 	(void) rt;
 
 	zfs_btree_destroy(mrap->mra_bt);
 	kmem_free(mrap, sizeof (*mrap));
 }
 
 /*
  * Range tree add callback: mirror the segment into the size-sorted tree
  * unless it is shorter than the current size floor.
  */
 static void
 metaslab_rt_add(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	uint64_t seg_len = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
 	uint64_t floor_len = 1ULL << mrap->mra_floor_shift;
 
 	if (seg_len >= floor_len)
 		zfs_btree_add(mrap->mra_bt, rs);
 }
 
 /*
  * Range tree remove callback: drop the segment from the size-sorted tree.
  * Segments shorter than the current size floor are left alone, matching
  * the filter applied by metaslab_rt_add().
  */
 static void
 metaslab_rt_remove(zfs_range_tree_t *rt, zfs_range_seg_t *rs, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 	uint64_t seg_len = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt);
 	uint64_t floor_len = 1ULL << mrap->mra_floor_shift;
 
 	if (seg_len >= floor_len)
 		zfs_btree_remove(mrap->mra_bt, rs);
 }
 
 /*
  * Range tree vacate callback: empty and destroy the size-sorted tree,
  * then set it up again through metaslab_rt_create().
  */
 static void
 metaslab_rt_vacate(zfs_range_tree_t *rt, void *arg)
 {
 	metaslab_rt_arg_t *mrap = arg;
 
 	zfs_btree_clear(mrap->mra_bt);
 	zfs_btree_destroy(mrap->mra_bt);
 
 	metaslab_rt_create(rt, mrap);
 }
 
 /*
  * Callback table built from the metaslab_rt_* functions above.
  * metaslab_load_impl() installs it as rt_ops on ms_allocatable.
  */
 static const zfs_range_tree_ops_t metaslab_rt_ops = {
 	.rtop_create = metaslab_rt_create,
 	.rtop_destroy = metaslab_rt_destroy,
 	.rtop_add = metaslab_rt_add,
 	.rtop_remove = metaslab_rt_remove,
 	.rtop_vacate = metaslab_rt_vacate
 };
 
 /*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 
 /*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
 metaslab_largest_allocatable(metaslab_t *msp)
 {
 	/*
 	 * by_size is the address of a member embedded in *msp, so it can
 	 * never be NULL and needs no check.
 	 */
 	zfs_btree_t *by_size = &msp->ms_allocatable_by_size;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 
 	/* An empty size tree is repopulated from the range tree first. */
 	if (zfs_btree_numnodes(by_size) == 0)
 		metaslab_size_tree_full_load(rt);
 
 	/* The size tree keeps larger segments at its end. */
 	zfs_range_seg_t *largest = zfs_btree_last(by_size, NULL);
 	if (largest == NULL)
 		return (0);
 
 	return (zfs_rs_get_end(largest, rt) - zfs_rs_get_start(largest, rt));
 }
 
 /*
  * Return the maximum contiguous segment within the unflushed frees of this
  * metaslab.
  */
 static uint64_t
 metaslab_largest_unflushed_free(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	zfs_range_tree_t *frees = msp->ms_unflushed_frees;
 	if (frees == NULL)
 		return (0);
 
 	zfs_btree_t *by_size = &msp->ms_unflushed_frees_by_size;
 	if (zfs_btree_numnodes(by_size) == 0)
 		metaslab_size_tree_full_load(frees);
 
 	zfs_range_seg_t *largest = zfs_btree_last(by_size, NULL);
 	if (largest == NULL)
 		return (0);
 
 	/*
 	 * A range freed from the metaslab is added to both the unflushed
 	 * frees and the deferred frees.  The block becomes usable
 	 * eventually, but if the metaslab were loaded the range would not
 	 * reach the ms_allocatable tree until TXG_DEFER_SIZE txgs had
 	 * passed.  An upper bound on the largest currently-usable free
 	 * segment therefore must not count ranges that are still in the
 	 * defer trees.  The approximation used here takes the largest range
 	 * in the unflushed_frees tree and keeps only its first chunk.  The
 	 * estimate may be poor, but only briefly: it self-corrects once the
 	 * frees are no longer deferred.  The same reasoning applies to the
 	 * ms_freed tree.  See metaslab_load() for more details.
 	 *
 	 * Two sources of inaccuracy are accepted for performance reasons:
 	 *  - only the largest segment is checked for overlaps, although a
 	 *    smaller segment could overlap the other trees more favorably
 	 *    and leave a larger usable chunk;
 	 *  - only the first chunk of the largest segment is considered,
 	 *    although other usable chunks may exist within it.
 	 */
 	uint64_t rstart = zfs_rs_get_start(largest, frees);
 	uint64_t rsize = zfs_rs_get_end(largest, frees) - rstart;
 	uint64_t hit_start, hit_size;
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		hit_start = 0;
 		hit_size = 0;
 		if (!zfs_range_tree_find_in(msp->ms_defer[t], rstart, rsize,
 		    &hit_start, &hit_size))
 			continue;
 
 		/* The deferred range starts the segment: nothing usable. */
 		if (hit_start == rstart)
 			return (0);
 		rsize = hit_start - rstart;
 	}
 
 	hit_start = 0;
 	hit_size = 0;
 	if (zfs_range_tree_find_in(msp->ms_freed, rstart, rsize,
 	    &hit_start, &hit_size))
 		rsize = hit_start - rstart;
 
 	return (rsize);
 }
 
 /*
  * Build a search key that begins at start and look it up in t with
  * zfs_btree_find().  If there is no matching entry, fall back to a
  * neighbouring entry reached through *where.  May return NULL.
  */
 static zfs_range_seg_t *
 metaslab_block_find(zfs_btree_t *t, zfs_range_tree_t *rt, uint64_t start,
-    uint64_t size, zfs_btree_index_t *where)
+    uint64_t size, uint64_t max_size, zfs_btree_index_t *where)
 {
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch;
 
 	zfs_rs_set_start(&rsearch, rt, start);
-	zfs_rs_set_end(&rsearch, rt, start + size);
+	zfs_rs_set_end(&rsearch, rt, start + max_size);
 
 	rs = zfs_btree_find(t, &rsearch, where);
 	if (rs == NULL) {
-		rs = zfs_btree_next(t, where, where);
+		if (size == max_size) {
+			rs = zfs_btree_next(t, where, where);
+		} else {
+			/*
+			 * If we're searching for a range, get the largest
+			 * segment in that range, or the smallest one bigger
+			 * than it.
+			 */
+			rs = zfs_btree_prev(t, where, where);
+			if (rs == NULL || zfs_rs_get_end(rs, rt) -
+			    zfs_rs_get_start(rs, rt) < size) {
+				rs = zfs_btree_next(t, where, where);
+			}
+		}
 	}
 
 	return (rs);
 }
 
 /*
  * This is a helper function that can be used by the allocator to find a
  * suitable block to allocate. This will search the specified B-tree looking
  * for a block that matches the specified criteria.
  */
 static uint64_t
 metaslab_block_picker(zfs_range_tree_t *rt, uint64_t *cursor, uint64_t size,
-    uint64_t max_search)
+    uint64_t max_size, uint64_t max_search, uint64_t *found_size)
 {
 	/* A zero cursor means: start at the beginning of the range tree. */
 	if (*cursor == 0)
 		*cursor = rt->rt_start;
 	zfs_btree_t *bt = &rt->rt_root;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size,
-	    &where);
+	    max_size, &where);
 	uint64_t first_found;
 	int count_searched = 0;
 
 	/* first_found is only read by the loop below, which needs rs != NULL */
 	if (rs != NULL)
 		first_found = zfs_rs_get_start(rs, rt);
 
 	/*
 	 * Walk forward while the segment starts within max_search of the
 	 * first candidate, or while fewer than metaslab_min_search_count
 	 * segments have been examined.
 	 */
 	while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <=
 	    max_search || count_searched < metaslab_min_search_count)) {
 		uint64_t offset = zfs_rs_get_start(rs, rt);
 		/* Take the first segment that can hold at least size. */
 		if (offset + size <= zfs_rs_get_end(rs, rt)) {
-			*cursor = offset + size;
+			*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
+			    max_size);
+			*cursor = offset + *found_size;
 			return (offset);
 		}
 		rs = zfs_btree_next(bt, &where, &where);
 		count_searched++;
 	}
 
 	/* Nothing suitable: reset the cursor and report failure (-1ULL). */
 	*cursor = 0;
+	*found_size = 0;
 	return (-1ULL);
 }
 
 /*
  * Forward declarations of the allocator entry points and of
  * metaslab_allocator(); the entry points are referenced by the
  * metaslab_allocators[] table below before they are defined.
  */
-static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size);
-static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size);
-static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size);
+static uint64_t metaslab_df_alloc(metaslab_t *msp, uint64_t size,
+    uint64_t max_size, uint64_t *found_size);
+static uint64_t metaslab_cf_alloc(metaslab_t *msp, uint64_t size,
+    uint64_t max_size, uint64_t *found_size);
+static uint64_t metaslab_ndf_alloc(metaslab_t *msp, uint64_t size,
+    uint64_t max_size, uint64_t *found_size);
 metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 /*
  * Table of selectable allocators as { name, allocation function } pairs.
  * The index into this table is what spa_set_allocator() stores in
  * spa_active_allocator.
  */
 static metaslab_ops_t metaslab_allocators[] = {
 	{ "dynamic", metaslab_df_alloc },
 	{ "cursor", metaslab_cf_alloc },
 	{ "new-dynamic", metaslab_ndf_alloc },
 };
 
 static int
 spa_find_allocator_byname(const char *val)
 {
 	int a = ARRAY_SIZE(metaslab_allocators) - 1;
 	if (strcmp("new-dynamic", val) == 0)
 		return (-1); /* remove when ndf is working */
 	for (; a >= 0; a--) {
 		if (strcmp(val, metaslab_allocators[a].msop_name) == 0)
 			return (a);
 	}
 	return (-1);
 }
 
 /*
  * Select the allocator called "allocator" for this spa and log the name
  * of the allocator that ends up active.
  */
 void
 spa_set_allocator(spa_t *spa, const char *allocator)
 {
 	int idx = spa_find_allocator_byname(allocator);
 
 	/* Names that are not found fall back to the first table entry. */
 	if (idx < 0)
 		idx = 0;
 
 	spa->spa_active_allocator = idx;
 	zfs_dbgmsg("spa allocator: %s", metaslab_allocators[idx].msop_name);
 }
 
 /*
  * Return the allocator index stored in the spa by spa_set_allocator().
  */
 int
 spa_get_allocator(spa_t *spa)
 {
 	int active = spa->spa_active_allocator;
 
 	return (active);
 }
 
 #if defined(_KERNEL)
 /*
  * Set zfs_active_allocator from the allocator name in val.  Returns 0 on
  * success and EINVAL (through SET_ERROR) for a NULL or unknown name.
  */
 int
 param_set_active_allocator_common(const char *val)
 {
 	if (val == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* Cut the string at its first newline, if it has one. */
 	char *newline = strchr(val, '\n');
 	if (newline != NULL)
 		*newline = '\0';
 
 	int idx = spa_find_allocator_byname(val);
 	if (idx < 0)
 		return (SET_ERROR(EINVAL));
 
 	zfs_active_allocator = metaslab_allocators[idx].msop_name;
 	return (0);
 }
 #endif
 
 /*
  * Return the metaslab_allocators[] entry selected for this spa.
  */
 metaslab_ops_t *
 metaslab_allocator(spa_t *spa)
 {
 	return (&metaslab_allocators[spa_get_allocator(spa)]);
 }
 
 /*
  * ==========================================================================
  * Dynamic Fit (df) block allocator
  *
  * Search for a free chunk of at least this size, starting from the last
  * offset (for this alignment of block) looking for up to
  * metaslab_df_max_search bytes (16MB).  If a large enough free chunk is not
  * found within 16MB, then return a free chunk of exactly the requested size (or
  * larger).
  *
  * If it seems like searching from the last offset will be unproductive, skip
  * that and just return a free chunk of exactly the requested size (or larger).
  * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct.  This
  * mechanism is probably not very useful and may be removed in the future.
  *
  * The behavior when not searching can be changed to return the largest free
  * chunk, instead of a free chunk of exactly the requested size, by setting
  * metaslab_df_use_largest_segment.
  * ==========================================================================
  */
 static uint64_t
-metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t *found_size)
 {
 	/*
 	 * Find the largest power of 2 block size that evenly divides the
 	 * requested size. This is used to try to allocate blocks with similar
 	 * alignment from the same area of the metaslab (i.e. same cursor
 	 * bucket) but it does not guarantee that other allocations sizes
 	 * may exist in the same region.
 	 */
-	uint64_t align = size & -size;
+	uint64_t align = max_size & -max_size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	/* Free space in the allocatable tree as a percentage of ms_size. */
 	uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;
 	uint64_t offset;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * If we're running low on space, find a segment based on size,
 	 * rather than iterating based on offset.
 	 */
 	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
+		align = size & -size;
+		cursor = &msp->ms_lbas[highbit64(align) - 1];
 		offset = -1;
 	} else {
-		offset = metaslab_block_picker(rt,
-		    cursor, size, metaslab_df_max_search);
+		offset = metaslab_block_picker(rt, cursor, size, max_size,
+		    metaslab_df_max_search, found_size);
+		if (max_size != size && offset == -1) {
+			align = size & -size;
+			cursor = &msp->ms_lbas[highbit64(align) - 1];
+			offset = metaslab_block_picker(rt, cursor, size,
+			    max_size, metaslab_df_max_search, found_size);
+		}
 	}
 
 	/* No offset chosen yet: pick a segment from the size-sorted tree. */
 	if (offset == -1) {
 		zfs_range_seg_t *rs;
 		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
 			metaslab_size_tree_full_load(msp->ms_allocatable);
 
 		if (metaslab_df_use_largest_segment) {
 			/* use largest free segment */
 			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
 		} else {
 			zfs_btree_index_t where;
 			/* use segment of this size, or next largest */
 			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
-			    rt, msp->ms_start, size, &where);
+			    rt, msp->ms_start, size, max_size, &where);
 		}
 		/* Accept the segment only if it can hold at least size. */
 		if (rs != NULL && zfs_rs_get_start(rs, rt) + size <=
 		    zfs_rs_get_end(rs, rt)) {
 			offset = zfs_rs_get_start(rs, rt);
-			*cursor = offset + size;
+			*found_size = MIN(zfs_rs_get_end(rs, rt) - offset,
+			    max_size);
+			*cursor = offset + *found_size;
 		}
 	}
 
 	/* Still -1 here when no segment was accepted. */
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * Cursor fit block allocator -
  * Select the largest region in the metaslab, set the cursor to the beginning
  * of the range and the cursor_end to the end of the range. As allocations
  * are made advance the cursor. Continue allocating from the cursor until
  * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
-metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t *found_size)
 {
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_t *t = &msp->ms_allocatable_by_size;
 	/* ms_lbas[0] and ms_lbas[1] hold the cursor and the cursor end. */
 	uint64_t *cursor = &msp->ms_lbas[0];
 	uint64_t *cursor_end = &msp->ms_lbas[1];
 	uint64_t offset = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(*cursor_end, >=, *cursor);
 
 	/*
 	 * The current [cursor, cursor_end) window cannot hold size: move
 	 * the window to the last (largest) entry of the size-sorted tree,
 	 * or fail if even that entry is too small.
 	 */
 	if ((*cursor + size) > *cursor_end) {
 		zfs_range_seg_t *rs;
 
 		if (zfs_btree_numnodes(t) == 0)
 			metaslab_size_tree_full_load(msp->ms_allocatable);
 		rs = zfs_btree_last(t, NULL);
 		if (rs == NULL || (zfs_rs_get_end(rs, rt) -
 		    zfs_rs_get_start(rs, rt)) < size)
 			return (-1ULL);
 
 		*cursor = zfs_rs_get_start(rs, rt);
 		*cursor_end = zfs_rs_get_end(rs, rt);
 	}
 
 	/* Allocate at the cursor and advance it past the allocation. */
 	offset = *cursor;
-	*cursor += size;
+	*found_size = MIN(*cursor_end - offset, max_size);
+	*cursor = offset + *found_size;
 
 	return (offset);
 }
 
 /*
  * ==========================================================================
  * New dynamic fit allocator -
  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
  * contiguous blocks. If no region is found then just use the largest segment
  * that remains.
  * ==========================================================================
  */
 
 /*
  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
  * to request from the allocator.
  */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
-metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t *found_size)
 {
 	zfs_btree_t *t = &msp->ms_allocatable->rt_root;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	zfs_btree_index_t where;
 	zfs_range_seg_t *rs;
 	zfs_range_seg_max_t rsearch;
-	uint64_t hbit = highbit64(size);
+	uint64_t hbit = highbit64(max_size);
 	/* One cursor per highbit64() bucket of the request size. */
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
-	uint64_t max_size = metaslab_largest_allocatable(msp);
+	uint64_t max_possible_size = metaslab_largest_allocatable(msp);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* Even the largest free segment is too small for this request. */
-	if (max_size < size)
+	if (max_possible_size < size)
 		return (-1ULL);
 
 	/* First try: look up a segment at the cursor in the offset tree. */
 	zfs_rs_set_start(&rsearch, rt, *cursor);
-	zfs_rs_set_end(&rsearch, rt, *cursor + size);
+	zfs_rs_set_end(&rsearch, rt, *cursor + max_size);
 
 	rs = zfs_btree_find(t, &rsearch, &where);
+	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
+	    max_size) {
+		hbit = highbit64(size);
+		cursor = &msp->ms_lbas[hbit - 1];
+		zfs_rs_set_start(&rsearch, rt, *cursor);
+		zfs_rs_set_end(&rsearch, rt, *cursor + size);
+
+		rs = zfs_btree_find(t, &rsearch, &where);
+	}
 	/*
 	 * Nothing usable at the cursor: search the size-sorted tree with a
 	 * key of length MIN(largest free segment,
 	 * 1 << (hbit + metaslab_ndf_clump_shift)).
 	 */
 	if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) <
 	    size) {
 		t = &msp->ms_allocatable_by_size;
 
 		zfs_rs_set_start(&rsearch, rt, 0);
-		zfs_rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
-		    metaslab_ndf_clump_shift)));
+		zfs_rs_set_end(&rsearch, rt, MIN(max_possible_size,
+		    1ULL << (hbit + metaslab_ndf_clump_shift)));
 
 		rs = zfs_btree_find(t, &rsearch, &where);
 		if (rs == NULL)
 			rs = zfs_btree_next(t, &where, &where);
 		ASSERT(rs != NULL);
 	}
 
 	/* Use the segment if it can hold size; advance the cursor past it. */
 	if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) {
-		*cursor = zfs_rs_get_start(rs, rt) + size;
+		*found_size = MIN(zfs_rs_get_end(rs, rt) -
+		    zfs_rs_get_start(rs, rt), max_size);
+		*cursor = zfs_rs_get_start(rs, rt) + *found_size;
 		return (zfs_rs_get_start(rs, rt));
 	}
 	return (-1ULL);
 }
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
 
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
 static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (!msp->ms_loading)
 		return;
 
 	/* Sleep on ms_load_cv until the loading flag has been cleared. */
 	do {
 		ASSERT(!msp->ms_loaded);
 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 	} while (msp->ms_loading);
 }
 
 /*
  * Wait for any in-progress flushing to complete.
  */
 static void
 metaslab_flush_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if (!msp->ms_flushing)
 		return;
 
 	/* Sleep on ms_flush_cv until the flushing flag has been cleared. */
 	do {
 		cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 	} while (msp->ms_flushing);
 }
 
 /*
  * Map a metaslab to a sublist index of multilist ml.
  */
 static unsigned int
 metaslab_idx_func(multilist_t *ml, void *arg)
 {
 	metaslab_t *msp = arg;
 
 	/*
 	 * ms_id values are allocated sequentially, so a full 64-bit
 	 * division would be a waste of time; truncate the id to 32 bits
 	 * before taking the remainder.
 	 */
 	unsigned int id32 = (unsigned int)msp->ms_id;
 
 	return (id32 % multilist_get_num_sublists(ml));
 }
 
 /*
  * Return the metaslab's ms_allocated_space counter.
  */
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
 	uint64_t allocated = msp->ms_allocated_space;
 
 	return (allocated);
 }
 
 /*
  * Verify that the space accounting on disk matches the in-core range_trees.
  */
 static void
 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!msp->ms_condensing);
 
 	if (!(zfs_flags & ZFS_DEBUG_METASLAB_VERIFY))
 		return;
 
 	/*
 	 * The space can only be verified from syncing context, on a loaded
 	 * metaslab that has an allocated space map.  In non-syncing context
 	 * the view of the metaslab is not consistent, because allocations
 	 * are being performed in the future.
 	 */
 	if (!(txg == spa_syncing_txg(spa) && msp->ms_sm != NULL &&
 	    msp->ms_loaded))
 		return;
 
 	/*
 	 * The smp_alloc field can get negative in general, but for a
 	 * metaslab's space map that should never be the case.
 	 */
 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 
 	ASSERT3U(space_map_allocated(msp->ms_sm), >=,
 	    zfs_range_tree_space(msp->ms_unflushed_frees));
 
 	ASSERT3U(metaslab_allocated_space(msp), ==,
 	    space_map_allocated(msp->ms_sm) +
 	    zfs_range_tree_space(msp->ms_unflushed_allocs) -
 	    zfs_range_tree_space(msp->ms_unflushed_frees));
 
 	uint64_t sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
 
 	/*
 	 * Future allocations have already been deducted from
 	 * ms_allocatable, so they have to be added back in.
 	 */
 	uint64_t allocating = 0;
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		zfs_range_tree_t *pending =
 		    msp->ms_allocating[(txg + t) & TXG_MASK];
 
 		allocating += zfs_range_tree_space(pending);
 	}
 	ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
 	    msp->ms_allocating_total);
 
 	ASSERT3U(msp->ms_deferspace, ==,
 	    zfs_range_tree_space(msp->ms_defer[0]) +
 	    zfs_range_tree_space(msp->ms_defer[1]));
 
 	uint64_t msp_free_space = zfs_range_tree_space(msp->ms_allocatable) +
 	    allocating + msp->ms_deferspace +
 	    zfs_range_tree_space(msp->ms_freed);
 
 	VERIFY3U(sm_free_space, ==, msp_free_space);
 }
 
 /*
  * Zero ms_synchist and every ms_deferhist[] entry of the metaslab.
  */
 static void
 metaslab_aux_histograms_clear(metaslab_t *msp)
 {
 	/*
 	 * The auxiliary histograms are only cleared as part of a reset,
 	 * and a reset can only happen while the metaslab is loaded.
 	 */
 	ASSERT(msp->ms_loaded);
 
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 
 	int slot = 0;
 	while (slot < TXG_DEFER_SIZE) {
 		memset(msp->ms_deferhist[slot], 0,
 		    sizeof (msp->ms_deferhist[slot]));
 		slot++;
 	}
 }
 
 static void
 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     zfs_range_tree_t *rt)
 {
 	/*
 	 * Modeled after space_map_histogram_add(); refer to that function
 	 * for implementation details.  The result must behave like the
 	 * space map histogram rather than the range tree histogram,
 	 * because it is effectively a delta that is later subtracted from
 	 * the space map histogram.
 	 */
 	int idx = 0;
 
 	for (int i = shift; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
 		ASSERT3U(i, >=, idx + shift);
 		histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
 
 		/* The last output bucket absorbs all remaining input. */
 		if (idx >= SPACE_MAP_HISTOGRAM_SIZE - 1)
 			continue;
 
 		ASSERT3U(idx + shift, ==, i);
 		idx++;
 		ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
 	}
 }
 
 /*
  * Called at every sync pass that the metaslab gets synced.
  *
  * The reason is that we want our auxiliary histograms to be updated
  * wherever the metaslab's space map histogram is updated. This way
  * we stay consistent on which parts of the metaslab space map's
  * histogram are currently not available for allocations (e.g because
  * they are in the defer, freed, and freeing trees).
  */
 static void
 metaslab_aux_histograms_update(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	ASSERT(sm != NULL);
 
 	/*
 	 * Same idea as the space map histogram updates performed in
 	 * metaslab_sync(), except that only segments which have not made
 	 * it into the ms_allocatable tree yet are of interest.
 	 */
 	if (msp->ms_loaded) {
 		metaslab_aux_histograms_clear(msp);
 
 		metaslab_aux_histogram_add(msp->ms_synchist,
 		    sm->sm_shift, msp->ms_freed);
 
 		int slot = 0;
 		while (slot < TXG_DEFER_SIZE) {
 			metaslab_aux_histogram_add(msp->ms_deferhist[slot],
 			    sm->sm_shift, msp->ms_defer[slot]);
 			slot++;
 		}
 	}
 
 	/* The freeing tree is accounted whether or not we are loaded. */
 	metaslab_aux_histogram_add(msp->ms_synchist,
 	    sm->sm_shift, msp->ms_freeing);
 }
 
 /*
  * Called every time we are done syncing (writing to) the metaslab,
  * i.e. at the end of each sync pass.
  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
  */
 static void
 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/*
 	 * Without a space map we got here from metaslab_init() while
 	 * creating/opening a pool, for a metaslab that has not had any
 	 * allocations yet; there is nothing to do.
 	 */
 	if (msp->ms_sm == NULL)
 		return;
 
 	/*
 	 * Mirrors what metaslab_sync_done() does for the ms_freed and
 	 * ms_defer trees.
 	 */
 	uint64_t slot = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
 
 	if (!defer_allowed) {
 		memset(msp->ms_deferhist[slot], 0,
 		    sizeof (msp->ms_deferhist[slot]));
 	} else {
 		memcpy(msp->ms_deferhist[slot], msp->ms_synchist,
 		    sizeof (msp->ms_synchist));
 	}
 
 	memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist));
 }
 
 /*
  * Ensure that the metaslab's weight and fragmentation are consistent
  * with the contents of the histogram (either the range tree's histogram
  * or the space map's depending whether the metaslab is loaded).
  */
 static void
 metaslab_verify_weight_and_frag(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* Verification is opt-in through the ZFS_DEBUG_METASLAB_VERIFY flag. */
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * We can end up here from vdev_remove_complete(), in which case we
 	 * cannot do these assertions because we hold spa config locks and
 	 * thus we are not allowed to read from the DMU.
 	 *
 	 * We check if the metaslab group has been removed and if that's
 	 * the case we return immediately as that would mean that we are
 	 * here from the aforementioned code path.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	/*
 	 * Devices being removed always return a weight of 0 and leave
 	 * fragmentation and ms_max_size as is - there is nothing for
 	 * us to verify here.
 	 */
 	vdev_t *vd = msp->ms_group->mg_vd;
 	if (vd->vdev_removing)
 		return;
 
 	/*
 	 * If the metaslab is dirty it probably means that we've done
 	 * some allocations or frees that have changed our histograms
 	 * and thus the weight.
 	 */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
 			return;
 	}
 
 	/*
 	 * This verification checks that our in-memory state is consistent
 	 * with what's on disk. If the pool is read-only then there aren't
 	 * any changes and we just have the initially-loaded state.
 	 */
 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
 		return;
 
 	/* some extra verification for in-core tree if you can */
 	if (msp->ms_loaded) {
 		zfs_range_tree_stat_verify(msp->ms_allocatable);
 		VERIFY(space_map_histogram_verify(msp->ms_sm,
 		    msp->ms_allocatable));
 	}
 
 	/*
 	 * Snapshot the current values so they can be compared against the
 	 * recomputed ones (or restored) further down.
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
 	uint64_t frag = msp->ms_fragmentation;
 	uint64_t max_segsize = msp->ms_max_size;
 
 	/* Clear the cached values before recomputing them. */
 	msp->ms_weight = 0;
 	msp->ms_fragmentation = 0;
 
 	/*
 	 * This function is used for verification purposes and thus should
 	 * not introduce any side-effects/mutations on the system's state.
 	 *
 	 * Regardless of whether metaslab_weight() thinks this metaslab
 	 * should be active or not, we want to ensure that the actual weight
 	 * (and therefore the value of ms_weight) would be the same if it
 	 * was to be recalculated at this point.
 	 *
 	 * In addition we set the nodirty flag so metaslab_weight() does
 	 * not dirty the metaslab for future TXGs (e.g. when trying to
 	 * force condensing to upgrade the metaslab spacemaps).
 	 */
 	msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
 
 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
 
 	/*
 	 * If the weight type changed then there is no point in doing
 	 * verification. Revert fields to their original values.
 	 */
 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
 		msp->ms_fragmentation = frag;
 		msp->ms_weight = weight;
 		return;
 	}
 
 	/* Same weight type: the recomputed values must match the snapshot. */
 	VERIFY3U(msp->ms_fragmentation, ==, frag);
 	VERIFY3U(msp->ms_weight, ==, weight);
 }
 
 /*
  * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
  * this class that was used longest ago, and attempt to unload it.  We don't
  * want to spend too much time in this loop to prevent performance
  * degradation, and we expect that most of the time this operation will
  * succeed. Between that and the normal unloading processing during txg sync,
  * we expect this to keep the metaslab memory usage under control.
  */
 static void
 metaslab_potentially_evict(metaslab_class_t *mc)
 {
 #ifdef _KERNEL
 	uint64_t allmem = arc_all_memory();
 	/* Entries in use in the B-tree leaf cache, and the size of one. */
 	uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 	uint64_t size =	spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
 	uint_t tries = 0;
 	/*
 	 * Keep going while inuse * size exceeds zfs_metaslab_mem_limit
 	 * percent of allmem, but give up after twice as many attempts as
 	 * there are sublists.  Each attempt walks one randomly chosen
 	 * sublist.
 	 */
 	for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
 	    tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2;
 	    tries++) {
 		unsigned int idx = multilist_get_random_index(
 		    &mc->mc_metaslab_txg_list);
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
 		metaslab_t *msp = multilist_sublist_head(mls);
 		multilist_sublist_unlock(mls);
 		while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
 		    inuse * size) {
 			/* Re-take the sublist lock that was dropped above. */
 			VERIFY3P(mls, ==, multilist_sublist_lock_idx(
 			    &mc->mc_metaslab_txg_list, idx));
 			ASSERT3U(idx, ==,
 			    metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
 
 			/* msp is no longer linked on the list: stop this walk. */
 			if (!multilist_link_active(&msp->ms_class_txg_node)) {
 				multilist_sublist_unlock(mls);
 				break;
 			}
 			/* Fetch the successor while the sublist is locked. */
 			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
 			multilist_sublist_unlock(mls);
 			/*
 			 * If the metaslab is currently loading there are two
 			 * cases. If it's the metaslab we're evicting, we
 			 * can't continue on or we'll panic when we attempt to
 			 * recursively lock the mutex. If it's another
 			 * metaslab that's loading, it can be safely skipped,
 			 * since we know it's very new and therefore not a
 			 * good eviction candidate. We check later once the
 			 * lock is held that the metaslab is fully loaded
 			 * before actually unloading it.
 			 */
 			if (msp->ms_loading) {
 				msp = next_msp;
 				inuse =
 				    spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 				continue;
 			}
 			/*
 			 * We can't unload metaslabs with no spacemap because
 			 * they're not ready to be unloaded yet. We can't
 			 * unload metaslabs with outstanding allocations
 			 * because doing so could cause the metaslab's weight
 			 * to decrease while it's unloaded, which violates an
 			 * invariant that we use to prevent unnecessary
 			 * loading. We also don't unload metaslabs that are
 			 * currently active because they are high-weight
 			 * metaslabs that are likely to be used in the near
 			 * future.
 			 */
 			mutex_enter(&msp->ms_lock);
 			if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
 			    msp->ms_allocating_total == 0) {
 				metaslab_unload(msp);
 			}
 			mutex_exit(&msp->ms_lock);
 			msp = next_msp;
 			/* Refresh the usage figure for the next loop test. */
 			inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
 		}
 	}
 #else
 	/*
 	 * Nothing is evicted outside the kernel build.
 	 * NOTE(review): the casts presumably just silence unused warnings.
 	 */
 	(void) mc, (void) zfs_metaslab_mem_limit;
 #endif
 }
 
 /*
  * Populate msp->ms_allocatable for a metaslab that the caller has already
  * flagged as loading (ms_loading).
  *
  * The function is entered and left with ms_lock held, but ms_lock is
  * dropped while the space map is read and is re-acquired (after taking
  * ms_sync_lock) before the in-core trees are reconciled. ms_sync_lock is
  * released again before returning, on both the success and error paths.
  *
  * Returns 0 on success, in which case ms_loaded has been set, or the error
  * from space_map_load_length(), in which case ms_loaded is left untouched.
  */
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We temporarily drop the lock to unblock other operations while we
 	 * are reading the space map. Therefore, metaslab_sync() and
 	 * metaslab_sync_done() can run at the same time as we do.
 	 *
 	 * If we are using the log space maps, metaslab_sync() can't write to
 	 * the metaslab's space map while we are loading as we only write to
 	 * it when we are flushing the metaslab, and that can't happen while
 	 * we are loading it.
 	 *
 	 * If we are not using log space maps though, metaslab_sync() can
 	 * append to the space map while we are loading. Therefore we load
 	 * only entries that existed when we started the load. Additionally,
 	 * metaslab_sync_done() has to wait for the load to complete because
 	 * there are potential races like metaslab_load() loading parts of the
 	 * space map that are currently being appended by metaslab_sync(). If
 	 * we didn't, the ms_allocatable would have entries that
 	 * metaslab_sync_done() would try to re-add later.
 	 *
 	 * That's why before dropping the lock we remember the synced length
 	 * of the metaslab and read up to that point of the space map,
 	 * ignoring entries appended by metaslab_sync() that happen after we
 	 * drop the lock.
 	 */
 	uint64_t length = msp->ms_synced_length;
 	mutex_exit(&msp->ms_lock);
 
 	hrtime_t load_start = gethrtime();
 	/*
 	 * Reuse the callback argument if the tree already has one. In that
 	 * case the ops/arg pair is detached here and attached again below,
 	 * once the size-sorted tree has been (re)created.
 	 */
 	metaslab_rt_arg_t *mrap;
 	if (msp->ms_allocatable->rt_arg == NULL) {
 		mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	} else {
 		mrap = msp->ms_allocatable->rt_arg;
 		msp->ms_allocatable->rt_ops = NULL;
 		msp->ms_allocatable->rt_arg = NULL;
 	}
 	mrap->mra_bt = &msp->ms_allocatable_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 
 	if (msp->ms_sm != NULL) {
 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 		    SM_FREE, length);
 
 		/* Now, populate the size-sorted tree. */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 
 		struct mssa_arg arg = {0};
 		arg.rt = msp->ms_allocatable;
 		arg.mra = mrap;
 		zfs_range_tree_walk(msp->ms_allocatable,
 		    metaslab_size_sorted_add, &arg);
 	} else {
 		/*
 		 * Add the size-sorted tree first, since we don't need to load
 		 * the metaslab from the spacemap.
 		 */
 		metaslab_rt_create(msp->ms_allocatable, mrap);
 		msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
 		msp->ms_allocatable->rt_arg = mrap;
 		/*
 		 * The space map has not been allocated yet, so treat
 		 * all the space in the metaslab as free and add it to the
 		 * ms_allocatable tree.
 		 */
 		zfs_range_tree_add(msp->ms_allocatable,
 		    msp->ms_start, msp->ms_size);
 
 		if (msp->ms_new) {
 			/*
 			 * If the ms_sm doesn't exist, this means that this
 			 * metaslab hasn't gone through metaslab_sync() and
 			 * thus has never been dirtied. So we shouldn't
 			 * expect any unflushed allocs or frees from previous
 			 * TXGs.
 			 */
 			ASSERT(zfs_range_tree_is_empty(
 			    msp->ms_unflushed_allocs));
 			ASSERT(zfs_range_tree_is_empty(
 			    msp->ms_unflushed_frees));
 		}
 	}
 
 	/*
 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
 	 * changing the ms_sm (or log_sm) and the metaslab's range trees
 	 * while we are about to use them and populate the ms_allocatable.
 	 * The ms_lock is insufficient for this because metaslab_sync() doesn't
 	 * hold the ms_lock while writing the ms_checkpointing tree to disk.
 	 */
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	ASSERT(!msp->ms_condensing);
 	ASSERT(!msp->ms_flushing);
 
 	/*
 	 * On a failed space map read only ms_sync_lock is dropped; ms_lock
 	 * stays held because the caller expects to get it back.
 	 */
 	if (error != 0) {
 		mutex_exit(&msp->ms_sync_lock);
 		return (error);
 	}
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	msp->ms_loaded = B_TRUE;
 
 	/*
 	 * Apply all the unflushed changes to ms_allocatable right
 	 * away so any manipulations we do below have a clear view
 	 * of what is allocated and what is free.
 	 */
 	zfs_range_tree_walk(msp->ms_unflushed_allocs,
 	    zfs_range_tree_remove, msp->ms_allocatable);
 	zfs_range_tree_walk(msp->ms_unflushed_frees,
 	    zfs_range_tree_add, msp->ms_allocatable);
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	if (spa_syncing_log_sm(spa) != NULL) {
 		ASSERT(spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LOG_SPACEMAP));
 
 		/*
 		 * If we use a log space map we add all the segments
 		 * that are in ms_unflushed_frees so they are available
 		 * for allocation.
 		 *
 		 * ms_allocatable needs to contain all free segments
 		 * that are ready for allocations (thus not segments
 		 * from ms_freeing, ms_freed, and the ms_defer trees).
 		 * But if we grab the lock in this code path at a sync
 		 * pass later than 1, then it also contains the
 		 * segments of ms_freed (they were added to it earlier
 		 * in this path through ms_unflushed_frees). So we
 		 * need to remove all the segments that exist in
 		 * ms_freed from ms_allocatable as they will be added
 		 * later in metaslab_sync_done().
 		 *
 		 * When there's no log space map, the ms_allocatable
 		 * correctly doesn't contain any segments that exist
 		 * in ms_freed [see ms_synced_length].
 		 */
 		zfs_range_tree_walk(msp->ms_freed,
 		    zfs_range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * If we are not using the log space map, ms_allocatable
 	 * contains the segments that exist in the ms_defer trees
 	 * [see ms_synced_length]. Thus we need to remove them
 	 * from ms_allocatable as they will be added again in
 	 * metaslab_sync_done().
 	 *
 	 * If we are using the log space map, ms_allocatable still
 	 * contains the segments that exist in the ms_defer trees.
 	 * Not because it read them through the ms_sm though. But
 	 * because these segments are part of ms_unflushed_frees
 	 * whose segments we add to ms_allocatable earlier in this
 	 * code path.
 	 */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
 		    zfs_range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * Call metaslab_recalculate_weight_and_sort() now that the
 	 * metaslab is loaded so we get the metaslab's real weight.
 	 *
 	 * Unless this metaslab was created with older software and
 	 * has not yet been converted to use segment-based weight, we
 	 * expect the new weight to be better or equal to the weight
 	 * that the metaslab had while it was not loaded. This is
 	 * because the old weight does not take into account the
 	 * consolidation of adjacent segments between TXGs. [see
 	 * comment for ms_synchist and ms_deferhist[] for more info]
 	 */
 	uint64_t weight = msp->ms_weight;
 	uint64_t max_size = msp->ms_max_size;
 	metaslab_recalculate_weight_and_sort(msp);
 	if (!WEIGHT_IS_SPACEBASED(weight))
 		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	ASSERT3U(max_size, <=, msp->ms_max_size);
 	hrtime_t load_end = gethrtime();
 	msp->ms_load_time = load_end;
 	zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, smp_length %llu, "
 	    "unflushed_allocs %llu, unflushed_frees %llu, "
 	    "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
 	    "loading_time %lld ms, ms_max_size %llu, "
 	    "max size error %lld, "
 	    "old_weight %llx, new_weight %llx",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_freed),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]),
 	    (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]),
 	    (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
 	    (longlong_t)((load_end - load_start) / 1000000),
 	    (u_longlong_t)msp->ms_max_size,
 	    (u_longlong_t)msp->ms_max_size - max_size,
 	    (u_longlong_t)weight, (u_longlong_t)msp->ms_weight);
 
 	metaslab_verify_space(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_sync_lock);
 	return (0);
 }
 
 /*
  * Bring the metaslab's allocatable space into core unless it is already
  * loaded.
  *
  * Must be called with ms_lock held. The lock is held again on return, but
  * it may be dropped in between (see the comments on flushing below and the
  * space map read in metaslab_load_impl()).
  *
  * Returns 0 if the metaslab was already loaded, otherwise whatever
  * metaslab_load_impl() returns. In both cases threads waiting on
  * ms_load_cv are woken once ms_loading has been cleared.
  */
 int
 metaslab_load(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * There may be another thread loading the same metaslab, if that's
 	 * the case just wait until the other thread is done and return.
 	 */
 	metaslab_load_wait(msp);
 	if (msp->ms_loaded)
 		return (0);
 	VERIFY(!msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We set the loading flag BEFORE potentially dropping the lock to
 	 * wait for an ongoing flush (see ms_flushing below). This way other
 	 * threads know that there is already a thread that is loading this
 	 * metaslab.
 	 */
 	msp->ms_loading = B_TRUE;
 
 	/*
 	 * Wait for any in-progress flushing to finish as we drop the ms_lock
 	 * both here (during space_map_load()) and in metaslab_flush() (when
 	 * we flush our changes to the ms_sm).
 	 */
 	if (msp->ms_flushing)
 		metaslab_flush_wait(msp);
 
 	/*
 	 * In the possibility that we were waiting for the metaslab to be
 	 * flushed (where we temporarily dropped the ms_lock), ensure that
 	 * no one else loaded the metaslab somehow.
 	 */
 	ASSERT(!msp->ms_loaded);
 
 	/*
 	 * If we're loading a metaslab in the normal class, consider evicting
 	 * another one to keep our memory usage under the limit defined by the
 	 * zfs_metaslab_mem_limit tunable.
 	 */
 	if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
 	    msp->ms_group->mg_class) {
 		metaslab_potentially_evict(msp->ms_group->mg_class);
 	}
 
 	int error = metaslab_load_impl(msp);
 
 	/* Clear the flag and wake waiters whether or not the load worked. */
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
 
 	return (error);
 }
 
 /*
  * Discard the in-core contents of ms_allocatable and mark the metaslab as
  * unloaded. The caller must hold ms_lock. Calling this on a metaslab that
  * is not loaded is a no-op.
  */
 void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The metaslab may already be unloaded: this happens when it is
 	 * selected for eviction (in metaslab_potentially_evict) and then
 	 * unloaded during spa_sync (via metaslab_class_evict_old).
 	 */
 	if (!msp->ms_loaded)
 		return;
 
 	zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
 	msp->ms_unload_time = gethrtime();
 
 	/* Clear the activation state, including the bits in the weight. */
 	msp->ms_activation_weight = 0;
 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
 	/*
 	 * A NULL ms_group means that we came here from metaslab_fini(), at
 	 * which point there is no class list to leave and it doesn't make
 	 * sense to recalculate the weight or re-sort.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	/* Take the metaslab off its class's selected-txg multilist. */
 	metaslab_class_t *mclass = msp->ms_group->mg_class;
 	multilist_sublist_t *sublist =
 	    multilist_sublist_lock_obj(&mclass->mc_metaslab_txg_list, msp);
 	if (multilist_link_active(&msp->ms_class_txg_node))
 		multilist_sublist_remove(sublist, msp);
 	multilist_sublist_unlock(sublist);
 
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
 	    "ms_id %llu, weight %llx, "
 	    "selected txg %llu (%llu s ago), alloc_txg %llu, "
 	    "loaded %llu ms ago, max_size %llu",
 	    (u_longlong_t)spa_syncing_txg(spa), spa_name(spa),
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    (u_longlong_t)msp->ms_id,
 	    (u_longlong_t)msp->ms_weight,
 	    (u_longlong_t)msp->ms_selected_txg,
 	    (u_longlong_t)(NSEC2SEC(msp->ms_unload_time) -
 	    msp->ms_selected_time),
 	    (u_longlong_t)msp->ms_alloc_txg,
 	    (u_longlong_t)(msp->ms_unload_time -
 	    msp->ms_load_time) / 1000 / 1000,
 	    (u_longlong_t)msp->ms_max_size);
 
 	/*
 	 * Explicitly recalculate the weight now that the metaslab is not
 	 * loaded. Unloaded metaslabs should always have their weights
 	 * calculated from the space map histograms, while loaded ones have
 	 * theirs calculated from the in-core range tree [see
 	 * metaslab_load()]. This way, the weight reflects the information
 	 * available in-core, whether the metaslab is loaded or not.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 }
 
 /*
  * Pick the segment representation for a metaslab's range trees.
  *
  * To keep the per-metaslab range trees small, segments can be stored in
  * units of sectors, zero-indexed from the start of the metaslab. When
  * vdev_ms_shift - vdev_ashift is less than 32, both ends of a range then
  * fit in a uint32_t instead of a uint64_t. The compact form is not used
  * when zfs_metaslab_force_large_segs is set.
  *
  * On return *start and *shift hold the offset base and the unit shift to
  * create the trees with: (ms_start, vdev_ashift) for the 32-bit form and
  * (0, 0) for the 64-bit form.
  */
 zfs_range_seg_type_t
 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
     uint64_t *start, uint64_t *shift)
 {
 	/* Too wide for 32-bit segments, or large segments were forced. */
 	if (vdev->vdev_ms_shift - vdev->vdev_ashift >= 32 ||
 	    zfs_metaslab_force_large_segs) {
 		*shift = 0;
 		*start = 0;
 		return (ZFS_RANGE_SEG64);
 	}
 
 	*shift = vdev->vdev_ashift;
 	*start = msp->ms_start;
 	return (ZFS_RANGE_SEG32);
 }
 
 /*
  * Record that the metaslab was selected in the given txg and move it to
  * the tail of its class's mc_metaslab_txg_list sublist. The selection time
  * is stamped from gethrestime_sec(). The caller must hold ms_lock.
  */
 void
 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	metaslab_class_t *mclass = msp->ms_group->mg_class;
 	multilist_sublist_t *sublist =
 	    multilist_sublist_lock_obj(&mclass->mc_metaslab_txg_list, msp);
 
 	/* Unlink first if it is already on the list, then re-queue. */
 	if (multilist_link_active(&msp->ms_class_txg_node)) {
 		multilist_sublist_remove(sublist, msp);
 	}
 
 	msp->ms_selected_txg = txg;
 	msp->ms_selected_time = gethrestime_sec();
 
 	multilist_sublist_insert_tail(sublist, msp);
 	multilist_sublist_unlock(sublist);
 }
 
 /*
  * Apply the given alloc/defer/space deltas to the vdev and then to the
  * metaslab class. The class is additionally passed the deflated form of
  * space_delta, as computed by vdev_deflated_space().
  *
  * vd is asserted to be a direct child of the pool's root vdev and to have
  * a non-zero metaslab count.
  */
 void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
 {
 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
 
 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
 	ASSERT(vd->vdev_ms_count != 0);
 
 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
 	    vdev_deflated_space(vd, space_delta));
 }
 
 /*
  * Allocate and initialize the in-core metaslab with index id on mg's vdev,
  * and add it to the metaslab group.
  *
  * mg      group (and through it the vdev) the metaslab belongs to
  * id      index of the metaslab within the vdev
  * object  space map object to open, or 0 if none is to be opened
  * txg     0 when opening an existing pool, TXG_INITIAL when creating one,
  *         otherwise the txg in which the space is being added
  * msp     receives the new metaslab on success
  *
  * Returns 0 on success. If space_map_open() fails its error is returned,
  * *msp is not written, and the partially constructed metaslab (including
  * its locks and condition variables) is torn down again.
  */
 int
 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
     uint64_t txg, metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	metaslab_t *ms;
 	int error;
 
 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&ms->ms_class_txg_node);
 
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 	ms->ms_allocator = -1;
 	ms->ms_new = B_TRUE;
 
 	/* Let the vdev type adjust the metaslab's start and size. */
 	vdev_ops_t *ops = vd->vdev_ops;
 	if (ops->vdev_op_metaslab_init != NULL)
 		ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
 
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it. For
 	 * readonly pools there is no need to open the space map object.
 	 *
 	 * Note:
 	 * When called from vdev_expand(), we can't call into the DMU as
 	 * we are holding the spa_config_lock as a writer and we would
 	 * deadlock [see relevant comment in vdev_metaslab_init()]. in
 	 * that case, the object parameter is zero though, so we won't
 	 * call into the DMU.
 	 */
 	if (object != 0 && !(spa->spa_mode == SPA_MODE_READ &&
 	    !spa->spa_read_spacemaps)) {
 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 		    ms->ms_size, vd->vdev_ashift);
 
 		if (error != 0) {
 			/*
 			 * Tear down the synchronization objects that were
 			 * initialized above before freeing the memory that
 			 * contains them, mirroring metaslab_fini().
 			 */
 			cv_destroy(&ms->ms_load_cv);
 			cv_destroy(&ms->ms_flush_cv);
 			mutex_destroy(&ms->ms_lock);
 			mutex_destroy(&ms->ms_sync_lock);
 			kmem_free(ms, sizeof (metaslab_t));
 			return (error);
 		}
 
 		ASSERT(ms->ms_sm != NULL);
 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 	}
 
 	/* All of the metaslab's range trees share one segment layout. */
 	uint64_t shift, start;
 	zfs_range_seg_type_t type =
 	    metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
 
 	ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
 	    shift);
 	for (int t = 0; t < TXG_SIZE; t++) {
 		ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
 		    NULL, start, shift);
 	}
 	ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
 		    start, shift);
 	}
 	ms->ms_checkpointing =
 	    zfs_range_tree_create(NULL, type, NULL, start, shift);
 	ms->ms_unflushed_allocs =
 	    zfs_range_tree_create(NULL, type, NULL, start, shift);
 
 	/* ms_unflushed_frees is additionally tracked by segment size. */
 	metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
 	mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 	ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
 	    type, mrap, start, shift);
 
 	ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
 
 	metaslab_group_add(mg, ms);
 	metaslab_set_fragmentation(ms, B_FALSE);
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
 	 * The metaslab's weight will also be initialized when we sync
 	 * out this txg. This ensures that we don't attempt to allocate
 	 * from it before we have initialized it completely.
 	 */
 	if (txg <= TXG_INITIAL) {
 		metaslab_sync_done(ms, 0);
 		metaslab_space_update(vd, mg->mg_class,
 		    metaslab_allocated_space(ms), 0, 0);
 	}
 
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
 	*msp = ms;
 
 	return (0);
 }
 
 /*
  * Drop the metaslab from the pool-wide bookkeeping of unflushed metaslabs
  * as part of tearing it down.
  *
  * A metaslab whose unflushed txg is 0 is expected not to be in
  * spa_metaslabs_by_flushed, and nothing is done for it. Otherwise the
  * metaslab is removed from that AVL tree under spa_flushed_ms_lock, and the
  * metaslab counts of the log space map and log summary for its unflushed
  * txg are decremented.
  */
 static void
 metaslab_fini_flush_data(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	if (metaslab_unflushed_txg(msp) == 0) {
 		ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
 		    ==, NULL);
 		return;
 	}
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
 	spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp),
 	    metaslab_unflushed_dirty(msp));
 }
 
 /*
  * Return the number of segments held in the metaslab's ms_unflushed_allocs
  * and ms_unflushed_frees trees, multiplied by the btree element size of
  * ms_unflushed_allocs (which is applied to the segments of both trees).
  */
 uint64_t
 metaslab_unflushed_changes_memused(metaslab_t *ms)
 {
 	return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) +
 	    zfs_range_tree_numsegs(ms->ms_unflushed_frees)) *
 	    ms->ms_unflushed_allocs->rt_root.bt_elem_size);
 }
 
 /*
  * Tear down and free a metaslab.
  *
  * The metaslab is first removed from the unflushed-metaslab bookkeeping and
  * from its group. Then, under ms_lock, its space accounting is backed out
  * (unless it is still new), its space map is closed, it is unloaded, and
  * all of its range trees are destroyed. Finally the condition variables and
  * locks are destroyed and the structure itself is freed, so msp must not be
  * used after this returns.
  */
 void
 metaslab_fini(metaslab_t *msp)
 {
 	/* Captured up front: ms_group is NULL once the group is left. */
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 
 	metaslab_fini_flush_data(msp);
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 	VERIFY(msp->ms_group == NULL);
 
 	/*
 	 * If this metaslab hasn't been through metaslab_sync_done() yet its
 	 * space hasn't been accounted for in its vdev and doesn't need to be
 	 * subtracted.
 	 */
 	if (!msp->ms_new) {
 		metaslab_space_update(vd, mg->mg_class,
 		    -metaslab_allocated_space(msp), 0, -msp->ms_size);
 
 	}
 	space_map_close(msp->ms_sm);
 	msp->ms_sm = NULL;
 
 	metaslab_unload(msp);
 
 	zfs_range_tree_destroy(msp->ms_allocatable);
 	zfs_range_tree_destroy(msp->ms_freeing);
 	zfs_range_tree_destroy(msp->ms_freed);
 
 	/*
 	 * Give back the memory attributed to this metaslab's unflushed
 	 * changes before the trees that hold them are emptied.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	zfs_range_tree_destroy(msp->ms_unflushed_allocs);
 	zfs_range_tree_destroy(msp->ms_checkpointing);
 	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 	zfs_range_tree_destroy(msp->ms_unflushed_frees);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		zfs_range_tree_destroy(msp->ms_allocating[t]);
 	}
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_destroy(msp->ms_defer[t]);
 	}
 	ASSERT0(msp->ms_deferspace);
 
 	/* The metaslab must not be on any of the vdev's per-txg lists. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
 	zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
 	zfs_range_tree_destroy(msp->ms_trim);
 
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	cv_destroy(&msp->ms_flush_cv);
 	mutex_destroy(&msp->ms_lock);
 	mutex_destroy(&msp->ms_sync_lock);
 	ASSERT3U(msp->ms_allocator, ==, -1);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
 /*
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
  * by calculating the space in each bucket of the spacemap histogram and
  * multiplying that by the fragmentation metric in this table. Doing
  * this for all buckets and dividing it by the total amount of free
  * space in this metaslab (i.e. the total free space in all buckets) gives
  * us the fragmentation metric. This means that a high fragmentation metric
  * equates to most of the free space being comprised of small segments.
  * Conversely, if the metric is low, then most of the free space is in
  * large segments.
  *
  * This table defines 0% fragmented space using 512M segments. Using this value,
  * we derive the rest of the table. This table originally went up to 16MB, but
  * with larger recordsizes, larger ashifts, and use of raidz3, it is possible
  * to have significantly larger allocations than were previously possible.
  * Since the fragmentation value is never stored on disk, it is possible to
  * change these calculations in the future.
  */
 static const int zfs_frag_table[] = {
 	100,	/* 512B	*/
 	99,	/* 1K	*/
 	97,	/* 2K	*/
 	93,	/* 4K	*/
 	88,	/* 8K	*/
 	83,	/* 16K	*/
 	77,	/* 32K	*/
 	71,	/* 64K	*/
 	64,	/* 128K	*/
 	57,	/* 256K	*/
 	50,	/* 512K	*/
 	43,	/* 1M	*/
 	36,	/* 2M	*/
 	29,	/* 4M	*/
 	23,	/* 8M	*/
 	17,	/* 16M	*/
 	12,	/* 32M	*/
 	7,	/* 64M	*/
 	3,	/* 128M	*/
 	1,	/* 256M	*/
 	0,	/* 512M	*/
 };
 /* Number of entries in zfs_frag_table (one per segment-size bucket). */
 #define	FRAGMENTATION_TABLE_SIZE \
 	(sizeof (zfs_frag_table)/(sizeof (zfs_frag_table[0])))
 
 /*
  * Compute the metaslab's fragmentation metric and store it in
  * ms_fragmentation.
  *
  * ZFS_FRAG_INVALID is stored when the metric is unavailable: either the
  * space map histogram feature is not enabled, or this metaslab's space map
  * has not been upgraded yet. Otherwise the stored value is expected to be
  * in the range [0, 100].
  *
  * When nodirty is set, a space map that still needs upgrading is not
  * flagged for condensing and the vdev is not dirtied.
  */
 static void
 metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/* Without histogram support there is nothing to compute from. */
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * No space map at all means the whole metaslab is free, so it is
 	 * not fragmented.
 	 */
 	if (msp->ms_sm == NULL) {
 		msp->ms_fragmentation = 0;
 		return;
 	}
 
 	/*
 	 * The space map has not been upgraded yet. Flag it so that it gets
 	 * upgraded the next time we encounter it.
 	 */
 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 		uint64_t txg = spa_syncing_txg(spa);
 		vdev_t *vd = msp->ms_group->mg_vd;
 
 		/*
 		 * Reaching the final dirty txg means the pool must be
 		 * shutting down, and nothing may be dirtied past that
 		 * point, so the condense request is skipped; it can be
 		 * retried the next time the pool is imported. It is also
 		 * skipped if the caller explicitly asked for nodirty.
 		 */
 		if (!nodirty &&
 		    spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
 			msp->ms_condense_wanted = B_TRUE;
 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 			zfs_dbgmsg("txg %llu, requesting force condense: "
 			    "ms_id %llu, vdev_id %llu", (u_longlong_t)txg,
 			    (u_longlong_t)msp->ms_id,
 			    (u_longlong_t)vd->vdev_id);
 		}
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * Weight the free space of every histogram bucket by the table
 	 * entry for its segment size, then average over all free space.
 	 */
 	uint8_t shift = msp->ms_sm->sm_shift;
 	uint64_t weighted_sum = 0;
 	uint64_t free_total = 0;
 
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
 			continue;
 
 		/* Buckets past the end of the table use its last entry. */
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
 		uint64_t space =
 		    msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
 
 		free_total += space;
 
 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 		weighted_sum += space * zfs_frag_table[idx];
 	}
 
 	if (free_total > 0)
 		weighted_sum /= free_total;
 	ASSERT3U(weighted_sum, <=, 100);
 
 	msp->ms_fragmentation = weighted_sum;
 }
 
 /*
  * Compute a space-based weight -- a selection preference value -- for the
  * given metaslab. The inputs are the amount of free space, the level of
  * fragmentation, the metaslab's position on the device (LBA range), and
  * whether the metaslab is loaded. The caller must hold ms_lock. The result
  * is tagged as space-based.
  */
 static uint64_t
 metaslab_space_weight(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/* Start from the metaslab's free space. */
 	uint64_t space = msp->ms_size - metaslab_allocated_space(msp);
 
 	if (msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    metaslab_fragmentation_factor_enabled) {
 		/*
 		 * Scale the free space down in inverse proportion to the
 		 * fragmentation. The fragmentation value is reduced by 1
 		 * so that a 100% fragmented metaslab is not excluded
 		 * completely.
 		 */
 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
 
 		/*
 		 * A weight below SPA_MINBLOCKSIZE would mean we never
 		 * allocate from this metaslab again. If the scaling above
 		 * took us there, bump the value back up to
 		 * SPA_MINBLOCKSIZE so the remaining space can still be
 		 * consumed.
 		 */
 		if (space > 0 && space < SPA_MINBLOCKSIZE)
 			space = SPA_MINBLOCKSIZE;
 	}
 
 	uint64_t weight = space;
 
 	/*
 	 * Modern disks have uniform bit density and constant angular
 	 * velocity, so the outer recording zones are faster (higher
 	 * bandwidth) than the inner ones by the ratio of outer to inner
 	 * track diameter, typically around 2:1. Lower-numbered metaslabs
 	 * therefore get a multiplier ranging from 2x down to 1x; in effect
 	 * we prefer the metaslab with the most free bandwidth rather than
 	 * simply the most free space.
 	 */
 	if (metaslab_lba_weighting_enabled && !vd->vdev_nonrot) {
 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 		ASSERT(weight >= space && weight <= 2 * space);
 	}
 
 	/*
 	 * A metaslab that is actively being used keeps its active bits so
 	 * that it stays preferable to any inactive metaslab and gets
 	 * polished off -- unless its fragmentation has exceeded our
 	 * threshold, in which case it is not marked active.
 	 */
 	boolean_t keep_active = msp->ms_loaded &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold;
 	if (keep_active)
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	WEIGHT_SET_SPACEBASED(weight);
 	return (weight);
 }
 
 /*
  * Compute the segment-based weight of a loaded metaslab from the histogram
  * of its ms_allocatable range tree. The metaslab must be loaded. This can
  * be called within a sync pass since it relies only on the range tree,
  * which is always accurate while the metaslab is loaded.
  *
  * Returns 0 if no bucket at or above SPA_MINBLOCKSHIFT holds a segment.
  */
 static uint64_t
 metaslab_weight_from_range_tree(metaslab_t *msp)
 {
 	ASSERT(msp->ms_loaded);
 
 	uint8_t ashift = msp->ms_group->mg_vd->vdev_ashift;
 	int top_idx = SPACE_MAP_HISTOGRAM_SIZE + ashift - 1;
 	uint32_t nsegs = 0;
 
 	for (int bucket = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1;
 	    bucket >= SPA_MINBLOCKSHIFT; bucket--) {
 		/*
 		 * Carry the running count down a bucket: every segment
 		 * counted so far is worth two of the next smaller size.
 		 */
 		nsegs <<= 1;
 		nsegs += msp->ms_allocatable->rt_histogram[bucket];
 
 		/*
 		 * The range tree is more precise than the space map, so
 		 * buckets above the space map histogram's top index are
 		 * only folded into the count. Downgrading like this keeps
 		 * loaded and unloaded metaslabs comparable when deciding
 		 * which one is "best". The first bucket within range that
 		 * has a non-zero count determines the weight.
 		 */
 		if (bucket <= top_idx && nsegs != 0) {
 			uint64_t weight = 0;
 
 			WEIGHT_SET_COUNT(weight, nsegs);
 			WEIGHT_SET_INDEX(weight, bucket);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			return (weight);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Compute the segment-based weight from the on-disk space map histogram.
  * This should only be applied to unloaded metaslabs (i.e. no incoming
  * allocations) so that the result is consistent with the on-disk state.
  *
  * Returns 0 if every bucket is empty once pending frees are discounted.
  */
 static uint64_t
 metaslab_weight_from_spacemap(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 
 	ASSERT(!msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(space_map_object(sm), !=, 0);
 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/*
 	 * Scan from the largest bucket down. Some segments have made it
 	 * into the space map's histogram but are not yet available for
 	 * allocation because they are still in the freeing pipeline (e.g.
 	 * the freeing, freed, and defer trees). Their per-bucket counts
 	 * (ms_synchist plus every ms_deferhist) are subtracted from the
 	 * space map's count to get a more accurate weight.
 	 */
 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
 		uint64_t pending = msp->ms_synchist[i];
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++)
 			pending += msp->ms_deferhist[t][i];
 
 		ASSERT3U(sm->sm_phys->smp_histogram[i], >=, pending);
 		uint64_t count = sm->sm_phys->smp_histogram[i] - pending;
 		if (count == 0)
 			continue;
 
 		/* The highest non-empty bucket determines the weight. */
 		uint64_t weight = 0;
 
 		WEIGHT_SET_COUNT(weight, count);
 		WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
 		WEIGHT_SET_ACTIVE(weight, 0);
 		return (weight);
 	}
 
 	return (0);
 }
 
 /*
  * Compute a segment-based weight for the specified metaslab. The weight is
  * determined by the highest non-empty bucket of the histogram, and the
  * information for that bucket (its index and segment count) is encoded
  * into the weight value. The caller must hold ms_lock.
  */
 static uint64_t
 metaslab_segment_weight(metaslab_t *msp)
 {
 	uint8_t ashift = msp->ms_group->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * A completely free metaslab is described as one segment of its
 	 * full size. If that size lies at or beyond the top histogram
 	 * index, it is expressed as a power-of-two number of segments in
 	 * the top bucket instead.
 	 */
 	if (metaslab_allocated_space(msp) == 0) {
 		uint64_t free_weight = 0;
 		int top_idx = SPACE_MAP_HISTOGRAM_SIZE + ashift - 1;
 		int size_idx = highbit64(msp->ms_size) - 1;
 
 		if (size_idx >= top_idx) {
 			WEIGHT_SET_COUNT(free_weight,
 			    1ULL << (size_idx - top_idx));
 			WEIGHT_SET_INDEX(free_weight, top_idx);
 		} else {
 			WEIGHT_SET_COUNT(free_weight, 1ULL);
 			WEIGHT_SET_INDEX(free_weight, size_idx);
 		}
 		WEIGHT_SET_ACTIVE(free_weight, 0);
 		ASSERT(!WEIGHT_IS_SPACEBASED(free_weight));
 		return (free_weight);
 	}
 
 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/* A fully allocated metaslab simply gets a weight of 0. */
 	if (metaslab_allocated_space(msp) == msp->ms_size)
 		return (0);
 
 	/*
 	 * A loaded metaslab is weighed from its range tree; otherwise we
 	 * rely on the space map information.
 	 */
 	uint64_t weight = msp->ms_loaded ?
 	    metaslab_weight_from_range_tree(msp) :
 	    metaslab_weight_from_spacemap(msp);
 
 	/*
 	 * If the metaslab was active the last time its weight was
 	 * calculated, keep it active: we want to consume the entire region
 	 * associated with this weight.
 	 */
 	if (weight != 0 && msp->ms_activation_weight != 0)
 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
 
 	return (weight);
 }
 
 /*
  * Decide whether an allocation of asize should be attempted from this
  * metaslab.
  *
  * When ms_max_size can be trusted (see below) the answer is simply whether
  * the largest free segment is big enough. Otherwise the decision is based
  * on the metaslab's weight: for segment-based weights the maximum possible
  * allocation follows from the index encoded in the weight, and for
  * space-based weights the entire weight (excluding the weight-type bit) is
  * used.
  */
 static boolean_t
 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
 {
 	/*
 	 * The checks below usually, but not always, catch this case.
 	 * Metaslabs can be loaded by various means, including the trim and
 	 * initialize code, and once that happens they would be allocatable
 	 * even before finishing their first txg sync without this check.
 	 */
 	if (unlikely(msp->ms_new))
 		return (B_FALSE);
 
 	/*
 	 * For a loaded metaslab ms_max_size is definitive. For an unloaded
 	 * one it is a lower bound (once set), which we still rely on as
 	 * long as we're not in try_hard and fewer than
 	 * zfs_metaslab_max_size_cache_sec seconds have passed since the
 	 * metaslab was unloaded.
 	 */
 	boolean_t use_max_size = msp->ms_loaded;
 	if (!use_max_size && msp->ms_max_size != 0 && !try_hard) {
 		use_max_size = (gethrtime() < msp->ms_unload_time +
 		    SEC2NSEC(zfs_metaslab_max_size_cache_sec));
 	}
 	if (use_max_size)
 		return (msp->ms_max_size >= asize);
 
 	/* A space-based weight is compared against asize directly. */
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 		return (asize <=
 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
 	}
 
 	/*
 	 * A segment weight indicates segments in the range
 	 * [2^i, 2^(i+1)), where i is the index in the weight. Since asize
 	 * might fall in the middle of that range, attempt the allocation
 	 * whenever asize < 2^(i+1).
 	 */
 	return (asize <
 	    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
 }
 
 /*
  * Compute the weight of a metaslab. As a side effect this refreshes the
  * metaslab's fragmentation metric and its cached ms_max_size. The caller
  * must hold ms_lock.
  */
 static uint64_t
 metaslab_weight(metaslab_t *msp, boolean_t nodirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	metaslab_set_fragmentation(msp, nodirty);
 
 	/*
 	 * Refresh the maximum size. An unloaded metaslab can only raise
 	 * its estimate, using the largest segment found in the unflushed
 	 * frees; that is a lower bound, because coalescing of adjacent
 	 * entries may reveal larger segments that we cannot see until the
 	 * space map is loaded into a range tree. A loaded metaslab gets an
 	 * accurate value, which also accounts for newly freed space that
 	 * has been added back into the free tree.
 	 */
 	if (!msp->ms_loaded) {
 		msp->ms_max_size = MAX(msp->ms_max_size,
 		    metaslab_largest_unflushed_free(msp));
 	} else {
 		msp->ms_max_size = metaslab_largest_allocatable(msp);
 	}
 
 	/*
 	 * Segment-based weighting needs space map histogram support; fall
 	 * back to space-based weighting when it is disabled, when the
 	 * feature is not enabled, or when this metaslab's space map does
 	 * not have a space_map_phys_t sized bonus buffer.
 	 */
 	if (!zfs_metaslab_segment_weight_enabled ||
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM))
 		return (metaslab_space_weight(msp));
 
 	if (msp->ms_sm != NULL && msp->ms_sm->sm_dbuf->db_size !=
 	    sizeof (space_map_phys_t))
 		return (metaslab_space_weight(msp));
 
 	return (metaslab_segment_weight(msp));
 }
 
 /*
  * Recompute the metaslab's weight and re-sort it within its group. The
  * caller must hold ms_lock.
  */
 void
 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Carry the activation bits (e.g. the primary indication) over to
 	 * the new weight so the metaslab keeps its current activation.
 	 */
 	uint64_t active_bits = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	uint64_t new_weight = metaslab_weight(msp, B_FALSE);
 
 	metaslab_group_sort(msp->ms_group, msp, new_weight | active_bits);
 }
 
 /*
  * Mark msp active with the given activation weight. For primary and
  * secondary activations the metaslab is installed in the matching slot of
  * the group's per-allocator state; returns EEXIST if that slot is already
  * occupied and 0 otherwise. The caller must hold ms_lock.
  */
 static int
 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     int allocator, uint64_t activation_weight)
 {
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * An activation on behalf of the claim code is not tied to any
 	 * particular allocator, so only the weight is updated.
 	 */
 	if (activation_weight == METASLAB_WEIGHT_CLAIM) {
 		ASSERT0(msp->ms_activation_weight);
 		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort(mg, msp, msp->ms_weight |
 		    activation_weight);
 		return (0);
 	}
 
 	boolean_t is_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
 	metaslab_t **slot = is_primary ?
 	    &mga->mga_primary : &mga->mga_secondary;
 	int error = 0;
 
 	mutex_enter(&mg->mg_lock);
 	if (*slot != NULL) {
 		/* Another metaslab already holds this slot. */
 		error = EEXIST;
 	} else {
 		*slot = msp;
 		ASSERT3S(msp->ms_allocator, ==, -1);
 		msp->ms_allocator = allocator;
 		msp->ms_primary = is_primary;
 
 		/* Remember the pre-activation weight before re-sorting. */
 		ASSERT0(msp->ms_activation_weight);
 		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort_impl(mg, msp,
 		    msp->ms_weight | activation_weight);
 	}
 	mutex_exit(&mg->mg_lock);
 
 	return (error);
 }
 
 /*
  * Load the metaslab if necessary and activate it for the given allocator
  * with the given activation weight. Returns 0 on success (including when
  * the metaslab is already suitably active), the error from
  * metaslab_load(), EBUSY if the metaslab turned out to be active for a
  * different allocator or activation weight, ENOSPC if it has no space
  * left, or the error from metaslab_activate_allocator(). The caller must
  * hold ms_lock.
  */
 static int
 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 {
 	int error;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Nothing to do if the metaslab is already active. Note that
 	 * "already active" does not imply it is active for our allocator
 	 * or with our requested activation weight: it may have started out
 	 * active for our allocator and changed allocators while we waited
 	 * for its ms_lock, or we may have stolen it [see
 	 * find_valid_metaslab()]. Consequently this thread may end up
 	 * passivating a metaslab that belongs to another allocator or to a
 	 * different activation mask.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 		ASSERT(msp->ms_loaded);
 		return (0);
 	}
 
 	error = metaslab_load(msp);
 	if (error != 0) {
 		metaslab_group_sort(msp->ms_group, msp, 0);
 		return (error);
 	}
 
 	/*
 	 * metaslab_load() may have dropped the ms_lock, either because we
 	 * were loading this metaslab ourselves or because we waited for
 	 * another thread to load it. Re-examine the weight to find out
 	 * whether somebody else activated the metaslab in the meantime.
 	 */
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		/*
 		 * A metaslab with literally no space has weight 0 and is
 		 * not worth activating. This can happen if it had space
 		 * during find_valid_metaslab but another thread loaded it
 		 * and consumed that space while we waited for the lock.
 		 */
 		if (msp->ms_weight == 0) {
 			ASSERT0(zfs_range_tree_space(msp->ms_allocatable));
 			return (SET_ERROR(ENOSPC));
 		}
 
 		error = metaslab_activate_allocator(msp->ms_group, msp,
 		    allocator, activation_weight);
 		if (error != 0)
 			return (error);
 
 		ASSERT(msp->ms_loaded);
 		ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 		return (0);
 	}
 
 	/*
 	 * Somebody else activated it. If that was for another allocator,
 	 * or with a different activation weight (e.g. we wanted a primary
 	 * but it became a secondary), report EBUSY. If it matches what we
 	 * asked for there is nothing left to do.
 	 */
 	if (msp->ms_allocator != allocator)
 		return (EBUSY);
 
 	if ((msp->ms_weight & activation_weight) == 0)
 		return (SET_ERROR(EBUSY));
 
 	EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
 	    msp->ms_primary);
 	return (0);
 }
 
 /*
  * Detach msp from its allocator slot (if it occupies one) and re-sort it
  * in the group with the supplied weight. The caller must hold ms_lock and
  * the metaslab must be loaded.
  */
 static void
 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	/* Claim activations never occupy an allocator slot. */
 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		metaslab_group_sort(mg, msp, weight);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT3P(msp->ms_group, ==, mg);
 	ASSERT3S(0, <=, msp->ms_allocator);
 	ASSERT3U(msp->ms_allocator, <, mg->mg_class->mc_spa->spa_alloc_count);
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
 	if (!msp->ms_primary) {
 		ASSERT3P(mga->mga_secondary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		mga->mga_secondary = NULL;
 	} else {
 		ASSERT3P(mga->mga_primary, ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		mga->mga_primary = NULL;
 	}
 
 	/* The metaslab no longer belongs to any allocator. */
 	msp->ms_allocator = -1;
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Passivate an active metaslab, giving it the supplied (non-active)
  * weight and clearing its recorded activation weight.
  */
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
 
 	/*
 	 * A space-based weight below SPA_MINBLOCKSIZE means we will never
 	 * allocate from this metaslab again, so it had better be empty;
 	 * otherwise we would be leaving space on the table.
 	 */
 	ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
 	    size >= SPA_MINBLOCKSIZE ||
 	    zfs_range_tree_space(msp->ms_allocatable) == 0);
 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
 	ASSERT(msp->ms_activation_weight != 0);
 	msp->ms_activation_weight = 0;
 
 	metaslab_passivate_allocator(mg, msp, weight);
 	ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
 }
 
 /*
  * A segment-based metaslab is activated once and stays active until an
  * allocation attempt fails (as with space-based metaslabs) or until the
  * free space in zfs_metaslab_switch_threshold buckets has been exhausted
  * since activation. This function detects the latter condition and
  * passivates the metaslab proactively, which lets us pick a metaslab with
  * a larger contiguous region, if one remains in this metaslab group. In
  * sync pass > 1 we keep using the current metaslab so that we do not dirty
  * more blocks and cause additional sync passes.
  */
 static void
 metaslab_segment_may_passivate(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight))
 		return;
 	if (spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * While a single largest free segment covers the majority of the
 	 * free space, do not treat the metaslab as fragmented. This should
 	 * let us fill new, unfragmented metaslabs before switching.
 	 */
 	if (metaslab_largest_allocatable(msp) >
 	    zfs_range_tree_space(msp->ms_allocatable) * 15 / 16)
 		return;
 
 	/*
 	 * We are in the middle of a sync pass, so the most accurate data
 	 * available is the in-core range tree histogram; derive the new
 	 * weight from it.
 	 */
 	uint64_t weight = metaslab_weight_from_range_tree(msp);
 	int current_idx = WEIGHT_GET_INDEX(weight);
 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
 
 	/* Not enough buckets exhausted yet: stay active. */
 	if (current_idx > activation_idx - zfs_metaslab_switch_threshold)
 		return;
 
 	metaslab_passivate(msp, weight);
 }
 
 /*
  * Taskq callback: load the metaslab passed in arg and record the syncing
  * txg as its selected txg. Any error from metaslab_load() is deliberately
  * ignored.
  */
 static void
 metaslab_preload(void *arg)
 {
 	metaslab_t *msp = arg;
 	spa_t *spa = msp->ms_group->mg_class->mc_spa;
 	fstrans_cookie_t fstrans = spl_fstrans_mark();
 
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
 	(void) metaslab_load(msp);
 	metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
 	mutex_exit(&msp->ms_lock);
 
 	spl_fstrans_unmark(fstrans);
 }
 
 /*
  * Dispatch preload tasks for the metaslabs of this group, walking the
  * group's metaslab tree in order while holding mg_lock. Does nothing if
  * the pool is shutting down or preloading is disabled.
  */
 static void
 metaslab_group_preload(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	metaslab_t *msp;
 	int m = 0;
 
 	if (spa_shutting_down(spa) || !metaslab_preload_enabled)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 
 	/*
 	 * Load the next potential metaslabs
 	 */
 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
 		ASSERT3P(msp->ms_group, ==, mg);
 
 		/*
 		 * Only the first metaslab_preload_limit metaslabs are
 		 * preloaded, plus any metaslab that is being forced to
 		 * condense; preloading the latter ensures that the forced
 		 * condense happens in the next txg.
 		 */
 		m++;
 		if (m <= metaslab_preload_limit || msp->ms_condense_wanted) {
 			/*
 			 * The first spa_alloc_count dispatches are queued
 			 * with TQ_FRONT.
 			 */
 			VERIFY(taskq_dispatch(spa->spa_metaslab_taskq,
 			    metaslab_preload, msp, TQ_SLEEP |
 			    (m <= spa->spa_alloc_count ? TQ_FRONT : 0))
 			    != TASKQID_INVALID);
 		}
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Determine if the space map's on-disk footprint is past our tolerance for
  * inefficiency. We would like to use the following criteria to make our
  * decision:
  *
  * 1. Do not condense if the size of the space map object would dramatically
  *    increase as a result of writing out the free space range tree.
  *
  * 2. Condense if the on-disk space map representation is at least
  *    zfs_condense_pct/100 times the size of the optimal representation
  *    (e.g. with zfs_condense_pct = 110 and an optimal size of 1MB, condense
  *    once the on-disk size reaches 1.1MB).
  *
  * 3. Do not condense if the on-disk size of the space map does not actually
  *    decrease.
  *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression,
  * etc. Instead, we apply the heuristic described in the block comment for
  * zfs_metaslab_condense_block_threshold - we only condense if the space used
  * is greater than a threshold number of blocks.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	vdev_t *vd = msp->ms_group->mg_vd;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
 
 	/*
 	 * Empty metaslabs, and metaslabs for which a condense has been
 	 * requested, are always condensed.
 	 */
 	if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0)
 		return (B_TRUE);
 	if (msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	/* Criterion 2: on-disk size relative to the optimal size. */
 	uint64_t object_size = space_map_length(sm);
 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
 	    msp->ms_allocatable, SM_NO_VDEVID);
 	if (object_size < (optimal_size * zfs_condense_pct / 100))
 		return (B_FALSE);
 
 	/* Block-count heuristic, measured in the larger block size. */
 	uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift;
 	uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
 
 	return (object_size >
 	    zfs_metaslab_condense_block_threshold * record_size);
 }
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed
  * by the entries of the free range tree (ms_allocatable). The condensed
  * spacemap contains all the entries of previous TXGs (including those in
  * the pool-wide log spacemaps; thus this is effectively a superset of
  * metaslab_flush()), but this TXG's entries still need to be written.
  *
  * The caller must hold ms_lock. The lock is dropped while the space map
  * is rewritten and re-acquired before returning; ms_condensing is set for
  * the duration of that window.
  */
 static void
 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
 {
 	zfs_range_tree_t *condense_tree;
 	space_map_t *sm = msp->ms_sm;
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_sm != NULL);
 
 	/*
 	 * In order to condense the space map, we need to change it so it
 	 * only describes which segments are currently allocated and free.
 	 *
 	 * All the current free space resides in the ms_allocatable, all
 	 * the ms_defer trees, and all the ms_allocating trees. We ignore
 	 * ms_freed because it is empty because we're in sync pass 1. We
 	 * ignore ms_freeing because these changes are not yet reflected
 	 * in the spacemap (they will be written later this txg).
 	 *
 	 * So to truncate the space map to represent all the entries of
 	 * previous TXGs we do the following:
 	 *
 	 * 1] We create a range tree (condense tree) that is 100% empty.
 	 * 2] We add to it all segments found in the ms_defer trees
 	 *    as those segments are marked as free in the original space
 	 *    map. We do the same with the ms_allocating trees for the same
 	 *    reason. Adding these segments should be a relatively
 	 *    inexpensive operation since we expect these trees to have a
 	 *    small number of nodes.
 	 * 3] We vacate any unflushed allocs, since they are not frees we
 	 *    need to add to the condense tree. Then we vacate any
 	 *    unflushed frees as they should already be part of ms_allocatable.
 	 * 4] At this point, we would ideally like to add all segments
 	 *    in the ms_allocatable tree to the condense tree. This way
 	 *    we would write all the entries of the condense tree as the
 	 *    condensed space map, which would only contain freed
 	 *    segments with everything else assumed to be allocated.
 	 *
 	 *    Doing so can be prohibitively expensive as ms_allocatable can
 	 *    be large, and therefore computationally expensive to add to
 	 *    the condense_tree. Instead we first sync out an entry marking
 	 *    everything as allocated, then the ms_allocatable and then the
 	 *    condense_tree, in the condensed space map. While this is not
 	 *    optimal, it is typically close to optimal and more importantly
 	 *    much cheaper to compute.
 	 *
 	 * 5] Finally, as both of the unflushed trees were written to our
 	 *    new and condensed metaslab space map, we basically flushed
 	 *    all the unflushed changes to disk, thus we call
 	 *    metaslab_flush_update().
 	 */
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
 
 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
 	    "spa %s, smp size %llu, segments %llu, forcing condense=%s",
 	    (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp,
 	    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 	    spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm),
 	    (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	/* The request (if any) is being honored now. */
 	msp->ms_condense_wanted = B_FALSE;
 
 	zfs_range_seg_type_t type;
 	uint64_t shift, start;
 	type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
 	    &start, &shift);
 
 	/* Steps 1] and 2]: build the condense tree. */
 	condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		zfs_range_tree_walk(msp->ms_defer[t],
 		    zfs_range_tree_add, condense_tree);
 	}
 
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
 		    zfs_range_tree_add, condense_tree);
 	}
 
 	/*
 	 * Step 3]: drop the unflushed changes, first subtracting their
 	 * memory usage from the pool-wide accounting.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing other
 	 * consumers to change its content. Set the metaslab's ms_condensing
 	 * flag to ensure that allocations on this metaslab do not occur
 	 * while we're in the middle of committing it to disk. This is only
 	 * critical for ms_allocatable as all other range trees use per TXG
 	 * views of their content.
 	 */
 	msp->ms_condensing = B_TRUE;
 
 	mutex_exit(&msp->ms_lock);
 	uint64_t object = space_map_object(msp->ms_sm);
 	space_map_truncate(sm,
 	    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 	    zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
 
 	/*
 	 * space_map_truncate() may have reallocated the spacemap object.
 	 * If so, update the vdev_ms_array.
 	 */
 	if (space_map_object(msp->ms_sm) != object) {
 		object = space_map_object(msp->ms_sm);
 		dmu_write(spa->spa_meta_objset,
 		    msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &object, tx);
 	}
 
 	/*
 	 * Note:
 	 * When the log space map feature is enabled, each space map will
 	 * always have ALLOCS followed by FREES for each sync pass. This is
 	 * typically true even when the log space map feature is disabled,
 	 * except from the case where a metaslab goes through metaslab_sync()
 	 * and gets condensed. In that case the metaslab's space map will have
 	 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
 	 * followed by FREES (due to space_map_write() in metaslab_sync()) for
 	 * sync pass 1.
 	 */
 	/*
 	 * Step 4]: tmp_tree holds a single segment covering the whole
 	 * metaslab; it is written as one ALLOC entry, followed by the
 	 * FREE entries of ms_allocatable and of the condense tree.
 	 */
 	zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL,
 	    start, shift);
 	zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
 	space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
 
 	/* Both temporary trees are emptied before being destroyed. */
 	zfs_range_tree_vacate(condense_tree, NULL, NULL);
 	zfs_range_tree_destroy(condense_tree);
 	zfs_range_tree_vacate(tmp_tree, NULL, NULL);
 	zfs_range_tree_destroy(tmp_tree);
 	mutex_enter(&msp->ms_lock);
 
 	/* Step 5]: allocations may resume; record the flush. */
 	msp->ms_condensing = B_FALSE;
 	metaslab_flush_update(msp, tx);
 }
 
 /*
  * Start tracking a metaslab in the pool-wide flushing tree
  * (spa_metaslabs_by_flushed): stamp it with the syncing txg, mark it
  * dirty, insert it into the tree, and account for it in the current log
  * space map and in the log summary.
  */
 static void
 metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
 
 	/* The flushing tree is protected by spa_flushed_ms_lock. */
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	uint64_t syncing_txg = spa_syncing_txg(spa);
 	metaslab_set_unflushed_txg(msp, syncing_txg, tx);
 	metaslab_set_unflushed_dirty(msp, B_TRUE);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* Pool-wide accounting for the newly tracked metaslab. */
 	spa_log_sm_increment_current_mscount(spa);
 	spa_log_summary_add_flushed_metaslab(spa, B_TRUE);
 }
 
 /*
  * Move an already-tracked metaslab to the syncing txg in the pool-wide
  * flushing tree, set its dirty state to the given value, and update the
  * log space map counts and summary to match. Obsolete logs, if any, are
  * cleaned up afterwards.
  */
 void
 metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(spa_syncing_log_sm(spa) != NULL);
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
 	ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
 
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
 
 	/*
 	 * Remember the previous position before re-inserting: the
 	 * accounting updates below need the old txg and dirty state.
 	 */
 	uint64_t prev_txg = metaslab_unflushed_txg(msp);
 	boolean_t prev_dirty = metaslab_unflushed_dirty(msp);
 
 	/* Re-key the metaslab inside the flushing tree. */
 	mutex_enter(&spa->spa_flushed_ms_lock);
 	avl_remove(&spa->spa_metaslabs_by_flushed, msp);
 	metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
 	metaslab_set_unflushed_dirty(msp, dirty);
 	avl_add(&spa->spa_metaslabs_by_flushed, msp);
 	mutex_exit(&spa->spa_flushed_ms_lock);
 
 	/* Metaslab counts of the spa_log_sm_t nodes. */
 	spa_log_sm_decrement_mscount(spa, prev_txg);
 	spa_log_sm_increment_current_mscount(spa);
 
 	/* Log space map summary. */
 	spa_log_summary_decrement_mscount(spa, prev_txg, prev_dirty);
 	spa_log_summary_add_flushed_metaslab(spa, dirty);
 
 	/* Clean up obsolete logs, if any. */
 	spa_cleanup_old_sm_logs(spa, tx);
 }
 
 /*
  * Called once a metaslab has been flushed, i.e. its own spacemap now
  * reflects all the contents of the pool-wide spacemap log. Brings the
  * metaslab's metadata, and any related pool-wide log space map data (e.g.
  * summary, obsolete logs, etc..), up to date. The caller must hold ms_lock.
  */
 static void
 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 
 	/*
 	 * A flushed metaslab is not guaranteed to pass through
 	 * metaslab_sync_done(), so ms_synced_length is updated here in
 	 * case it does not.
 	 */
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	/*
 	 * metaslab_condense() may bring us here while the log space map
 	 * feature is not active; in that case, or when the metaslab is not
 	 * tracked as unflushed, there is nothing more to do.
 	 */
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) &&
 	    metaslab_unflushed_txg(msp) != 0)
 		metaslab_unflushed_bump(msp, tx, B_FALSE);
 }
 
 /*
  * Flush a metaslab's unflushed allocs and frees into its own space map,
  * or condense the metaslab instead when it is loaded and
  * metaslab_should_condense() says so.
  *
  * Returns B_FALSE, without writing anything, if the metaslab is currently
  * loading; returns B_TRUE once the metaslab has been condensed or flushed.
  *
  * The caller must hold ms_lock. The lock is dropped and re-acquired around
  * the space map writes (and inside metaslab_condense()).
  */
 boolean_t
 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(spa_sync_pass(spa), ==, 1);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	ASSERT(msp->ms_sm != NULL);
 	ASSERT(metaslab_unflushed_txg(msp) != 0);
 	ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
 
 	/*
 	 * There is nothing wrong with flushing the same metaslab twice, as
 	 * this codepath should work on that case. However, the current
 	 * flushing scheme makes sure to avoid this situation as we would be
 	 * making all these calls without having anything meaningful to write
 	 * to disk. We assert this behavior here.
 	 */
 	ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
 
 	/*
 	 * We can not flush while loading, because then we would
 	 * not load the ms_unflushed_{allocs,frees}.
 	 */
 	if (msp->ms_loading)
 		return (B_FALSE);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	/*
 	 * Metaslab condensing is effectively flushing. Therefore if the
 	 * metaslab can be condensed we can just condense it instead of
 	 * flushing it.
 	 *
 	 * Note that metaslab_condense() does call metaslab_flush_update()
 	 * so we can just return immediately after condensing. We also
 	 * don't need to care about setting ms_flushing or broadcasting
 	 * ms_flush_cv, even if we temporarily drop the ms_lock in
 	 * metaslab_condense(), as the metaslab is already loaded.
 	 */
 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
 		metaslab_group_t *mg = msp->ms_group;
 
 		/*
 		 * For all histogram operations below refer to the
 		 * comments of metaslab_sync() where we follow a
 		 * similar procedure.
 		 */
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 		metaslab_group_histogram_remove(mg, msp);
 
 		metaslab_condense(msp, tx);
 
 		/*
 		 * Rebuild the space map histogram from ms_allocatable and
 		 * the ms_defer trees; ms_freed is asserted to be empty.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 		ASSERT(zfs_range_tree_is_empty(msp->ms_freed));
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 		metaslab_aux_histograms_update(msp);
 
 		metaslab_group_histogram_add(mg, msp);
 		metaslab_group_histogram_verify(mg);
 		metaslab_class_histogram_verify(mg->mg_class);
 
 		metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 
 		/*
 		 * Since we recreated the histogram (and potentially
 		 * the ms_sm too while condensing) ensure that the
 		 * weight is updated too because we are not guaranteed
 		 * that this metaslab is dirty and will go through
 		 * metaslab_sync_done().
 		 */
 		metaslab_recalculate_weight_and_sort(msp);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Regular flush. ms_flushing stays set for the whole window in
 	 * which ms_lock is dropped; waiters on ms_flush_cv are woken at
 	 * the end of this function.
 	 */
 	msp->ms_flushing = B_TRUE;
 	uint64_t sm_len_before = space_map_length(msp->ms_sm);
 
 	/* Append the unflushed allocs, then the unflushed frees. */
 	mutex_exit(&msp->ms_lock);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
 	    SM_NO_VDEVID, tx);
 	space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
 	    SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 
 	uint64_t sm_len_after = space_map_length(msp->ms_sm);
 	if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
 		zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
 		    "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
 		    "appended %llu bytes", (u_longlong_t)dmu_tx_get_txg(tx),
 		    spa_name(spa),
 		    (u_longlong_t)msp->ms_group->mg_vd->vdev_id,
 		    (u_longlong_t)msp->ms_id,
 		    (u_longlong_t)zfs_range_tree_space(
 		    msp->ms_unflushed_allocs),
 		    (u_longlong_t)zfs_range_tree_space(
 		    msp->ms_unflushed_frees),
 		    (u_longlong_t)(sm_len_after - sm_len_before));
 	}
 
 	/*
 	 * The unflushed trees are on disk now: subtract their memory
 	 * usage from the pool-wide accounting and empty them.
 	 */
 	ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 	    metaslab_unflushed_changes_memused(msp));
 	spa->spa_unflushed_stats.sus_memused -=
 	    metaslab_unflushed_changes_memused(msp);
 	zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
 	zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	metaslab_flush_update(msp, tx);
 
 	metaslab_verify_space(msp, dmu_tx_get_txg(tx));
 	metaslab_verify_weight_and_frag(msp);
 
 	msp->ms_flushing = B_FALSE;
 	cv_broadcast(&msp->ms_flush_cv);
 	return (B_TRUE);
 }
 
 /*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 	dmu_tx_t *tx;
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
 	if (msp->ms_new) {
 		ASSERT0(zfs_range_tree_space(alloctree));
 		ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 		ASSERT0(zfs_range_tree_space(msp->ms_freed));
 		ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 		ASSERT0(zfs_range_tree_space(msp->ms_trim));
 		return;
 	}
 
 	/*
 	 * Normally, we don't want to process a metaslab if there are no
 	 * allocations or frees to perform. However, if the metaslab is being
 	 * forced to condense, it's loaded and we're not beyond the final
 	 * dirty txg, we need to let it through. Not condensing beyond the
 	 * final dirty txg prevents an issue where metaslabs that need to be
 	 * condensed but were loaded for other reasons could cause a panic
 	 * here. By only checking the txg in that branch of the conditional,
 	 * we preserve the utility of the VERIFY statements in all other
 	 * cases.
 	 */
 	if (zfs_range_tree_is_empty(alloctree) &&
 	    zfs_range_tree_is_empty(msp->ms_freeing) &&
 	    zfs_range_tree_is_empty(msp->ms_checkpointing) &&
 	    !(msp->ms_loaded && msp->ms_condense_wanted &&
 	    txg <= spa_final_dirty_txg(spa)))
 		return;
 
 
 	VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
 
 	/*
 	 * The only state that can actually be changing concurrently
 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
 	 * other thread can be modifying this txg's alloc, freeing,
 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
 	 * could call into the DMU, because the DMU can call down to
 	 * us (e.g. via zio_free()) at any time.
 	 *
 	 * The spa_vdev_remove_thread() can be reading metaslab state
 	 * concurrently, and it is locked out by the ms_sync_lock.
 	 * Note that the ms_lock is insufficient for this, because it
 	 * is dropped by space_map_write().
 	 */
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	/*
 	 * Generate a log space map if one doesn't exist already.
 	 */
 	spa_generate_syncing_log_sm(spa, tx);
 
 	if (msp->ms_sm == NULL) {
 		uint64_t new_object = space_map_alloc(mos,
 		    spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
 		    zfs_metaslab_sm_blksz_with_log :
 		    zfs_metaslab_sm_blksz_no_log, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &new_object, tx);
 
 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
 		ASSERT(msp->ms_sm != NULL);
 
 		ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs));
 		ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees));
 		ASSERT0(metaslab_allocated_space(msp));
 	}
 
 	if (!zfs_range_tree_is_empty(msp->ms_checkpointing) &&
 	    vd->vdev_checkpoint_sm == NULL) {
 		ASSERT(spa_has_checkpoint(spa));
 
 		uint64_t new_object = space_map_alloc(mos,
 		    zfs_vdev_standard_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * We save the space map object as an entry in vdev_top_zap
 		 * so it can be retrieved when the pool is reopened after an
 		 * export or through zdb.
 		 */
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (new_object), 1, &new_object, tx));
 	}
 
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Note: metaslab_condense() clears the space map's histogram.
 	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 	metaslab_group_histogram_remove(mg, msp);
 
 	if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
 	    metaslab_should_condense(msp))
 		metaslab_condense(msp, tx);
 
 	/*
 	 * We'll be going to disk to sync our space accounting, thus we
 	 * drop the ms_lock during that time so allocations coming from
 	 * open-context (ZIL) for future TXGs do not block.
 	 */
 	mutex_exit(&msp->ms_lock);
 	space_map_t *log_sm = spa_syncing_log_sm(spa);
 	if (log_sm != NULL) {
 		ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 		if (metaslab_unflushed_txg(msp) == 0)
 			metaslab_unflushed_add(msp, tx);
 		else if (!metaslab_unflushed_dirty(msp))
 			metaslab_unflushed_bump(msp, tx, B_TRUE);
 
 		space_map_write(log_sm, alloctree, SM_ALLOC,
 		    vd->vdev_id, tx);
 		space_map_write(log_sm, msp->ms_freeing, SM_FREE,
 		    vd->vdev_id, tx);
 		mutex_enter(&msp->ms_lock);
 
 		ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
 		    metaslab_unflushed_changes_memused(msp));
 		spa->spa_unflushed_stats.sus_memused -=
 		    metaslab_unflushed_changes_memused(msp);
 		zfs_range_tree_remove_xor_add(alloctree,
 		    msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
 		zfs_range_tree_remove_xor_add(msp->ms_freeing,
 		    msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
 		spa->spa_unflushed_stats.sus_memused +=
 		    metaslab_unflushed_changes_memused(msp);
 	} else {
 		ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
 		    SM_NO_VDEVID, tx);
 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
 		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
 	msp->ms_allocated_space += zfs_range_tree_space(alloctree);
 	ASSERT3U(msp->ms_allocated_space, >=,
 	    zfs_range_tree_space(msp->ms_freeing));
 	msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing);
 
 	if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) {
 		ASSERT(spa_has_checkpoint(spa));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * Since we are doing writes to disk and the ms_checkpointing
 		 * tree won't be changing during that time, we drop the
 		 * ms_lock while writing to the checkpoint space map, for the
 		 * same reason mentioned above.
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 
 		spa->spa_checkpoint_info.sci_dspace +=
 		    zfs_range_tree_space(msp->ms_checkpointing);
 		vd->vdev_stat.vs_checkpoint_space +=
 		    zfs_range_tree_space(msp->ms_checkpointing);
 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
 		    -space_map_allocated(vd->vdev_checkpoint_sm));
 
 		zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
 	}
 
 	if (msp->ms_loaded) {
 		/*
 		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 * to bring the space map's histogram up-to-date so we clear
 		 * it first before updating it.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
 		/*
 		 * Since we've cleared the histogram we need to add back
 		 * any free space that has already been processed, plus
 		 * any deferred space. This allows the on-disk histogram
 		 * to accurately reflect all free space even if some space
 		 * is not yet available for allocation (i.e. deferred).
 		 */
 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
 		/*
 		 * Add back any deferred free space that has not been
 		 * added back into the in-core free tree yet. This will
 		 * ensure that we don't end up with a space map histogram
 		 * that is completely empty unless the metaslab is fully
 		 * allocated.
 		 */
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 	}
 
 	/*
 	 * Always add the free space from this sync pass to the space
 	 * map histogram. We want to make sure that the on-disk histogram
 	 * accounts for all free space. If the space map is not loaded,
 	 * then we will lose some accuracy but will correct it the next
 	 * time we load the space map.
 	 */
 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 	metaslab_aux_histograms_update(msp);
 
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
 	 * For sync pass 1, we avoid traversing this txg's free range tree
 	 * and instead will just swap the pointers for freeing and freed.
 	 * We can safely do this since the freed_tree is guaranteed to be
 	 * empty on the initial pass.
 	 *
 	 * Keep in mind that even if we are currently using a log spacemap
 	 * we want current frees to end up in the ms_allocatable (but not
 	 * get appended to the ms_sm) so their ranges can be reused as usual.
 	 */
 	if (spa_sync_pass(spa) == 1) {
 		zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
 		ASSERT0(msp->ms_allocated_this_txg);
 	} else {
 		zfs_range_tree_vacate(msp->ms_freeing,
 		    zfs_range_tree_add, msp->ms_freed);
 	}
 	msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree);
 	zfs_range_tree_vacate(alloctree, NULL, NULL);
 
 	ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
 	    & TXG_MASK]));
 	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 
 	mutex_exit(&msp->ms_lock);
 
 	/*
 	 * Verify that the space map object ID has been recorded in the
 	 * vdev_ms_array.
 	 */
 	uint64_t object;
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array,
 	    msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
 	VERIFY3U(object, ==, space_map_object(msp->ms_sm));
 
 	mutex_exit(&msp->ms_sync_lock);
 	dmu_tx_commit(tx);
 }
 
 static void
 metaslab_evict(metaslab_t *msp, uint64_t txg)
 {
 	if (!msp->ms_loaded || msp->ms_disabled != 0)
 		return;
 
 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 		VERIFY0(zfs_range_tree_space(
 		    msp->ms_allocating[(txg + t) & TXG_MASK]));
 	}
 	if (msp->ms_allocator != -1)
 		metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
 
 	if (!metaslab_debug_unload)
 		metaslab_unload(msp);
 }
 
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  */
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	zfs_range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 	boolean_t defer_allowed = B_TRUE;
 
 	ASSERT(!vd->vdev_ishole);
 
 	mutex_enter(&msp->ms_lock);
 
 	if (msp->ms_new) {
 		/* this is a new metaslab, add its capacity to the vdev */
 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 
 		/* there should be no allocations nor frees at this point */
 		VERIFY0(msp->ms_allocated_this_txg);
 		VERIFY0(zfs_range_tree_space(msp->ms_freed));
 	}
 
 	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 
 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 	    metaslab_class_get_alloc(spa_normal_class(spa));
 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing ||
 	    vd->vdev_rz_expanding) {
 		defer_allowed = B_FALSE;
 	}
 
 	defer_delta = 0;
 	alloc_delta = msp->ms_allocated_this_txg -
 	    zfs_range_tree_space(msp->ms_freed);
 
 	if (defer_allowed) {
 		defer_delta = zfs_range_tree_space(msp->ms_freed) -
 		    zfs_range_tree_space(*defer_tree);
 	} else {
 		defer_delta -= zfs_range_tree_space(*defer_tree);
 	}
 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 	    defer_delta, 0);
 
 	if (spa_syncing_log_sm(spa) == NULL) {
 		/*
 		 * If there's a metaslab_load() in progress and we don't have
 		 * a log space map, it means that we probably wrote to the
 		 * metaslab's space map. If this is the case, we need to
 		 * make sure that we wait for the load to complete so that we
 		 * have a consistent view at the in-core side of the metaslab.
 		 */
 		metaslab_load_wait(msp);
 	} else {
 		ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 	}
 
 	/*
 	 * When auto-trimming is enabled, free ranges which are added to
 	 * ms_allocatable are also be added to ms_trim.  The ms_trim tree is
 	 * periodically consumed by the vdev_autotrim_thread() which issues
 	 * trims for all ranges and then vacates the tree.  The ms_trim tree
 	 * can be discarded at any time with the sole consequence of recent
 	 * frees not being trimmed.
 	 */
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
 		zfs_range_tree_walk(*defer_tree, zfs_range_tree_add,
 		    msp->ms_trim);
 		if (!defer_allowed) {
 			zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add,
 			    msp->ms_trim);
 		}
 	} else {
 		zfs_range_tree_vacate(msp->ms_trim, NULL, NULL);
 	}
 
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and
 	 * the defer_tree -- this is safe to do because we've
 	 * just emptied out the defer_tree.
 	 */
 	zfs_range_tree_vacate(*defer_tree,
 	    msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable);
 	if (defer_allowed) {
 		zfs_range_tree_swap(&msp->ms_freed, defer_tree);
 	} else {
 		zfs_range_tree_vacate(msp->ms_freed,
 		    msp->ms_loaded ? zfs_range_tree_add : NULL,
 		    msp->ms_allocatable);
 	}
 
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
 		 * are back in circulation.
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 	metaslab_aux_histograms_update_done(msp, defer_allowed);
 
 	if (msp->ms_new) {
 		msp->ms_new = B_FALSE;
 		mutex_enter(&mg->mg_lock);
 		mg->mg_ms_ready++;
 		mutex_exit(&mg->mg_lock);
 	}
 
 	/*
 	 * Re-sort metaslab within its group now that we've adjusted
 	 * its allocatable space.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
 	ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(zfs_range_tree_space(msp->ms_freeing));
 	ASSERT0(zfs_range_tree_space(msp->ms_freed));
 	ASSERT0(zfs_range_tree_space(msp->ms_checkpointing));
 	msp->ms_allocating_total -= msp->ms_allocated_this_txg;
 	msp->ms_allocated_this_txg = 0;
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_class->mc_spa;
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 	metaslab_group_alloc_update(mg);
 
 	/*
 	 * Preload the next potential metaslabs but only on active
 	 * metaslab groups. We can get into a state where the metaslab
 	 * is no longer active since we dirty metaslabs as we remove a
 	 * a device, thus potentially making the metaslab group eligible
 	 * for preloading.
 	 */
 	if (mg->mg_activation_count > 0) {
 		metaslab_group_preload(mg);
 	}
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 }
 
 /*
  * When writing a ditto block (i.e. more than one DVA for a given BP) on
  * the same vdev as an existing DVA of this BP, then try to allocate it
  * on a different metaslab than existing DVAs (i.e. a unique metaslab).
  */
 static boolean_t
 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 {
 	uint64_t dva_ms_id;
 
 	if (DVA_GET_ASIZE(dva) == 0)
 		return (B_TRUE);
 
 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (B_TRUE);
 
 	dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
 
 	return (msp->ms_id != dva_ms_id);
 }
 
 /*
  * ==========================================================================
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
 
 /*
  * Add an allocation trace element to the allocation tracing list.
  */
 static void
 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
     int allocator)
 {
 	metaslab_alloc_trace_t *mat;
 
 	if (!metaslab_trace_enabled)
 		return;
 
 	/*
 	 * When the tracing list reaches its maximum we remove
 	 * the second element in the list before adding a new one.
 	 * By removing the second element we preserve the original
 	 * entry as a clue to what allocations steps have already been
 	 * performed.
 	 */
 	if (zal->zal_size == metaslab_trace_max_entries) {
 		metaslab_alloc_trace_t *mat_next;
 #ifdef ZFS_DEBUG
 		panic("too many entries in allocation list");
 #endif
 		METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
 		zal->zal_size--;
 		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
 		list_remove(&zal->zal_list, mat_next);
 		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
 	}
 
 	mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
 	list_link_init(&mat->mat_list_node);
 	mat->mat_mg = mg;
 	mat->mat_msp = msp;
 	mat->mat_size = psize;
 	mat->mat_dva_id = dva_id;
 	mat->mat_offset = offset;
 	mat->mat_weight = 0;
 	mat->mat_allocator = allocator;
 
 	if (msp != NULL)
 		mat->mat_weight = msp->ms_weight;
 
 	/*
 	 * The list is part of the zio so locking is not required. Only
 	 * a single thread will perform allocations for a given zio.
 	 */
 	list_insert_tail(&zal->zal_list, mat);
 	zal->zal_size++;
 
 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 }
 
+void
+metaslab_trace_move(zio_alloc_list_t *old, zio_alloc_list_t *new)
+{
+	ASSERT0(new->zal_size);		/* destination must start empty */
+	list_move_tail(&new->zal_list, &old->zal_list);	/* old -> new */
+	new->zal_size = old->zal_size;	/* entry count follows the list */
+	list_destroy(&old->zal_list);	/* NOTE: old->zal_size stays stale */
+}
+
 void
 metaslab_trace_init(zio_alloc_list_t *zal)
 {
 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
 	zal->zal_size = 0;
 }
 
 void
 metaslab_trace_fini(zio_alloc_list_t *zal)
 {
 	metaslab_alloc_trace_t *mat;
 
 	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
 		kmem_cache_free(metaslab_alloc_trace_cache, mat);
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
 
 /*
  * ==========================================================================
  * Metaslab block operations
  * ==========================================================================
  */
 
 static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, int allocator,
     int flags, uint64_t psize, const void *tag)
 {
-	if (!(flags & METASLAB_ASYNC_ALLOC))
+	if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 	if (!mg->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	(void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag);
 }
 
+void
+metaslab_group_alloc_increment_all(spa_t *spa, blkptr_t *bp, int allocator,
+    int flags, uint64_t psize, const void *tag)
+{
+	/* Apply the per-vdev increment once for each DVA present in bp. */
+	dva_t *dva = bp->blk_dva;
+	for (int i = 0; i < BP_GET_NDVAS(bp); i++, dva++)
+		metaslab_group_alloc_increment(spa, DVA_GET_VDEV(dva),
+		    allocator, flags, psize, tag);
+}
+
 void
 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, int allocator,
     int flags, uint64_t psize, const void *tag)
 {
-	if (!(flags & METASLAB_ASYNC_ALLOC))
+	if (!(flags & METASLAB_ASYNC_ALLOC) || tag == NULL)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 	if (!mg->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 	(void) zfs_refcount_remove_many(&mga->mga_queue_depth, psize, tag);
 }
 
 static uint64_t
-metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t max_size,
+    uint64_t txg, uint64_t *actual_size)	/* out: read back on success */
 {
 	uint64_t start;
 	zfs_range_tree_t *rt = msp->ms_allocatable;
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(msp->ms_disabled);
 	VERIFY0(msp->ms_new);
 
-	start = mc->mc_ops->msop_alloc(msp, size);
+	start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size);
 	if (start != -1ULL) {
+		size = *actual_size;	/* adopt the reported actual size */
 		metaslab_group_t *mg = msp->ms_group;
 		vdev_t *vd = mg->mg_vd;
 
 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size);
 		zfs_range_tree_remove(rt, start, size);
 		zfs_range_tree_clear(msp->ms_trim, start, size);
 
 		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
 		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start,
 		    size);
 		msp->ms_allocating_total += size;
 
 		/* Track the last successful allocation */
 		msp->ms_alloc_txg = txg;
 		metaslab_verify_space(msp, txg);
 	}
 
 	/*
 	 * Now that we've attempted the allocation we need to update the
 	 * metaslab's maximum block size since it may have changed.
 	 */
 	msp->ms_max_size = metaslab_largest_allocatable(msp);
 	return (start);
 }
 
 /*
  * Find the metaslab with the highest weight that is less than what we've
  * already tried.  In the common case, this means that we will examine each
  * metaslab at most once. Note that concurrent callers could reorder metaslabs
  * by activation/passivation once we have dropped the mg_lock. If a metaslab is
  * activated by another thread, and we fail to allocate from the metaslab we
  * have selected, we may not try the newly-activated metaslab, and instead
  * activate another metaslab.  This is not optimal, but generally does not cause
  * any problems (a possible exception being if every metaslab is completely full
  * except for the newly-activated metaslab which we fail to examine).
  */
 static metaslab_t *
 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
     dva_t *dva, int d, uint64_t asize, int allocator,
     boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
     boolean_t *was_active)
 {
 	avl_index_t idx;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	metaslab_t *msp = avl_find(t, search, &idx);
 	if (msp == NULL)
 		msp = avl_nearest(t, idx, AVL_AFTER);
 
 	uint_t tries = 0;
 	for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
 		int i;
 
 		if (!try_hard && tries > zfs_metaslab_find_max_tries) {
 			METASLABSTAT_BUMP(metaslabstat_too_many_tries);
 			return (NULL);
 		}
 		tries++;
 
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			continue;
 		}
 
 		/*
 		 * If the selected metaslab is condensing or disabled, or
 		 * hasn't gone through a metaslab_sync_done(), then skip it.
 		 */
 		if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new)
 			continue;
 
 		*was_active = msp->ms_allocator != -1;
 		/*
 		 * If we're activating as primary, this is our first allocation
 		 * from this disk, so we don't need to check how close we are.
 		 * If the metaslab under consideration was already active,
 		 * we're getting desperate enough to steal another allocator's
 		 * metaslab, so we still don't care about distances.
 		 */
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
 			break;
 
 		if (!try_hard) {
 			for (i = 0; i < d; i++) {
 				if (!metaslab_is_unique(msp, &dva[i]))
 					break;  /* try another metaslab */
 			}
 			if (i == d)
 				break;
 		}
 	}
 
 	if (msp != NULL) {
 		search->ms_weight = msp->ms_weight;
 		search->ms_start = msp->ms_start + 1;
 		search->ms_allocator = msp->ms_allocator;
 		search->ms_primary = msp->ms_primary;
 	}
 	return (msp);
 }
 
 static void
 metaslab_active_mask_verify(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
 		return;
 
 	if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(msp->ms_primary);
 		return;
 	}
 
 	if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
 		VERIFY3S(msp->ms_allocator, !=, -1);
 		VERIFY(!msp->ms_primary);
 		return;
 	}
 
 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		VERIFY3S(msp->ms_allocator, ==, -1);
 		return;
 	}
 }
 
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, dva_t *dva, int d, int allocator,
-    boolean_t try_hard)
+    uint64_t asize, uint64_t max_asize, uint64_t txg,
+    dva_t *dva, int d, int allocator, boolean_t try_hard,
+    uint64_t *actual_asize)	/* out: read back on success only */
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 
 	uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
 	for (int i = 0; i < d; i++) {
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_CLAIM;
 			break;
 		}
 	}
 
 	/*
 	 * If we don't have enough metaslabs active, we just use the 0th slot.
 	 */
 	if (allocator >= mg->mg_ms_ready / 3)
 		allocator = 0;
 	metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
 
 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
 
 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
 	search->ms_weight = UINT64_MAX;
 	search->ms_start = 0;
 	/*
 	 * At the end of the metaslab tree are the already-active metaslabs,
 	 * first the primaries, then the secondaries. When we resume searching
 	 * through the tree, we need to consider ms_allocator and ms_primary so
 	 * we start in the location right after where we left off, and don't
 	 * accidentally loop forever considering the same metaslabs.
 	 */
 	search->ms_allocator = -1;
 	search->ms_primary = B_TRUE;
 	for (;;) {
 		boolean_t was_active = B_FALSE;
 
 		mutex_enter(&mg->mg_lock);
 
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    mga->mga_primary != NULL) {
 			msp = mga->mga_primary;
 
 			/*
 			 * Even though we don't hold the ms_lock for the
 			 * primary metaslab, those fields should not
 			 * change while we hold the mg_lock. Thus it is
 			 * safe to make assertions on them.
 			 */
 			ASSERT(msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    mga->mga_secondary != NULL) {
 			msp = mga->mga_secondary;
 
 			/*
 			 * See comment above about the similar assertions
 			 * for the primary metaslab.
 			 */
 			ASSERT(!msp->ms_primary);
 			ASSERT3S(msp->ms_allocator, ==, allocator);
 			ASSERT(msp->ms_loaded);
 
 			was_active = B_TRUE;
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 		} else {
 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
 			    asize, allocator, try_hard, zal, search,
 			    &was_active);
 		}
 
 		mutex_exit(&mg->mg_lock);
 		if (msp == NULL)
 			break;
 		mutex_enter(&msp->ms_lock);
 
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * This code is disabled out because of issues with
 		 * tracepoints in non-gpl kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE3(ms__activation__attempt,
 		    metaslab_t *, msp, uint64_t, activation_weight,
 		    boolean_t, was_active);
 #endif
 
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock. We check the
 		 * active status first to see if we need to set_selected_txg
 		 * a new metaslab.
 		 */
 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If the metaslab was activated for another allocator
 		 * while we were waiting in the ms_lock above, or it's
 		 * a primary and we're seeking a secondary (or vice versa),
 		 * we go back and select a new metaslab.
 		 */
 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    (msp->ms_allocator != -1) &&
 		    (msp->ms_allocator != allocator || ((activation_weight ==
 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
 			ASSERT(msp->ms_loaded);
 			ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
 			    msp->ms_allocator != -1);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * This metaslab was used for claiming regions allocated
 		 * by the ZIL during pool import. Once these regions are
 		 * claimed we don't need to keep the CLAIM bit set
 		 * anymore. Passivate this metaslab to zero its activation
 		 * mask.
 		 */
 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
 			ASSERT(msp->ms_loaded);
 			ASSERT3S(msp->ms_allocator, ==, -1);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_WEIGHT_CLAIM);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		metaslab_set_selected_txg(msp, txg);
 
 		int activation_error =
 		    metaslab_activate(msp, allocator, activation_weight);
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * If the metaslab was activated by another thread for
 		 * another allocator or activation_weight (EBUSY), or it
 		 * failed because another metaslab was assigned as primary
 		 * for this allocator (EEXIST) we continue using this
 		 * metaslab for our allocation, rather than going on to a
 		 * worse metaslab (we waited for that metaslab to be loaded
 		 * after all).
 		 *
 		 * If the activation failed due to an I/O error or ENOSPC we
 		 * skip to the next metaslab.
 		 */
 		boolean_t activated;
 		if (activation_error == 0) {
 			activated = B_TRUE;
 		} else if (activation_error == EBUSY ||
 		    activation_error == EEXIST) {
 			activated = B_FALSE;
 		} else {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * Now that we have the lock, recheck to see if we should
 		 * continue to use this metaslab for this allocation. The
 		 * the metaslab is now loaded so metaslab_should_allocate()
 		 * can accurately determine if the allocation attempt should
 		 * proceed.
 		 */
 		if (!metaslab_should_allocate(msp, asize, try_hard)) {
 			/* Passivate this metaslab and select a new one. */
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			goto next;
 		}
 
 		/*
 		 * If this metaslab is currently condensing then pick again
 		 * as we can't manipulate this metaslab until it's committed
 		 * to disk. If this metaslab is being initialized, we shouldn't
 		 * allocate from it since the allocated region might be
 		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_CONDENSING, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		} else if (msp->ms_disabled > 0) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_DISABLED, allocator);
 			if (activated) {
 				metaslab_passivate(msp, msp->ms_weight &
 				    ~METASLAB_ACTIVE_MASK);
 			}
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		offset = metaslab_block_alloc(msp, asize, txg);
-		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
+		offset = metaslab_block_alloc(msp, asize, max_asize, txg,
+		    actual_asize);
 
 		if (offset != -1ULL) {
+			metaslab_trace_add(zal, mg, msp, *actual_asize, d,
+			    offset, allocator);	/* trace the size obtained */
 			/* Proactively passivate the metaslab, if needed */
 			if (activated)
 				metaslab_segment_may_passivate(msp);
 			mutex_exit(&msp->ms_lock);
 			break;
 		}
+		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 next:
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * This code is disabled out because of issues with
 		 * tracepoints in non-gpl kernel modules.
 		 */
 #if 0
 		DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
 		    uint64_t, asize);
 #endif
 
 		/*
 		 * We were unable to allocate from this metaslab so determine
 		 * a new weight for this metaslab. Now that we have loaded
 		 * the metaslab we can provide a better hint to the metaslab
 		 * selector.
 		 *
 		 * For space-based metaslabs, we use the maximum block size.
 		 * This information is only available when the metaslab
 		 * is loaded and is more accurate than the generic free
 		 * space weight that was calculated by metaslab_weight().
 		 * This information allows us to quickly compare the maximum
 		 * available allocation in the metaslab to the allocation
 		 * size being requested.
 		 *
 		 * For segment-based metaslabs, determine the new weight
 		 * based on the highest bucket in the range tree. We
 		 * explicitly use the loaded segment weight (i.e. the range
 		 * tree histogram) since it contains the space that is
 		 * currently available for allocation and is accurate
 		 * even within a sync pass.
 		 */
 		uint64_t weight;
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 			weight = metaslab_largest_allocatable(msp);
 			WEIGHT_SET_SPACEBASED(weight);
 		} else {
 			weight = metaslab_weight_from_range_tree(msp);
 		}
 
 		if (activated) {
 			metaslab_passivate(msp, weight);
 		} else {
 			/*
 			 * For the case where we use the metaslab that is
 			 * active for another allocator we want to make
 			 * sure that we retain the activation mask.
 			 *
 			 * Note that we could attempt to use something like
 			 * metaslab_recalculate_weight_and_sort() that
 			 * retains the activation mask here. That function
 			 * uses metaslab_weight() to set the weight though
 			 * which is not as accurate as the calculations
 			 * above.
 			 */
 			weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
 			metaslab_group_sort(mg, msp, weight);
 		}
 		metaslab_active_mask_verify(msp);
 
 		/*
 		 * We have just failed an allocation attempt, check
 		 * that metaslab_should_allocate() agrees. Otherwise,
 		 * we may end up in an infinite loop retrying the same
 		 * metaslab.
 		 */
 		ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
 
 		mutex_exit(&msp->ms_lock);
 	}
 	kmem_free(search, sizeof (*search));
 
 	if (offset == -1ULL) {
 		metaslab_trace_add(zal, mg, NULL, asize, d,
 		    TRACE_GROUP_FAILURE, allocator);
 		if (asize <= vdev_get_min_alloc(mg->mg_vd)) {
 			/*
 			 * This metaslab group was unable to allocate
 			 * the minimum block size so it must be out of
 			 * space.  Notify the allocation throttle to
 			 * skip allocation attempts to this group until
 			 * more space becomes available.
 			 */
 			mg->mg_no_free_space = B_TRUE;
 		}
 	}
 	return (offset);
 }
 
 static boolean_t
 metaslab_group_allocatable(spa_t *spa, metaslab_group_t *mg, uint64_t psize,
     int d, int flags, boolean_t try_hard, zio_alloc_list_t *zal, int allocator)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_t *vd = mg->mg_vd;
 	boolean_t allocatable;
 
 	/*
 	 * Don't allocate from faulted devices.
 	 */
 	if (try_hard)
 		spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 	allocatable = vdev_allocatable(vd);
 	if (try_hard)
 		spa_config_exit(spa, SCL_ZIO, FTAG);
 	if (!allocatable) {
 		metaslab_trace_add(zal, mg, NULL, psize, d,
 		    TRACE_NOT_ALLOCATABLE, allocator);
 		return (B_FALSE);
 	}
 
 	if (!try_hard) {
 		/*
 		 * Avoid vdevs with too little space or too fragmented.
 		 */
 		if (!GANG_ALLOCATION(flags) && (mg->mg_no_free_space ||
 		    (!mg->mg_allocatable && mc->mc_alloc_groups > 0))) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_NOT_ALLOCATABLE, allocator);
 			return (B_FALSE);
 		}
 
 		/*
 		 * Avoid writing single-copy data to an unhealthy,
 		 * non-redundant vdev.
 		 */
 		if (d == 0 && vd->vdev_state < VDEV_STATE_HEALTHY &&
 		    vd->vdev_children == 0) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_VDEV_ERROR, allocator);
 			return (B_FALSE);
 		}
 	}
 
 	return (B_TRUE);
 }
 
-/*
- * Allocate a block for the specified i/o.
- */
-int
-metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
-    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
-    zio_alloc_list_t *zal, int allocator)
+static int
+metaslab_alloc_dva_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+    uint64_t max_psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg,
+    int flags, zio_alloc_list_t *zal, int allocator, uint64_t *actual_psize)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 	metaslab_group_t *mg = NULL, *rotor;
 	vdev_t *vd;
 	boolean_t try_hard = B_FALSE;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 * This will result in more split blocks when using device removal,
 	 * and a large number of split blocks coupled with ztest-induced
 	 * damage can result in extremely long reconstruction times.  This
 	 * will also test spilling from special to normal.
 	 */
 	if (psize >= metaslab_force_ganging &&
 	    metaslab_force_ganging_pct > 0 &&
 	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
 	}
+	if (max_psize > psize && max_psize >= metaslab_force_ganging &&
+	    metaslab_force_ganging_pct > 0 &&
+	    (random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
+		max_psize = MAX((psize + max_psize) / 2,
+		    metaslab_force_ganging);
+	}
+	ASSERT3U(psize, <=, max_psize);
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mca_rotor or mca_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
 	 * If we are doing ditto or log blocks, try to spread them across
 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
 	 * allocated all of our ditto blocks, then try and spread them out on
 	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
 	 * able to reason about.  Otherwise, any two top-level vdev failures
 	 * will guarantee the loss of data.  With consecutive allocation,
 	 * only two adjacent top-level vdev failures will result in data loss.
 	 *
 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
 	 * ourselves on the same vdev as our gang block header.  It makes our
 	 * fault domains something tractable.
 	 */
 	if (hintdva && DVA_IS_VALID(&hintdva[d])) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 		mg = vdev_get_mg(vd, mc);
 	}
 	if (mg == NULL && d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vdev_get_mg(vd, mc)->mg_next;
 	}
 	if (mg == NULL || mg->mg_class != mc || mg->mg_activation_count <= 0) {
 		ASSERT(mca->mca_rotor != NULL);
 		mg = mca->mca_rotor;
 	}
 
 	rotor = mg;
 top:
 	do {
 		ASSERT(mg->mg_activation_count == 1);
 		ASSERT(mg->mg_class == mc);
 
 		if (!metaslab_group_allocatable(spa, mg, psize, d, flags,
 		    try_hard, zal, allocator))
 			goto next;
 
 		vd = mg->mg_vd;
 		uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
-		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
-		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
-		    dva, d, allocator, try_hard);
+		ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
+		uint64_t max_asize = vdev_psize_to_asize_txg(vd, max_psize,
+		    txg);
+		ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift));
+		uint64_t offset = metaslab_group_alloc(mg, zal, asize,
+		    max_asize, txg, dva, d, allocator, try_hard,
+		    &asize);
 
 		if (offset != -1ULL) {
+			if (actual_psize)
+				*actual_psize = vdev_asize_to_psize_txg(vd,
+				    asize, txg);
 			metaslab_class_rotate(mg, allocator, psize, B_TRUE);
 
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
 			DVA_SET_OFFSET(&dva[d], offset);
 			DVA_SET_GANG(&dva[d],
 			    ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
 			DVA_SET_ASIZE(&dva[d], asize);
 			return (0);
 		}
 next:
 		metaslab_class_rotate(mg, allocator, psize, B_FALSE);
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
 	 * If we haven't tried hard, perhaps do so now.
 	 */
 	if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
 	    GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
 	    psize <= spa->spa_min_alloc)) {
 		METASLABSTAT_BUMP(metaslabstat_try_hard);
 		try_hard = B_TRUE;
 		goto top;
 	}
 
 	memset(&dva[d], 0, sizeof (dva_t));
 
 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
 	return (SET_ERROR(ENOSPC));
 }
 
+/*
+ * Allocate dva[d] with a fixed size: psize is both the minimum and maximum.
+ */
+int
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+    zio_alloc_list_t *zal, int allocator)
+{
+	return (metaslab_alloc_dva_range(spa, mc, psize, psize, dva, d, hintdva,
+	    txg, flags, zal, allocator, NULL));
+}
+
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
     boolean_t checkpoint)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 	int m = offset >> vd->vdev_ms_shift;
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 	VERIFY3U(m, <, vd->vdev_ms_count);
 
 	msp = vd->vdev_ms[m];
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
 	metaslab_check_free_impl(vd, offset, asize);
 
 	mutex_enter(&msp->ms_lock);
 	if (zfs_range_tree_is_empty(msp->ms_freeing) &&
 	    zfs_range_tree_is_empty(msp->ms_checkpointing)) {
 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
 	}
 
 	if (checkpoint) {
 		ASSERT(spa_has_checkpoint(spa));
 		zfs_range_tree_add(msp->ms_checkpointing, offset, asize);
 	} else {
 		zfs_range_tree_add(msp->ms_freeing, offset, asize);
 	}
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	boolean_t *checkpoint = arg;
 
 	ASSERT3P(checkpoint, !=, NULL);
 
 	if (vd->vdev_ops->vdev_op_remap != NULL)
 		vdev_indirect_mark_obsolete(vd, offset, size);
 	else
 		metaslab_free_impl(vd, offset, size, *checkpoint);
 }
 
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
     boolean_t checkpoint)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
 		return;
 
 	if (spa->spa_vdev_removal != NULL &&
 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
 	    vdev_is_concrete(vd)) {
 		/*
 		 * Note: we check if the vdev is concrete because when
 		 * we complete the removal, we first change the vdev to be
 		 * an indirect vdev (in open context), and then (in syncing
 		 * context) clear spa_vdev_removal.
 		 */
 		free_from_removing_vdev(vd, offset, size);
 	} else if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vdev_indirect_mark_obsolete(vd, offset, size);
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_free_impl_cb, &checkpoint);
 	} else {
 		metaslab_free_concrete(vd, offset, size, checkpoint);
 	}
 }
 
 typedef struct remap_blkptr_cb_arg {
 	blkptr_t *rbca_bp;
 	spa_remap_cb_t rbca_cb;
 	vdev_t *rbca_remap_vd;
 	uint64_t rbca_remap_offset;
 	void *rbca_cb_arg;
 } remap_blkptr_cb_arg_t;
 
 static void
 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	remap_blkptr_cb_arg_t *rbca = arg;
 	blkptr_t *bp = rbca->rbca_bp;
 
 	/* We can not remap split blocks. */
 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
 		return;
 	ASSERT0(inner_offset);
 
 	if (rbca->rbca_cb != NULL) {
 		/*
 		 * At this point we know that we are not handling split
 		 * blocks and we invoke the callback on the previous
 		 * vdev which must be indirect.
 		 */
 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
 
 		/* set up remap_blkptr_cb_arg for the next call */
 		rbca->rbca_remap_vd = vd;
 		rbca->rbca_remap_offset = offset;
 	}
 
 	/*
 	 * The phys birth time is that of dva[0].  This ensures that we know
 	 * when each dva was written, so that resilver can determine which
 	 * blocks need to be scrubbed (i.e. those written during the time
 	 * the vdev was offline).  It also ensures that the key used in
 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
 	 * we didn't change the phys_birth, a lookup in the ARC for a
 	 * remapped BP could find the data that was previously stored at
 	 * this vdev + offset.
 	 */
 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
 	    DVA_GET_VDEV(&bp->blk_dva[0]));
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
 	uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
 	BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
 }
 
 /*
  * If the block pointer contains any indirect DVAs, modify them to refer to
  * concrete DVAs.  Note that this will sometimes not be possible, leaving
  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
  * segments in the mapping (i.e. it is a "split block").
  *
  * If the BP was remapped, calls the callback on the original dva (note the
  * callback can be called multiple times if the original indirect DVA refers
  * to another indirect DVA, etc).
  *
  * Returns TRUE if the BP was remapped.
  */
 boolean_t
 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
 {
 	remap_blkptr_cb_arg_t rbca;
 
 	if (!zfs_remap_blkptr_enable)
 		return (B_FALSE);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return (B_FALSE);
 
 	/*
 	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
 	 */
 	if (BP_GET_DEDUP(bp))
 		return (B_FALSE);
 
 	/*
 	 * Gang blocks can not be remapped, because
 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
 	 * the BP used to read the gang block header (GBH) being the same
 	 * as the DVA[0] that we allocated for the GBH.
 	 */
 	if (BP_IS_GANG(bp))
 		return (B_FALSE);
 
 	/*
 	 * Embedded BP's have no DVA to remap.
 	 */
 	if (BP_GET_NDVAS(bp) < 1)
 		return (B_FALSE);
 
 	/*
 	 * Cloned blocks can not be remapped since BRT depends on specific
 	 * vdev id and offset in the DVA[0] for its reference counting.
 	 */
 	if (!BP_IS_METADATA(bp) && brt_maybe_exists(spa, bp))
 		return (B_FALSE);
 
 	/*
 	 * Note: we only remap dva[0].  If we remapped other dvas, we
 	 * would no longer know what their phys birth txg is.
 	 */
 	dva_t *dva = &bp->blk_dva[0];
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	if (vd->vdev_ops->vdev_op_remap == NULL)
 		return (B_FALSE);
 
 	rbca.rbca_bp = bp;
 	rbca.rbca_cb = callback;
 	rbca.rbca_remap_vd = vd;
 	rbca.rbca_remap_offset = offset;
 	rbca.rbca_cb_arg = arg;
 
 	/*
 	 * remap_blkptr_cb() will be called in order for each level of
 	 * indirection, until a concrete vdev is reached or a split block is
 	 * encountered. old_vd and old_offset are updated within the callback
 	 * as we go from the one indirect vdev to the next one (either concrete
 	 * or indirect again) in that order.
 	 */
 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
 
 	/* Check if the DVA wasn't remapped because it is a split block */
 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Undo the allocation of a DVA which happened in the given transaction group.
  */
 void
 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	metaslab_t *msp;
 	vdev_t *vd;
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (txg > spa_freeze_txg(spa))
 		return;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 		zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
 		    (u_longlong_t)vdev, (u_longlong_t)offset,
 		    (u_longlong_t)size);
 		return;
 	}
 
 	ASSERT(!vd->vdev_removing);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
 	    offset, size);
 	msp->ms_allocating_total -= size;
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 	VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=,
 	    msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	zfs_range_tree_add(msp->ms_allocatable, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Free the block represented by the given DVA.
  */
 void
 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, vdev);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (DVA_GET_GANG(dva)) {
 		size = vdev_gang_header_asize(vd);
 	}
 
 	metaslab_free_impl(vd, offset, size, checkpoint);
 }
 
 /*
  * Reserve some allocation slots. The reservation system must be called
  * before we call into the allocator. If there aren't any available slots
  * then the I/O will be throttled until an I/O completes and its slots are
  * freed up. The function returns true if it was successful in placing
  * the reservation.
  */
 boolean_t
 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
     boolean_t must, boolean_t *more)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	if (mc->mc_alloc_io_size < zio->io_size) {
 		mc->mc_alloc_io_size = zio->io_size;
 		metaslab_class_balance(mc, B_FALSE);
 	}
 	if (must || mca->mca_reserved <= mc->mc_alloc_max) {
 		/*
 		 * The potential race between compare and add is covered by the
 		 * allocator lock in most cases, or irrelevant due to must set.
 		 * But even if we assume some other non-existing scenario, the
 		 * worst that can happen is few more I/Os get to allocation
 		 * earlier, that is not a problem.
 		 */
 		int64_t delta = slots * zio->io_size;
 		*more = (atomic_add_64_nv(&mca->mca_reserved, delta) <=
 		    mc->mc_alloc_max);
 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 		return (B_TRUE);
 	}
 	*more = B_FALSE;
 	return (B_FALSE);
 }
 
 boolean_t
 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
     zio_t *zio)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	int64_t delta = slots * zio->io_size;
 	return (atomic_add_64_nv(&mca->mca_reserved, -delta) <=
 	    mc->mc_alloc_max);
 }
 
 static int
 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
     uint64_t txg)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3P(vd->vdev_ms, !=, NULL);
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
 		if (error == EBUSY) {
 			ASSERT(msp->ms_loaded);
 			ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 			error = 0;
 		}
 	}
 
 	if (error == 0 &&
 	    !zfs_range_tree_contains(msp->ms_allocatable, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
 	}
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=,
 	    msp->ms_size);
 	zfs_range_tree_remove(msp->ms_allocatable, offset, size);
 	zfs_range_tree_clear(msp->ms_trim, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(8) */
 		metaslab_class_t *mc = msp->ms_group->mg_class;
 		multilist_sublist_t *mls =
 		    multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp);
 		if (!multilist_link_active(&msp->ms_class_txg_node)) {
 			msp->ms_selected_txg = txg;
 			multilist_sublist_insert_head(mls, msp);
 		}
 		multilist_sublist_unlock(mls);
 
 		if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK],
 		    offset, size);
 		msp->ms_allocating_total += size;
 	}
 
 	mutex_exit(&msp->ms_lock);
 
 	return (0);
 }
 
 typedef struct metaslab_claim_cb_arg_t {
 	uint64_t	mcca_txg;
 	int		mcca_error;
 } metaslab_claim_cb_arg_t;
 
 static void
 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner_offset;
 	metaslab_claim_cb_arg_t *mcca_arg = arg;
 
 	if (mcca_arg->mcca_error == 0) {
 		mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
 		    size, mcca_arg->mcca_txg);
 	}
 }
 
 int
 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
 {
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		metaslab_claim_cb_arg_t arg;
 
 		/*
 		 * Only zdb(8) can claim on indirect vdevs.  This is used
 		 * to detect leaks of mapped space (that are not accounted
 		 * for in the obsolete counts, spacemap, or bpobj).
 		 */
 		ASSERT(!spa_writeable(vd->vdev_spa));
 		arg.mcca_error = 0;
 		arg.mcca_txg = txg;
 
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_claim_impl_cb, &arg);
 
 		if (arg.mcca_error == 0) {
 			arg.mcca_error = metaslab_claim_concrete(vd,
 			    offset, size, txg);
 		}
 		return (arg.mcca_error);
 	} else {
 		return (metaslab_claim_concrete(vd, offset, size, txg));
 	}
 }
 
 /*
  * Intent log support: upon opening the pool after a crash, notify the SPA
  * of blocks that the intent log has allocated for immediate write, but
  * which are still considered free by the SPA because the last transaction
  * group didn't commit yet.
  */
 static int
 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_gang_header_asize(vd);
 
 	return (metaslab_claim_impl(vd, offset, size, txg));
 }
 
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
     zio_alloc_list_t *zal, int allocator, const void *tag)
+{
+	return (metaslab_alloc_range(spa, mc, psize, psize, bp, ndvas, txg,
+	    hintbp, flags, zal, allocator, tag, NULL));
+}
+
+int
+metaslab_alloc_range(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+    uint64_t max_psize, blkptr_t *bp, int ndvas, uint64_t txg,
+    blkptr_t *hintbp, int flags, zio_alloc_list_t *zal, int allocator,
+    const void *tag, uint64_t *actual_psize)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
 	ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
 	ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	if (mc->mc_allocator[allocator].mca_rotor == NULL) {
 		/* no vdevs in this class */
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
+	uint64_t cur_psize = 0;
+
 	for (int d = 0; d < ndvas; d++) {
-		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
-		    txg, flags, zal, allocator);
+		error = metaslab_alloc_dva_range(spa, mc, psize, max_psize,
+		    dva, d, hintdva, txg, flags, zal, allocator,
+		    actual_psize ? &cur_psize : NULL);
 		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
 				metaslab_group_alloc_decrement(spa,
 				    DVA_GET_VDEV(&dva[d]), allocator, flags,
 				    psize, tag);
 				memset(&dva[d], 0, sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
 		} else {
 			/*
 			 * Update the metaslab group's queue depth
 			 * based on the newly allocated dva.
 			 */
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), allocator, flags, psize,
 			    tag);
+			if (actual_psize)
+				max_psize = MIN(cur_psize, max_psize);
 		}
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
+	if (actual_psize)
+		*actual_psize = max_psize;
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	BP_SET_BIRTH(bp, txg, 0);
 
 	return (0);
 }
 
 void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
 
 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
 	 * the blocks that we free that are part of the checkpoint won't be
 	 * reused until the checkpoint is discarded or we revert to it.
 	 *
 	 * The checkpoint flag is passed down the metaslab_free code path
 	 * and is set whenever we want to add a block to the checkpoint's
 	 * accounting. That is, we "checkpoint" blocks that existed at the
 	 * time the checkpoint was created and are therefore referenced by
 	 * the checkpointed uberblock.
 	 *
 	 * Note that, we don't checkpoint any blocks if the current
 	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
 	if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
 		 * there is no way it was created in the current txg.
 		 */
 		ASSERT(!now);
 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
 		checkpoint = B_TRUE;
 	}
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
 	for (int d = 0; d < ndvas; d++) {
 		if (now) {
 			metaslab_unalloc_dva(spa, &dva[d], txg);
 		} else {
 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
 			metaslab_free_dva(spa, &dva[d], checkpoint);
 		}
 	}
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
 }
 
 int
 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 	int error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	if (txg != 0) {
 		/*
 		 * First do a dry run to make sure all DVAs are claimable,
 		 * so we don't have to unwind from partial failures below.
 		 */
 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
 			return (error);
 	}
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_claim_dva(spa, &dva[d], txg);
 		if (error != 0)
 			break;
 	}
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	ASSERT(error == 0 || txg == 0);
 
 	return (error);
 }
 
 static void
 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	(void) inner, (void) arg;
 
 	if (vd->vdev_ops == &vdev_indirect_ops)
 		return;
 
 	metaslab_check_free_impl(vd, offset, size);
 }
 
 static void
 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	metaslab_t *msp;
 	spa_t *spa __maybe_unused = vd->vdev_spa;
 
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_check_free_impl_cb, NULL);
 		return;
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	if (msp->ms_loaded) {
 		zfs_range_tree_verify_not_present(msp->ms_allocatable,
 		    offset, size);
 	}
 
 	/*
 	 * Check all segments that currently exist in the freeing pipeline.
 	 *
 	 * It would intuitively make sense to also check the current allocating
 	 * tree since metaslab_unalloc_dva() exists for extents that are
 	 * allocated and freed in the same sync pass within the same txg.
 	 * Unfortunately there are places (e.g. the ZIL) where we allocate a
 	 * segment but then we free part of it within the same txg
 	 * [see zil_sync()]. Thus, we don't call zfs_range_tree_verify() in the
 	 * current allocating tree.
 	 */
 	zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size);
 	zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
 	zfs_range_tree_verify_not_present(msp->ms_freed, offset, size);
 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
 		zfs_range_tree_verify_not_present(msp->ms_defer[j], offset,
 		    size);
 	zfs_range_tree_verify_not_present(msp->ms_trim, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		vdev_t *vd = vdev_lookup_top(spa, vdev);
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
 
 		if (DVA_GET_GANG(&bp->blk_dva[i]))
 			size = vdev_gang_header_asize(vd);
 
 		ASSERT3P(vd, !=, NULL);
 
 		metaslab_check_free_impl(vd, offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
 
 static void
 metaslab_group_disable_wait(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	while (mg->mg_disabled_updating) {
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 	}
 }
 
 static void
 metaslab_group_disabled_increment(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
 	ASSERT(mg->mg_disabled_updating);
 
 	while (mg->mg_ms_disabled >= max_disabled_ms) {
 		cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
 	}
 	mg->mg_ms_disabled++;
 	ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
 }
 
 /*
  * Mark the metaslab as disabled to prevent any allocations on this metaslab.
  * We must also track how many metaslabs are currently disabled within a
  * metaslab group and limit them to prevent allocation failures from
  * occurring because all metaslabs are disabled.
  */
 void
 metaslab_disable(metaslab_t *msp)
 {
 	ASSERT(!MUTEX_HELD(&msp->ms_lock));
 	metaslab_group_t *mg = msp->ms_group;
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 
 	/*
 	 * To keep an accurate count of how many threads have disabled
 	 * a specific metaslab group, we only allow one thread to mark
 	 * the metaslab group at a time. This ensures that the value of
 	 * ms_disabled will be accurate when we decide to mark a metaslab
 	 * group as disabled. To do this we force all other threads
 	 * to wait till the metaslab's mg_disabled_updating flag is no
 	 * longer set.
 	 */
 	metaslab_group_disable_wait(mg);
 	mg->mg_disabled_updating = B_TRUE;
 	if (msp->ms_disabled == 0) {
 		metaslab_group_disabled_increment(mg);
 	}
 	mutex_enter(&msp->ms_lock);
 	msp->ms_disabled++;
 	mutex_exit(&msp->ms_lock);
 
 	mg->mg_disabled_updating = B_FALSE;
 	cv_broadcast(&mg->mg_ms_disabled_cv);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 void
 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	spa_t *spa = mg->mg_vd->vdev_spa;
 
 	/*
 	 * Wait for the outstanding IO to be synced to prevent newly
 	 * allocated blocks from being overwritten.  This used by
 	 * initialize and TRIM which are modifying unallocated space.
 	 */
 	if (sync)
 		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	mutex_enter(&mg->mg_ms_disabled_lock);
 	mutex_enter(&msp->ms_lock);
 	if (--msp->ms_disabled == 0) {
 		mg->mg_ms_disabled--;
 		cv_broadcast(&mg->mg_ms_disabled_cv);
 		if (unload)
 			metaslab_unload(msp);
 	}
 	mutex_exit(&msp->ms_lock);
 	mutex_exit(&mg->mg_ms_disabled_lock);
 }
 
 void
 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty)
 {
 	ms->ms_unflushed_dirty = dirty;
 }
 
 static void
 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
 {
 	vdev_t *vd = ms->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
 
 	metaslab_unflushed_phys_t entry = {
 		.msp_unflushed_txg = metaslab_unflushed_txg(ms),
 	};
 	uint64_t entry_size = sizeof (entry);
 	uint64_t entry_offset = ms->ms_id * entry_size;
 
 	uint64_t object = 0;
 	int err = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &object);
 	if (err == ENOENT) {
 		object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
 		    SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
 		VERIFY0(zap_add(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 		    &object, tx));
 	} else {
 		VERIFY0(err);
 	}
 
 	dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
 	    &entry, tx);
 }
 
 void
 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
 {
 	ms->ms_unflushed_txg = txg;
 	metaslab_update_ondisk_flush_data(ms, tx);
 }
 
 boolean_t
 metaslab_unflushed_dirty(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_dirty);
 }
 
 uint64_t
 metaslab_unflushed_txg(metaslab_t *ms)
 {
 	return (ms->ms_unflushed_txg);
 }
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, U64, ZMOD_RW,
 	"Allocation granularity (a.k.a. stripe size)");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
 	"Load all metaslabs when pool is first opened");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
 	"Prevent metaslabs from being unloaded");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
 	"Preload potential metaslabs during reassessment");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
 	"Max number of metaslabs per group to preload");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
 	"Delay in txgs after metaslab was last used before unloading");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, UINT, ZMOD_RW,
 	"Delay in milliseconds after metaslab was last used before unloading");
 
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be free to make it "
 	"eligible for allocation");
 
 ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, UINT, ZMOD_RW,
 	"Percentage of metaslab group size that should be considered eligible "
 	"for allocations unless all metaslab groups within the metaslab class "
 	"have also crossed this threshold");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT,
 	ZMOD_RW,
 	"Use the fragmentation metric to prefer less fragmented metaslabs");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, UINT,
 	ZMOD_RW, "Fragmentation for metaslab to allow allocation");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
 	"Prefer metaslabs with lower LBAs");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
 	"Enable space-based metaslab group biasing");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, perf_bias, INT, ZMOD_RW,
 	"Enable performance-based metaslab group biasing");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
 	ZMOD_RW, "Enable segment-based metaslab selection");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
 	"Segment-based metaslab selection maximum buckets before switching");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
 	"Blocks larger than this size are sometimes forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
 	"Percentage of large blocks that will be forced to be gang blocks");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
 	"Max distance (bytes) to search forward before using size tree");
 
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
 	"When looking in size tree, use largest segment instead of exact fit");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, U64,
 	ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, UINT, ZMOD_RW,
 	"Percentage of memory that can be used to store metaslab range trees");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
 	ZMOD_RW, "Try hard to allocate before ganging");
 
 ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, UINT, ZMOD_RW,
 	"Normally only consider this many of the best metaslabs in each vdev");
 
 ZFS_MODULE_PARAM_CALL(zfs, zfs_, active_allocator,
 	param_set_active_allocator, param_get_charp, ZMOD_RW,
 	"SPA active allocator");
diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c
index 9cd0cfbcf8c2..aa2902d0b84e 100644
--- a/module/zfs/refcount.c
+++ b/module/zfs/refcount.c
@@ -1,358 +1,359 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2021 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_refcount.h>
 
 #ifdef	ZFS_DEBUG
 /*
  * Reference count tracking is disabled by default.  Its memory requirements
  * are reasonable, however as implemented it consumes a significant amount of
  * cpu time.  Until its performance is improved it should be manually enabled.
  */
 int reference_tracking_enable = B_FALSE;
 static uint_t reference_history = 3; /* tunable */
 
 static kmem_cache_t *reference_cache;
 
 /*
  * Create the kmem cache that backs the reference_t records used when
  * holder tracking is enabled.
  */
 void
 zfs_refcount_init(void)
 {
 	const size_t ref_size = sizeof (reference_t);
 
 	reference_cache = kmem_cache_create("reference_cache", ref_size, 0,
 	    NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the kmem cache created by zfs_refcount_init().
  */
 void
 zfs_refcount_fini(void)
 {
 	kmem_cache_destroy(reference_cache);
 }
 
 /*
  * AVL comparator for reference_t records: order by holder first, then by
  * reference number.  When both are equal and the first argument is a
  * search key (ref_search set) the records compare equal; otherwise the
  * tie is broken with TREE_PCMP() on the record pointers themselves.
  */
 static int
 zfs_refcount_compare(const void *x1, const void *x2)
 {
 	const reference_t *lhs = x1;
 	const reference_t *rhs = x2;
 	int order;
 
 	order = TREE_CMP(lhs->ref_holder, rhs->ref_holder);
 	if (order == 0)
 		order = TREE_CMP(lhs->ref_number, rhs->ref_number);
 
 	if (order != 0 || lhs->ref_search)
 		return (order);
 
 	return (TREE_PCMP(lhs, rhs));
 }
 
 /*
  * Initialize a refcount: zero counters, an empty tree of live holds, an
  * empty list of recently removed holds, and tracking set from the
  * reference_tracking_enable tunable.
  */
 void
 zfs_refcount_create(zfs_refcount_t *rc)
 {
 	/* Start with no holds and no removal history. */
 	rc->rc_count = 0;
 	rc->rc_removed_count = 0;
 	rc->rc_tracked = reference_tracking_enable;
 
 	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Live holds are kept in rc_tree, removed ones in rc_removed. */
 	avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t),
 	    offsetof(reference_t, ref_link.a));
 	list_create(&rc->rc_removed, sizeof (reference_t),
 	    offsetof(reference_t, ref_link.l));
 }
 
 /*
  * Initialize a refcount as zfs_refcount_create() does, then force holder
  * tracking on regardless of reference_tracking_enable.
  */
 void
 zfs_refcount_create_tracked(zfs_refcount_t *rc)
 {
 	zfs_refcount_create(rc);
 	rc->rc_tracked = B_TRUE;
 }
 
 /*
  * Initialize a refcount as zfs_refcount_create() does, then force holder
  * tracking off regardless of reference_tracking_enable.
  */
 void
 zfs_refcount_create_untracked(zfs_refcount_t *rc)
 {
 	zfs_refcount_create(rc);
 	rc->rc_tracked = B_FALSE;
 }
 
 /*
  * Tear down a refcount whose count is expected to equal 'number'
  * (asserted).  Every record still in the live tree and in the removed
  * list is returned to reference_cache before the containers and the
  * mutex are destroyed.
  */
 void
 zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
 {
 	reference_t *rec;
 	void *walk = NULL;
 
 	ASSERT3U(rc->rc_count, ==, number);
 
 	/* Free whatever is left in the tree of live holds. */
 	for (rec = avl_destroy_nodes(&rc->rc_tree, &walk); rec != NULL;
 	    rec = avl_destroy_nodes(&rc->rc_tree, &walk)) {
 		kmem_cache_free(reference_cache, rec);
 	}
 	avl_destroy(&rc->rc_tree);
 
 	/* Free the history of removed holds. */
 	for (rec = list_remove_head(&rc->rc_removed); rec != NULL;
 	    rec = list_remove_head(&rc->rc_removed)) {
 		kmem_cache_free(reference_cache, rec);
 	}
 	list_destroy(&rc->rc_removed);
 
 	mutex_destroy(&rc->rc_mtx);
 }
 
 /*
  * Tear down a refcount that is expected to hold no references; the
  * expectation is checked by zfs_refcount_destroy_many() against 0.
  */
 void
 zfs_refcount_destroy(zfs_refcount_t *rc)
 {
 	zfs_refcount_destroy_many(rc, 0);
 }
 
 /*
  * Return nonzero when the current count, as read by zfs_refcount_count(),
  * is 0.
  */
 int
 zfs_refcount_is_zero(zfs_refcount_t *rc)
 {
 	return (zfs_refcount_count(rc) == 0);
 }
 
 /*
  * Return the current count.  The value is read with atomic_load_64() and
  * without taking rc_mtx.
  */
 int64_t
 zfs_refcount_count(zfs_refcount_t *rc)
 {
 	return (atomic_load_64(&rc->rc_count));
 }
 
 /*
  * Add 'number' references on behalf of 'holder' and return the new count.
  *
  * Untracked refcounts are bumped with a single atomic add.  Tracked
  * refcounts additionally record one reference_t (holder, number) in the
  * tree of live holds, under rc_mtx.
  */
 int64_t
 zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
 	reference_t *rec;
 	int64_t count;
 
 	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&rc->rc_count, number);
 		ASSERT3U(count, >=, number);
 		return (count);
 	}
 
 	/* Build the record before taking the lock. */
 	rec = kmem_cache_alloc(reference_cache, KM_SLEEP);
 	rec->ref_search = B_FALSE;
 	rec->ref_number = number;
 	rec->ref_holder = holder;
 
 	mutex_enter(&rc->rc_mtx);
 	avl_add(&rc->rc_tree, rec);
 	rc->rc_count += number;
 	count = rc->rc_count;
 	mutex_exit(&rc->rc_mtx);
 
 	return (count);
 }
 
 /*
  * Add a single reference for 'holder'; returns the new count reported by
  * zfs_refcount_add_many().
  */
 int64_t
 zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
 {
 	return (zfs_refcount_add_many(rc, 1, holder));
 }
 
 /*
  * Add 'number' references for 'holder'.  An untracked refcount takes them
  * in one bulk add; a tracked refcount takes them one at a time, so each
  * is recorded as its own single-reference hold.
  */
 void
 zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
 	if (likely(!rc->rc_tracked)) {
 		(void) zfs_refcount_add_many(rc, number, holder);
 		return;
 	}
 
 	while (number > 0) {
 		(void) zfs_refcount_add(rc, holder);
 		number--;
 	}
 }
 
 /*
  * Drop 'number' references held by 'holder' and return the remaining
  * count.
  *
  * Untracked refcounts are decremented atomically.  For tracked refcounts
  * the live tree is searched for a record matching (holder, number); not
  * finding one is treated as fatal.  The matched record is moved onto the
  * rc_removed history list, which is capped at reference_history entries
  * (the oldest entry is freed once the cap is reached); with a cap of 0
  * the record is freed immediately.
  */
 int64_t
 zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
     const void *holder)
 {
 	reference_t *ref, s;
 	int64_t count;
 
 	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, -number);
 		ASSERT3S(count, >=, 0);
 		return (count);
 	}
 
 	/* 's' is a stack-allocated search key, never inserted in the tree. */
 	s.ref_holder = holder;
 	s.ref_number = number;
 	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
 	ASSERT3U(rc->rc_count, >=, number);
 	ref = avl_find(&rc->rc_tree, &s, NULL);
 	if (unlikely(ref == NULL)) {
-		panic("No such hold %p on refcount %llx", holder,
+		PANIC("No such hold %llx on refcount %llx",
+		    (u_longlong_t)(uintptr_t)holder,
 		    (u_longlong_t)(uintptr_t)rc);
 		return (-1);
 	}
 	avl_remove(&rc->rc_tree, ref);
 	if (reference_history > 0) {
 		/* Keep the newest removals at the head of the history. */
 		list_insert_head(&rc->rc_removed, ref);
 		if (rc->rc_removed_count >= reference_history) {
 			ref = list_remove_tail(&rc->rc_removed);
 			kmem_cache_free(reference_cache, ref);
 		} else {
 			rc->rc_removed_count++;
 		}
 	} else {
 		kmem_cache_free(reference_cache, ref);
 	}
 	rc->rc_count -= number;
 	count = rc->rc_count;
 	mutex_exit(&rc->rc_mtx);
 	return (count);
 }
 
 /*
  * Drop a single reference held by 'holder'; returns the remaining count
  * reported by zfs_refcount_remove_many().
  */
 int64_t
 zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
 {
 	return (zfs_refcount_remove_many(rc, 1, holder));
 }
 
 /*
  * Drop 'number' references held by 'holder'.  An untracked refcount drops
  * them in one bulk remove; a tracked refcount drops them one at a time,
  * each matching a single-reference hold.
  */
 void
 zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
 	if (likely(!rc->rc_tracked)) {
 		(void) zfs_refcount_remove_many(rc, number, holder);
 		return;
 	}
 
 	while (number > 0) {
 		(void) zfs_refcount_remove(rc, holder);
 		number--;
 	}
 }
 
 /*
  * Move every reference, live hold record and removed-hold history entry
  * from 'src' to 'dst', leaving 'src' with a zero count and empty
  * containers.
  *
  * The contents of 'src' are first detached into local containers under
  * src->rc_mtx and then merged into 'dst' under dst->rc_mtx, so the two
  * mutexes are never held at the same time.
  */
 void
 zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
 {
 	avl_tree_t tree;
 	list_t removed;
 	reference_t *ref;
 	void *cookie = NULL;
 	uint64_t count;
 	uint_t removed_count;
 
 	/* Temporary containers that carry src's records across the locks. */
 	avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
 	    offsetof(reference_t, ref_link.a));
 	list_create(&removed, sizeof (reference_t),
 	    offsetof(reference_t, ref_link.l));
 
 	mutex_enter(&src->rc_mtx);
 	count = src->rc_count;
 	removed_count = src->rc_removed_count;
 	src->rc_count = 0;
 	src->rc_removed_count = 0;
 	avl_swap(&tree, &src->rc_tree);
 	list_move_tail(&removed, &src->rc_removed);
 	mutex_exit(&src->rc_mtx);
 
 	mutex_enter(&dst->rc_mtx);
 	dst->rc_count += count;
 	dst->rc_removed_count += removed_count;
 	/* An empty destination tree can take the whole tree in one swap. */
 	if (avl_is_empty(&dst->rc_tree))
 		avl_swap(&dst->rc_tree, &tree);
 	else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
 		avl_add(&dst->rc_tree, ref);
 	list_move_tail(&dst->rc_removed, &removed);
 	mutex_exit(&dst->rc_mtx);
 
 	avl_destroy(&tree);
 	list_destroy(&removed);
 }
 
 /*
  * Re-tag the hold of 'number' references owned by 'current_holder' so
  * that it is owned by 'new_holder'.  Nothing is done for untracked
  * refcounts, which keep no per-holder records.  The hold is expected to
  * exist (asserted).
  */
 void
 zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
     const void *current_holder, const void *new_holder)
 {
 	reference_t key, *ref;
 
 	if (likely(!rc->rc_tracked))
 		return;
 
 	/* Stack-allocated search key for the existing hold. */
 	key.ref_search = B_TRUE;
 	key.ref_number = number;
 	key.ref_holder = current_holder;
 
 	mutex_enter(&rc->rc_mtx);
 	ref = avl_find(&rc->rc_tree, &key, NULL);
 	ASSERT(ref);
 
 	/* The holder is part of the sort key, so the tree must be told. */
 	ref->ref_holder = new_holder;
 	avl_update(&rc->rc_tree, ref);
 	mutex_exit(&rc->rc_mtx);
 }
 
 /*
  * Re-tag a single-reference hold from 'current_holder' to 'new_holder'.
  * See zfs_refcount_transfer_ownership_many() for the details.
  */
 void
 zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
     const void *new_holder)
 {
 	/*
 	 * Plain call rather than "return (expr)": a return statement with
 	 * an expression is not permitted in a function returning void.
 	 */
 	zfs_refcount_transfer_ownership_many(rc, 1, current_holder,
 	    new_holder);
 }
 
 /*
  * Report whether 'holder' has a hold on this refcount.
  *
  * With tracking enabled the answer is exact: true only if some record in
  * the live tree carries the "holder" tag.  With tracking disabled the
  * answer is conservative: true whenever the count is nonzero, since a
  * reference might be held.
  */
 boolean_t
 zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
 {
 	reference_t key, *match;
 	avl_index_t where;
 	boolean_t held;
 
 	if (likely(!rc->rc_tracked))
 		return (zfs_refcount_count(rc) > 0);
 
 	/*
 	 * Search with ref_number 0.  If there is no exact match, the
 	 * nearest record after the insertion point is the lowest-sorting
 	 * record for this holder, if it has any.
 	 */
 	key.ref_search = B_TRUE;
 	key.ref_number = 0;
 	key.ref_holder = holder;
 
 	mutex_enter(&rc->rc_mtx);
 	match = avl_find(&rc->rc_tree, &key, &where);
 	if (likely(match == NULL))
 		match = avl_nearest(&rc->rc_tree, where, AVL_AFTER);
 	held = (match != NULL && match->ref_holder == holder);
 	mutex_exit(&rc->rc_mtx);
 
 	return (held);
 }
 
 /*
  * Report whether 'holder' has no hold on this refcount.
  *
  * With tracking enabled the answer is exact: true only if no record in
  * the live tree carries the "holder" tag.  With tracking disabled the
  * answer is always true, since the reference might not be held.
  */
 boolean_t
 zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
 {
 	reference_t key, *match;
 	avl_index_t where;
 	boolean_t absent;
 
 	if (likely(!rc->rc_tracked))
 		return (B_TRUE);
 
 	mutex_enter(&rc->rc_mtx);
 
 	/*
 	 * Search with ref_number 0.  If there is no exact match, the
 	 * nearest record after the insertion point is the lowest-sorting
 	 * record for this holder, if it has any.
 	 */
 	key.ref_search = B_TRUE;
 	key.ref_number = 0;
 	key.ref_holder = holder;
 
 	match = avl_find(&rc->rc_tree, &key, &where);
 	if (likely(match == NULL))
 		match = avl_nearest(&rc->rc_tree, where, AVL_AFTER);
 	absent = (match == NULL || match->ref_holder != holder);
 	mutex_exit(&rc->rc_mtx);
 
 	return (absent);
 }
 
 EXPORT_SYMBOL(zfs_refcount_create);
 EXPORT_SYMBOL(zfs_refcount_destroy);
 EXPORT_SYMBOL(zfs_refcount_is_zero);
 EXPORT_SYMBOL(zfs_refcount_count);
 EXPORT_SYMBOL(zfs_refcount_add);
 EXPORT_SYMBOL(zfs_refcount_remove);
 EXPORT_SYMBOL(zfs_refcount_held);
 
 ZFS_MODULE_PARAM(zfs, , reference_tracking_enable, INT, ZMOD_RW,
 	"Track reference holders to refcount_t objects");
 
 ZFS_MODULE_PARAM(zfs, , reference_history, UINT, ZMOD_RW,
 	"Maximum reference holders being tracked");
 #endif	/* ZFS_DEBUG */
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 9ac9a9fe608c..4fab60336078 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1,6609 +1,6627 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2021, Klara Inc.
  * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_raidz.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zfs_ratelimit.h>
 #include "zfs_prop.h"
 
 /*
  * One metaslab from each (normal-class) vdev is used by the ZIL.  These are
  * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
  * part of the spa_embedded_log_class.  The metaslab with the most free space
  * in each vdev is selected for this purpose when the pool is opened (or a
  * vdev is added).  See vdev_metaslab_init().
  *
  * Log blocks can be allocated from the following locations.  Each one is tried
  * in order until the allocation succeeds:
  * 1. dedicated log vdevs, aka "slog" (spa_log_class)
  * 2. embedded slog metaslabs (spa_embedded_log_class)
  * 3. other metaslabs in normal vdevs (spa_normal_class)
  *
  * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
  * than this number of metaslabs in the vdev.  This ensures that we don't set
  * aside an unreasonable amount of space for the ZIL.  If set to less than
  * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
  * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
  */
 static uint_t zfs_embedded_slog_min_ms = 64;
 
 /* default target for number of metaslabs per top-level vdev */
 static uint_t zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
 static uint_t zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
 static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
 static uint_t zfs_vdev_default_ms_shift = 29;
 
 /* upper limit for metaslab size (16G) */
 static uint_t zfs_vdev_max_ms_shift = 34;
 
 int vdev_validate_skip = B_FALSE;
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
  */
 int zfs_vdev_dtl_sm_blksz = (1 << 12);
 
 /*
  * Rate limit slow IO (delay) events to this many per second.
  */
 static unsigned int zfs_slow_io_events_per_second = 20;
 
 /*
  * Rate limit deadman "hung IO" events to this many per second.
  */
 static unsigned int zfs_deadman_events_per_second = 1;
 
 /*
  * Rate limit direct write IO verify failures to this many per second.
  */
 static unsigned int zfs_dio_write_verify_events_per_second = 20;
 
 /*
  * Rate limit checksum events after this many checksum errors per second.
  */
 static unsigned int zfs_checksum_events_per_second = 20;
 
 /*
  * Ignore errors during scrub/resilver.  Allows to work around resilver
  * upon import when there are pool errors.
  */
 static int zfs_scan_ignore_errors = 0;
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
  * the end of each transaction can benefit from a higher I/O bandwidth
  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
  */
 int zfs_vdev_standard_sm_blksz = (1 << 17);
 
 /*
  * Tunable parameter for debugging or performance analysis. Setting this
  * will cause pool corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 int zfs_nocacheflush = 0;
 
 /*
  * Maximum and minimum ashift values that can be automatically set based on
  * vdev's physical ashift (disk's physical sector size).  While ASHIFT_MAX
  * is higher than the maximum value, it is intentionally limited here to not
  * excessively impact pool space efficiency.  Higher ashift values may still
  * be forced by vdev logical ashift or by user via ashift property, but won't
  * be set automatically as a performance optimization.
  */
 uint_t zfs_vdev_max_auto_ashift = 14;
 uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
 
 /*
  * VDEV checksum verification for Direct I/O writes. This is necessary for
  * Linux, because anonymous pages can not be placed under write protection
  * during Direct I/O writes.
  */
 #if !defined(__FreeBSD__)
 uint_t zfs_vdev_direct_write_verify = 1;
 #else
 uint_t zfs_vdev_direct_write_verify = 0;
 #endif
 
 /*
  * Emit a printf-style debug message tagged with the identity of 'vd'.
  * The caller's text is formatted into a 256-byte buffer (longer messages
  * are truncated by vsnprintf()).  The vdev is identified by its path when
  * it has one, otherwise by its id and guid.
  */
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 {
 	char msg[256];
 	va_list args;
 
 	va_start(args, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, args);
 	va_end(args);
 
 	if (vd->vdev_path == NULL) {
 		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vd->vdev_guid, msg);
 		return;
 	}
 
 	zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
 	    vd->vdev_path, msg);
 }
 
 /*
  * Log the vdev tree rooted at 'vd' to the debug message log, one line
  * per vdev.  Each line is indented by 'indent' spaces and children are
  * printed two columns deeper than their parent.  Hole and missing vdevs
  * get a short line (id and type only) and are not descended into.
  */
 void
 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 {
 	char state[20];
 
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
 		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
 		    (u_longlong_t)vd->vdev_id,
 		    vd->vdev_ops->vdev_op_type);
 		return;
 	}
 
 	/* Translate the state into text; unknown values print numerically. */
 	switch (vd->vdev_state) {
 	case VDEV_STATE_UNKNOWN:
 		(void) snprintf(state, sizeof (state), "unknown");
 		break;
 	case VDEV_STATE_CLOSED:
 		(void) snprintf(state, sizeof (state), "closed");
 		break;
 	case VDEV_STATE_OFFLINE:
 		(void) snprintf(state, sizeof (state), "offline");
 		break;
 	case VDEV_STATE_REMOVED:
 		(void) snprintf(state, sizeof (state), "removed");
 		break;
 	case VDEV_STATE_CANT_OPEN:
 		(void) snprintf(state, sizeof (state), "can't open");
 		break;
 	case VDEV_STATE_FAULTED:
 		(void) snprintf(state, sizeof (state), "faulted");
 		break;
 	case VDEV_STATE_DEGRADED:
 		(void) snprintf(state, sizeof (state), "degraded");
 		break;
 	case VDEV_STATE_HEALTHY:
 		(void) snprintf(state, sizeof (state), "healthy");
 		break;
 	default:
 		(void) snprintf(state, sizeof (state), "<state %u>",
 		    (uint_t)vd->vdev_state);
 	}
 
 	/*
 	 * Print the id with %llu, as the hole/missing branch above does,
 	 * so that the conversion specifier matches the argument type and
 	 * the id is not truncated to an int.
 	 */
 	zfs_dbgmsg("%*svdev %llu: %s%s, guid: %llu, path: %s, %s", indent,
 	    "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type,
 	    vd->vdev_islog ? " (log)" : "",
 	    (u_longlong_t)vd->vdev_guid,
 	    vd->vdev_path ? vd->vdev_path : "N/A", state);
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }
 
 /*
  * Virtual device management.
  */
 
 /*
  * Table of the vdev ops vectors that vdev_getops() can resolve by type
  * name.  The trailing NULL is the sentinel that ends the lookup loop.
  */
 static vdev_ops_t *const vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_draid_ops,
 	&vdev_draid_spare_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 	&vdev_disk_ops,
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	&vdev_indirect_ops,
 	NULL
 };
 
 /*
  * Map a vdev type name to its ops vector.  Returns NULL when no entry in
  * vdev_ops_table has a matching vdev_op_type.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	vdev_ops_t *const *slot;
 
 	for (slot = vdev_ops_table; *slot != NULL; slot++) {
 		if (strcmp((*slot)->vdev_op_type, type) == 0)
 			return (*slot);
 	}
 
 	return (NULL);
 }
 
 /*
  * Given a vdev and a metaslab class, find which metaslab group we're
  * interested in. All vdevs may belong to two different metaslab classes.
  * Dedicated slog devices use only the primary metaslab group, rather than a
  * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
  *
  * The log group is returned only when 'mc' is the pool's embedded log
  * class and the vdev actually has one; every other combination yields
  * the primary group.
  */
 metaslab_group_t *
 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
 {
 	boolean_t embedded_log = (mc == spa_embedded_log_class(vd->vdev_spa));
 
 	if (embedded_log && vd->vdev_log_mg != NULL)
 		return (vd->vdev_log_mg);
 
 	return (vd->vdev_mg);
 }
 
 /*
  * Default translation: the physical segment is the logical segment
  * unchanged.  'remain_rs' is not written.
  */
 void
 vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	/* Neither the vdev nor the remainder segment is consulted. */
 	(void) vd;
 	(void) remain_rs;
 
 	physical_rs->rs_end = logical_rs->rs_end;
 	physical_rs->rs_start = logical_rs->rs_start;
 }
 
 /*
  * Derive the enumerated allocation bias from string input.
  * String origin is either the per-vdev zap or zpool(8).
  * Any string other than the log, special or dedup names maps to
  * VDEV_BIAS_NONE.
  */
 static vdev_alloc_bias_t
 vdev_derive_alloc_bias(const char *bias)
 {
 	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
 		return (VDEV_BIAS_LOG);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
 		return (VDEV_BIAS_SPECIAL);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
 		return (VDEV_BIAS_DEDUP);
 
 	return (VDEV_BIAS_NONE);
 }
 
+/*
+ * Default psize function: return the MIN of asize with the psize of
+ * all children.  asize is expected to be a multiple of the top-level
+ * vdev's 1 << ashift (asserted).
+ */
+uint64_t
+vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	uint64_t smallest = asize;
+
+	ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift));
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		uint64_t child_psize = vdev_asize_to_psize_txg(
+		    vd->vdev_child[c], asize, txg);
+
+		if (child_psize < smallest)
+			smallest = child_psize;
+	}
+
+	return (smallest);
+}
+
+
 /*
  * Default asize function: return the MAX of psize (rounded up to the
  * top-level vdev's 1 << ashift) with the asize of all children.  This is
  * what's used by anything other than RAID-Z.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	const uint64_t unit = 1ULL << vd->vdev_top->vdev_ashift;
 	uint64_t largest = P2ROUNDUP(psize, unit);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		uint64_t child_asize = vdev_psize_to_asize_txg(
 		    vd->vdev_child[c], psize, txg);
 
 		if (child_asize > largest)
 			largest = child_asize;
 	}
 
 	return (largest);
 }
 
 /*
  * Default min_asize function: report the vdev's own cached
  * vdev_min_asize.  Presumably installed as the vdev_op_min_asize
  * callback that vdev_get_min_asize() invokes on a parent; confirm
  * against the ops vectors.
  */
 uint64_t
 vdev_default_min_asize(vdev_t *vd)
 {
 	return (vd->vdev_min_asize);
 }
 
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize rounded down to a whole number of metaslabs. This
  * allows us to replace or attach devices which don't have the same
  * physical size but can still satisfy the same number of allocations.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	/*
 	 * A vdev without a parent (e.g. an inactive spare or cache
 	 * device) just reports its own asize.
 	 */
 	if (parent == NULL)
 		return (vd->vdev_asize);
 
 	/*
 	 * Anything below the top level defers to its parent's
 	 * min_asize operation.
 	 */
 	if (vd != vd->vdev_top)
 		return (parent->vdev_ops->vdev_op_min_asize(parent));
 
 	/*
 	 * The top-level vdev reports its asize aligned down to the
 	 * metaslab size.
 	 */
 	return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
 	    uint64_t));
 }
 
 /*
  * Recompute and cache vdev_min_asize for 'vd' and then for every vdev
  * beneath it, parents before children.
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		vdev_set_min_asize(child);
 	}
 }
 
 /*
  * Get the minimal allocation size for the top-level vdev: whatever the
  * vdev type's min_alloc operation reports, or 1 << ashift when the type
  * does not provide one.
  */
 uint64_t
 vdev_get_min_alloc(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_min_alloc != NULL)
 		return (vd->vdev_ops->vdev_op_min_alloc(vd));
 
 	return (1ULL << vd->vdev_ashift);
 }
 
 /*
  * Get the parity level for a top-level vdev.  Types that provide no
  * nparity operation report 0.
  */
 uint64_t
 vdev_get_nparity(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_nparity == NULL)
 		return (0);
 
 	return (vd->vdev_ops->vdev_op_nparity(vd));
 }
 
 /*
  * Look up a numeric vdev property in the vdev's ZAP object, using the
  * first nonzero of vdev_root_zap, vdev_top_zap and vdev_leaf_zap.
  *
  * Returns EINVAL when the vdev has none of them, otherwise the result of
  * zap_lookup().  When the lookup fails with ENOENT, *value is set to the
  * property's default and ENOENT is still returned.
  */
 static int
 vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
 {
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 	uint64_t objid;
 	int err;
 
 	objid = vd->vdev_root_zap;
 	if (objid == 0)
 		objid = vd->vdev_top_zap;
 	if (objid == 0)
 		objid = vd->vdev_leaf_zap;
 	if (objid == 0)
 		return (EINVAL);
 
 	err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
 	    sizeof (uint64_t), 1, value);
 	if (err == ENOENT)
 		*value = vdev_prop_default_numeric(prop);
 
 	return (err);
 }
 /*
  * Get the number of data disks for a top-level vdev.  Types that provide
  * no ndisks operation report 1.
  */
 uint64_t
 vdev_get_ndisks(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_ndisks == NULL)
 		return (1);
 
 	return (vd->vdev_ops->vdev_op_ndisks(vd));
 }
 
 /*
  * Return the top-level vdev with index 'vdev' under the pool's root
  * vdev, or NULL when the index is out of range.  The caller is expected
  * to hold some SCL_* config lock (asserted).
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= rvd->vdev_children)
 		return (NULL);
 
 	ASSERT(rvd->vdev_child[vdev] != NULL);
 	return (rvd->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the tree rooted at 'vd' for the vdev whose guid
  * is 'guid'.  Returns the first match, or NULL if there is none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[c], guid);
 
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Count the leaf vdevs in the tree rooted at 'vd'.  A vdev whose ops are
  * marked vdev_op_leaf counts as one; any other vdev contributes the sum
  * over its children.
  */
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
 	int leaves = 0;
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		return (1);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *child = vd->vdev_child[c];
 
 		leaves += vdev_count_leaves_impl(child);
 	}
 
 	return (leaves);
 }
 
 /*
  * Count the leaf vdevs in the pool.  SCL_VDEV is held as reader for the
  * duration of the walk.
  */
 int
 vdev_count_leaves(spa_t *spa)
 {
 	int leaves;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	leaves = vdev_count_leaves_impl(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (leaves);
 }
 
 /*
  * Attach 'cvd' to 'pvd' as child number cvd->vdev_id.
  *
  * The parent's child array is reallocated to hold at least id + 1
  * slots, cvd->vdev_top is set, cvd's guid sum is added to every
  * ancestor, and leaf vdevs are put on the pool's leaf list.  When 'pvd'
  * is NULL only cvd->vdev_parent is set (to NULL).
  *
  * The caller must hold SCL_ALL as writer and 'cvd' must not already
  * have a parent (both asserted).
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 
 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 	newsize = pvd->vdev_children * sizeof (vdev_t *);
 
 	/*
 	 * Zero the new array: when id is beyond the old child count the
 	 * slots between the old end and id are not otherwise written, and
 	 * the rest of this file treats a NULL slot as "no child here".
 	 */
 	newchild = kmem_zalloc(newsize, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		memcpy(newchild, pvd->vdev_child, oldsize);
 		kmem_free(pvd->vdev_child, oldsize);
 	}
 
 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
 
 	/* A child of a vdev with no top-level vdev is itself the top. */
 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
 		cvd->vdev_spa->spa_leaf_list_gen++;
 	}
 }
 
 /*
  * Detach 'cvd' from its parent 'pvd'.
  *
  * The child's slot is set to NULL rather than compacted away; the child
  * array itself is freed only once every slot is NULL.  Leaf vdevs are
  * taken off the pool's leaf list, and cvd's guid sum is subtracted from
  * every ancestor.  Nothing is done when 'pvd' is NULL.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	int c;
 	uint_t id = cvd->vdev_id;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[id] == cvd);
 
 	pvd->vdev_child[id] = NULL;
 	cvd->vdev_parent = NULL;
 
 	/* Find the first slot still in use, if any. */
 	for (c = 0; c < pvd->vdev_children; c++)
 		if (pvd->vdev_child[c])
 			break;
 
 	/* No children left: release the array altogether. */
 	if (c == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		spa_t *spa = cvd->vdev_spa;
 		list_remove(&spa->spa_leaf_list, cvd);
 		spa->spa_leaf_list_gen++;
 	}
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Remove any holes in the child array.  The surviving children keep
  * their relative order and are renumbered (vdev_id) to match their new
  * positions.  The caller must hold SCL_ALL as writer (asserted).
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	vdev_t **newchild = NULL;
 	int oldc = pvd->vdev_children;
 	int newc = 0;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (oldc == 0)
 		return;
 
 	/* First pass: count the slots that are in use. */
 	for (int c = 0; c < oldc; c++) {
 		if (pvd->vdev_child[c] != NULL)
 			newc++;
 	}
 
 	/* Second pass: pack the survivors into a right-sized array. */
 	if (newc > 0) {
 		int next = 0;
 
 		newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
 
 		for (int c = 0; c < oldc; c++) {
 			vdev_t *cvd = pvd->vdev_child[c];
 
 			if (cvd == NULL)
 				continue;
 
 			newchild[next] = cvd;
 			cvd->vdev_id = next;
 			next++;
 		}
 	}
 
 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 	pvd->vdev_child = newchild;
 	pvd->vdev_children = newc;
 }
 
 /*
  * Allocate and minimally initialize a vdev_t.
  *
  * spa:  pool the vdev belongs to.  If the pool has no root vdev yet, the
  *       new vdev becomes the root (and must use vdev_root_ops).
  * id:   value stored in vdev_id.
  * guid: guid to assign.  Passing 0 requests a freshly generated guid,
  *       except for hole vdevs, which keep 0.
  * ops:  ops vector for the vdev.
  *
  * The returned vdev is zero-filled apart from the fields set here, is in
  * state VDEV_STATE_CLOSED, and has its locks, condition variables, DTL
  * range trees, txg lists, event rate limits and I/O queue initialized.
  * It is not linked to any parent.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 
 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 	vic = &vd->vdev_indirect_config;
 
 	/* The first vdev allocated for a pool is its root. */
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_load_guid();
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		if (spa->spa_root_vdev == vd) {
 			/*
 			 * The root vdev's guid will also be the pool guid,
 			 * which must be unique among all pools.
 			 */
 			guid = spa_generate_guid(NULL);
 		} else {
 			/*
 			 * Any other vdev's guid must be unique within the pool.
 			 */
 			guid = spa_generate_guid(spa);
 		}
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	/* With no children yet, the guid sum is just this vdev's guid. */
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 	vic->vic_prev_indirect_vdev = UINT64_MAX;
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 	vd->vdev_obsolete_segments = zfs_range_tree_create(NULL,
 	    ZFS_RANGE_SEG64, NULL, 0, 0);
 
 	/*
 	 * Initialize rate limit structs for events.  We rate limit ZIO delay
 	 * and checksum events so that we don't overwhelm ZED with thousands
 	 * of events when a disk is acting up.
 	 */
 	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
 	    1);
 	zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
 	    &zfs_dio_write_verify_events_per_second, 1);
 	zfs_ratelimit_init(&vd->vdev_checksum_rl,
 	    &zfs_checksum_events_per_second, 1);
 
 	/*
 	 * Default Thresholds for tuning ZED
 	 */
 	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
 	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
 	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
 	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
 	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
 	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
 
 	list_link_init(&vd->vdev_config_dirty_node);
 	list_link_init(&vd->vdev_state_dirty_node);
 	list_link_init(&vd->vdev_initialize_node);
 	list_link_init(&vd->vdev_leaf_node);
 	list_link_init(&vd->vdev_trim_node);
 
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
 	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One range tree per DTL type. */
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64,
 		    NULL, 0, 0);
 	}
 
 	txg_list_create(&vd->vdev_ms_list, spa,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list, spa,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * The vdev is described by the nvlist 'nv'.  On success the new vdev has
  * been added to 'parent' as a child, is returned through '*vdp', and the
  * function returns 0.  EINVAL is returned when the config lacks a required
  * entry, names an unknown vdev type, or describes a non-root vdev before a
  * root vdev exists.  ENOTSUP is returned when the pool version or enabled
  * features do not allow the requested vdev.  An error from the type-specific
  * vdev_op_init() callback is passed through unchanged.  Every failure return
  * happens before the vdev itself is allocated.
  *
  * The caller must hold all spa config locks (SCL_ALL) as writer.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	const char *type;
 	uint64_t guid = 0, islog;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	const char *tmp = NULL;
 	int rc;
 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 	/* A top-level vdev is one whose parent itself has no parent. */
 	boolean_t top_level = (parent && !parent->vdev_parent);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 * Spare, l2cache and rootpool allocations also require the guid to
 	 * be present; a load additionally requires the stored id to match
 	 * the 'id' argument.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		const char *bias;
 
 		/*
 		 * If creating a top-level vdev, check for allocation
 		 * classes input.
 		 */
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
 
 			/* spa_vdev_add() expects feature to be enabled */
 			if (spa->spa_load_state != SPA_LOAD_CREATE &&
 			    !spa_feature_is_enabled(spa,
 			    SPA_FEATURE_ALLOCATION_CLASSES)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
 
 		/* spa_vdev_add() expects feature to be enabled */
 		if (ops == &vdev_draid_ops &&
 		    spa->spa_load_state != SPA_LOAD_CREATE &&
 		    !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	/*
 	 * Initialize the vdev specific data.  This is done before calling
 	 * vdev_alloc_common() since it may fail and this simplifies the
 	 * error reporting and cleanup code paths.
 	 */
 	void *tsd = NULL;
 	if (ops->vdev_op_init != NULL) {
 		rc = ops->vdev_op_init(spa, nv, &tsd);
 		if (rc != 0) {
 			return (rc);
 		}
 	}
 
 	/*
 	 * There are no failure returns past this point; everything below
 	 * only populates the freshly allocated vdev from the nvlist.
 	 */
 	vd = vdev_alloc_common(spa, id, guid, ops);
 	vd->vdev_tsd = tsd;
 	vd->vdev_islog = islog;
 
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;
 
 	/* The nvlist owns 'tmp', so each string is duplicated. */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
 		vd->vdev_path = spa_strdup(tmp);
 
 	/*
 	 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
 	 * fault on a vdev and want it to persist across imports (like with
 	 * zpool offline -f).
 	 */
 	rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
 	if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_faulted = 1;
 		vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 	}
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
 		vd->vdev_devid = spa_strdup(tmp);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
 		vd->vdev_physpath = spa_strdup(tmp);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
 	    &tmp) == 0)
 		vd->vdev_enc_sysfs_path = spa_strdup(tmp);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
 		vd->vdev_fru = spa_strdup(tmp);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	/*
 	 * Load the indirect vdev configuration.  The asserts state the
 	 * values expected in a freshly allocated vdev; the lookups are
 	 * best-effort and their results are deliberately ignored.
 	 */
 	vic = &vd->vdev_indirect_config;
 
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
 	ASSERT0(vic->vic_births_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 	    &vic->vic_births_object);
 	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 	    &vic->vic_prev_indirect_vdev);
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement. Ignore pool ashift for vdev
 	 * attach case.
 	 */
 	if (alloctype != VDEV_ALLOC_ATTACH) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
 		    &vd->vdev_ashift);
 	} else {
 		vd->vdev_attaching = B_TRUE;
 	}
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/* Only the root vdev carries the root ZAP object number. */
 	if (vd->vdev_ops == &vdev_root_ops &&
 	    (alloctype == VDEV_ALLOC_LOAD ||
 	    alloctype == VDEV_ALLOC_SPLIT ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
 		    &vd->vdev_root_zap);
 	}
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 */
 	if (top_level &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
 		    &vd->vdev_noalloc);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    &vd->vdev_top_zap);
 		vd->vdev_rz_expanding = nvlist_exists(nv,
 		    ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	} else {
 		ASSERT0(vd->vdev_top_zap);
 	}
 
 	/*
 	 * Sanity-check the allocation types that may produce a top-level
 	 * vdev.  No metaslab group is created here.
 	 */
 	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		/* Note: metaslab_group_create() is now deferred */
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv,
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 	} else {
 		ASSERT0(vd->vdev_leaf_zap);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
 		    &vd->vdev_rebuild_txg);
 
 		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
 			vdev_defer_resilver(vd);
 
 		/*
 		 * In general, when importing a pool we want to ignore the
 		 * persistent fault state, as the diagnosis made on another
 		 * system may not be valid in the current context.  The only
 		 * exception is if we forced a vdev to a persistently faulted
 		 * state with 'zpool offline -f'.  The persistent fault will
 		 * remain across imports until cleared.
 		 *
 		 * Local vdevs will remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN ||
 		    spa_load_state(spa) == SPA_LOAD_IMPORT) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				const char *aux;
 
 				/*
 				 * Default to "error threshold exceeded";
 				 * unless the aux state is "external" the
 				 * faulted flag is cleared again.
 				 */
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 				else
 					vd->vdev_faulted = 0ULL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Free a vdev and, recursively, all of its children.
  *
  * The vdev is closed, its type-specific state and metaslab groups are torn
  * down, it is detached from its parent, and every string, list, range tree,
  * lock and condition variable it owns is released before the vdev_t itself
  * is freed.  If 'vd' is the pool's root vdev, spa_root_vdev is cleared.
  *
  * The initialize, trim, autotrim and rebuild thread pointers are asserted
  * to be NULL on entry.
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 	ASSERT3P(vd->vdev_trim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 
 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
 	 * queue exists here, that implies the vdev is being removed while
 	 * the scan is still running.
 	 */
 	if (vd->vdev_scan_io_queue != NULL) {
 		mutex_enter(&vd->vdev_scan_io_queue_lock);
 		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
 		vd->vdev_scan_io_queue = NULL;
 		mutex_exit(&vd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 *
 	 * NOTE(review): each recursive call detaches the child from 'vd'
 	 * through vdev_remove_child() below; the asserts that follow expect
 	 * the child array to be gone and the guid sum to have shrunk back to
 	 * vd's own guid once the last child is freed.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 
 	/* Let the vdev type release its private state, if it has any. */
 	if (vd->vdev_ops->vdev_op_fini != NULL)
 		vd->vdev_ops->vdev_op_fini(vd);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 		vd->vdev_mg = NULL;
 	}
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		metaslab_group_destroy(vd->vdev_log_mg);
 		vd->vdev_log_mg = NULL;
 	}
 
 	/* All space accounting must have been released by now. */
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 	ASSERT(!list_link_active(&vd->vdev_leaf_node));
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 
 	/* Release the strings duplicated with spa_strdup(). */
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 
 	if (vd->vdev_enc_sysfs_path)
 		spa_strfree(vd->vdev_enc_sysfs_path);
 
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/* Close the DTL space map and empty/destroy every DTL range tree. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		zfs_range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* The indirect births and mapping objects exist only as a pair. */
 	EQUIV(vd->vdev_indirect_births != NULL,
 	    vd->vdev_indirect_mapping != NULL);
 	if (vd->vdev_indirect_births != NULL) {
 		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 		vdev_indirect_births_close(vd->vdev_indirect_births);
 	}
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 	}
 	zfs_range_tree_destroy(vd->vdev_obsolete_segments);
 	rw_destroy(&vd->vdev_indirect_rwlock);
 	mutex_destroy(&vd->vdev_obsolete_lock);
 
 	/* Destroy the remaining locks, condition variables and ratelimits. */
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 	mutex_destroy(&vd->vdev_scan_io_queue_lock);
 
 	mutex_destroy(&vd->vdev_initialize_lock);
 	mutex_destroy(&vd->vdev_initialize_io_lock);
 	cv_destroy(&vd->vdev_initialize_io_cv);
 	cv_destroy(&vd->vdev_initialize_cv);
 
 	mutex_destroy(&vd->vdev_trim_lock);
 	mutex_destroy(&vd->vdev_autotrim_lock);
 	mutex_destroy(&vd->vdev_trim_io_lock);
 	cv_destroy(&vd->vdev_trim_cv);
 	cv_destroy(&vd->vdev_autotrim_cv);
 	cv_destroy(&vd->vdev_autotrim_kick_cv);
 	cv_destroy(&vd->vdev_trim_io_cv);
 
 	mutex_destroy(&vd->vdev_rebuild_lock);
 	cv_destroy(&vd->vdev_rebuild_cv);
 
 	zfs_ratelimit_fini(&vd->vdev_delay_rl);
 	zfs_ratelimit_fini(&vd->vdev_deadman_rl);
 	zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
 	zfs_ratelimit_fini(&vd->vdev_checksum_rl);
 
 	/* Don't leave the pool pointing at freed memory. */
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Transfer top-level vdev state from svd to tvd.
  *
  * Every piece of state is moved rather than copied: after each field is
  * handed to 'tvd' the corresponding field in 'svd' is reset (0, NULL,
  * VDEV_BIAS_NONE or -1ULL as appropriate), so that 'svd' no longer refers
  * to any of it.  'tvd' must already be its own top-level vdev.
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 	metaslab_t *msp;
 	vdev_t *vd;
 	int t;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	/* Metaslab array parameters and the top-level ZAP. */
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 	tvd->vdev_top_zap = svd->vdev_top_zap;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
 	svd->vdev_top_zap = 0;
 
 	/*
 	 * Metaslab groups and the metaslab array.  If tvd already has a
 	 * group it must be the very one being transferred.
 	 */
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	if (tvd->vdev_log_mg)
 		ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_log_mg = svd->vdev_log_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_log_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	/* Re-point the groups' back-references at their new owner. */
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 	if (tvd->vdev_log_mg != NULL)
 		tvd->vdev_log_mg->mg_vd = tvd;
 
 	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
 	svd->vdev_checkpoint_sm = NULL;
 
 	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
 	svd->vdev_alloc_bias = VDEV_BIAS_NONE;
 
 	/* Space accounting. */
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 
 	svd->vdev_stat.vs_alloc = 0;
 	svd->vdev_stat.vs_space = 0;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/*
 	 * State which may be set on a top-level vdev that's in the
 	 * process of being removed.
 	 */
 	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
 	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
 	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
 	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
 	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
 	ASSERT0(tvd->vdev_noalloc);
 	ASSERT0(tvd->vdev_removing);
 	ASSERT0(tvd->vdev_rebuilding);
 	tvd->vdev_noalloc = svd->vdev_noalloc;
 	tvd->vdev_removing = svd->vdev_removing;
 	tvd->vdev_rebuilding = svd->vdev_rebuilding;
 	tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
 	tvd->vdev_indirect_config = svd->vdev_indirect_config;
 	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
 	tvd->vdev_indirect_births = svd->vdev_indirect_births;
 	/* The obsolete segment trees are exchanged, not copied. */
 	zfs_range_tree_swap(&svd->vdev_obsolete_segments,
 	    &tvd->vdev_obsolete_segments);
 	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
 	svd->vdev_indirect_config.vic_mapping_object = 0;
 	svd->vdev_indirect_config.vic_births_object = 0;
 	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
 	svd->vdev_indirect_mapping = NULL;
 	svd->vdev_indirect_births = NULL;
 	svd->vdev_obsolete_sm = NULL;
 	svd->vdev_noalloc = 0;
 	svd->vdev_removing = 0;
 	svd->vdev_rebuilding = 0;
 
 	/*
 	 * Move every per-txg list entry from svd to tvd, and replace svd
 	 * with tvd on the pool-wide vdev txg list wherever svd was queued.
 	 */
 	for (t = 0; t < TXG_SIZE; t++) {
 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	/* Carry over config/state dirtiness. */
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 
 	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 /*
  * Set vdev_top to 'tvd' on 'vd' and, recursively, on every vdev below it.
  * A NULL 'vd' is silently ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		int idx = 0;
 
 		vd->vdev_top = tvd;
 
 		while (idx < vd->vdev_children) {
 			vdev_top_update(tvd, vd->vdev_child[idx]);
 			idx++;
 		}
 	}
 }
 
 /*
  * Insert a new interior vdev of type 'ops' (mirror/replacing) between 'cvd'
  * and its current parent, and return it.  Mirror/replacing vdevs carry no
  * private state, so .vdev_op_init() is not invoked.  The caller must hold
  * SCL_ALL as writer.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *spa = cvd->vdev_spa;
 	vdev_t *old_parent = cvd->vdev_parent;
 	vdev_t *interior;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* The new vdev takes over the child's slot id. */
 	interior = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 
 	/* Inherit state and creation txg from the child... */
 	interior->vdev_state = cvd->vdev_state;
 	interior->vdev_crtxg = cvd->vdev_crtxg;
 
 	/* ...its alignment shifts... */
 	interior->vdev_ashift = cvd->vdev_ashift;
 	interior->vdev_logical_ashift = cvd->vdev_logical_ashift;
 	interior->vdev_physical_ashift = cvd->vdev_physical_ashift;
 
 	/* ...and its sizes. */
 	interior->vdev_psize = cvd->vdev_psize;
 	interior->vdev_asize = cvd->vdev_asize;
 	interior->vdev_min_asize = cvd->vdev_min_asize;
 	interior->vdev_max_asize = cvd->vdev_max_asize;
 
 	/*
 	 * Splice the new vdev into the tree in place of the child, then
 	 * hang the child underneath it at the next free child index.
 	 */
 	vdev_remove_child(old_parent, cvd);
 	vdev_add_child(old_parent, interior);
 	cvd->vdev_id = interior->vdev_children;
 	vdev_add_child(interior, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	/* A new top-level vdev takes over the child's top-level state. */
 	if (interior->vdev_top == interior)
 		vdev_top_transfer(cvd, interior);
 
 	return (interior);
 }
 
 /*
  * Collapse a mirror/replacing/spare vdev that has exactly one child: the
  * child 'cvd' takes the interior vdev's place under the grandparent and the
  * interior vdev is freed.  The caller must hold SCL_ALL as writer.
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *interior = cvd->vdev_parent;
 	vdev_t *grandparent = interior->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(interior->vdev_children == 1);
 	ASSERT(interior->vdev_ops == &vdev_mirror_ops ||
 	    interior->vdev_ops == &vdev_replacing_ops ||
 	    interior->vdev_ops == &vdev_spare_ops);
 
 	/* The child adopts the interior vdev's alignment shifts. */
 	cvd->vdev_ashift = interior->vdev_ashift;
 	cvd->vdev_logical_ashift = interior->vdev_logical_ashift;
 	cvd->vdev_physical_ashift = interior->vdev_physical_ashift;
 
 	/* Unhook both vdevs from the tree. */
 	vdev_remove_child(interior, cvd);
 	vdev_remove_child(grandparent, interior);
 
 	if (interior->vdev_top == interior) {
 		/*
 		 * cvd is about to become the top-level vdev, so it keeps the
 		 * interior vdev's guid.  Were the guid to change, detaching
 		 * an offline device could make a later import see two
 		 * distinct top-level vdevs rather than two versions of the
 		 * same one.
 		 */
 		uint64_t delta = interior->vdev_guid - cvd->vdev_guid;
 
 		cvd->vdev_orig_guid = cvd->vdev_guid;
 		cvd->vdev_guid += delta;
 		cvd->vdev_guid_sum += delta;
 
 		/*
 		 * Without autoexpand, also keep the interior vdev's asize.
 		 * Otherwise attaching and detaching children of differing
 		 * sizes could silently grow the mirror, after which larger
 		 * devices would be needed to re-establish it.
 		 */
 		if (!cvd->vdev_spa->spa_autoexpand)
 			cvd->vdev_asize = interior->vdev_asize;
 	}
 
 	/* Re-insert the child in the slot the interior vdev occupied. */
 	cvd->vdev_id = interior->vdev_id;
 	vdev_add_child(grandparent, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	if (cvd == cvd->vdev_top)
 		vdev_top_transfer(interior, cvd);
 
 	ASSERT(interior->vdev_children == 0);
 	vdev_free(interior);
 }
 
 /*
  * Greatest common divisor of 'a' and 'b' (Euclid's algorithm), used when
  * maintaining spa_gcd_alloc.  If either argument is 0 the other one is
  * returned.
  */
 static uint64_t
 vdev_gcd(uint64_t a, uint64_t b)
 {
 	for (;;) {
 		uint64_t rem;
 
 		if (b == 0)
 			return (a);
 
 		rem = a % b;
 		a = b;
 		b = rem;
 	}
 }
 
 /*
  * Fold 'min_alloc' into the pool-wide allocation limits: spa_min_alloc is
  * lowered if 'min_alloc' is smaller, and spa_gcd_alloc becomes the GCD of
  * its previous value and 'min_alloc'.  A spa_gcd_alloc of INT_MAX is taken
  * to mean "not set yet" and is simply replaced.
  */
 static void
 vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
 {
 	if (spa->spa_min_alloc > min_alloc)
 		spa->spa_min_alloc = min_alloc;
 
 	if (spa->spa_gcd_alloc != INT_MAX) {
 		spa->spa_gcd_alloc = vdev_gcd(min_alloc,
 		    spa->spa_gcd_alloc);
 		return;
 	}
 
 	spa->spa_gcd_alloc = min_alloc;
 }
 
 /*
  * Create the metaslab group(s) for 'vd' if it has none yet.  Creation is
  * deferred to this point because the allocation bias, which selects the
  * metaslab class, must be known first.
  */
 void
 vdev_metaslab_group_create(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	metaslab_class_t *mclass;
 
 	/* Already created; nothing to do. */
 	if (vd->vdev_mg != NULL)
 		return;
 
 	/* A log vdev without an explicit bias gets the log bias. */
 	if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = VDEV_BIAS_LOG;
 
 	ASSERT3U(vd->vdev_islog, ==,
 	    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
 
 	/* Map the bias onto its metaslab class; default is "normal". */
 	if (vd->vdev_alloc_bias == VDEV_BIAS_LOG)
 		mclass = spa_log_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL)
 		mclass = spa_special_class(spa);
 	else if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		mclass = spa_dedup_class(spa);
 	else
 		mclass = spa_normal_class(spa);
 
 	vd->vdev_mg = metaslab_group_create(mclass, vd);
 
 	/* Non-log vdevs also get a group in the embedded log class. */
 	if (!vd->vdev_islog) {
 		vd->vdev_log_mg = metaslab_group_create(
 		    spa_embedded_log_class(spa), vd);
 	}
 
 	/*
 	 * The spa ashift min/max only apply to the normal metaslab class.
 	 * Since the class is bound late, the ashift bounds could not be
 	 * updated any earlier than this.
 	 */
 	if (vd->vdev_top != vd || vd->vdev_ashift == 0 ||
 	    mclass != spa_normal_class(spa) || vd->vdev_aux != NULL)
 		return;
 
 	if (vd->vdev_ashift > spa->spa_max_ashift)
 		spa->spa_max_ashift = vd->vdev_ashift;
 	if (vd->vdev_ashift < spa->spa_min_ashift)
 		spa->spa_min_ashift = vd->vdev_ashift;
 
 	vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd));
 }
 
 /*
  * Add ('add' true) or subtract ('add' false) the space of vd's metaslab
  * group to/from spa_nonallocating_dspace.  The space is deflated first when
  * spa_deflate() says so.  Vdevs outside the normal class are ignored.
  */
 void
 vdev_update_nonallocating_space(vdev_t *vd, boolean_t add)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd->vdev_mg->mg_class != spa_normal_class(spa))
 		return;
 
 	uint64_t space = metaslab_group_get_space(vd->vdev_mg);
 	uint64_t delta;
 
 	if (spa_deflate(spa))
 		delta = vdev_deflated_space(vd, space);
 	else
 		delta = space;
 
 	if (!add) {
 		ASSERT3U(spa->spa_nonallocating_dspace, >=, delta);
 		spa->spa_nonallocating_dspace -= delta;
 	} else {
 		spa->spa_nonallocating_dspace += delta;
 	}
 }
 
 /*
  * Create (or, when the vdev already has metaslabs, extend) the metaslab
  * array of a vdev so that it covers vdev_asize >> vdev_ms_shift metaslabs.
  *
  * With txg == 0 the space map object of each new metaslab is read from the
  * on-disk metaslab array (if there is one) and SCL_ALLOC is taken as writer
  * around group activation.  With a non-zero txg the caller must already
  * hold SCL_ALLOC as writer.
  *
  * Returns 0 on success, or when the vdev has no metaslab shift yet.
  * Otherwise returns the error from dmu_read() or metaslab_init().
  *
  * NOTE(review): on an error return the partially populated array stays
  * installed in vd->vdev_ms; presumably the caller is expected to tear it
  * down with vdev_metaslab_fini(), which tolerates NULL entries - confirm.
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 	/* Existing metaslabs mean the vdev is growing, not being set up. */
 	boolean_t expanding = (oldc != 0);
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	ASSERT(oldc <= newc);
 
 	/* Allocate the new array, carrying over any existing entries. */
 	mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	if (expanding) {
 		memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
 		vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 
 	/* Initialize only the metaslabs that did not exist before. */
 	for (uint64_t m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 		/*
 		 * vdev_ms_array may be 0 if we are creating the "fake"
 		 * metaslabs for an indirect vdev for zdb's leak detection.
 		 * See zdb_leak_init().
 		 */
 		if (txg == 0 && vd->vdev_ms_array != 0) {
 			error = dmu_read(spa->spa_meta_objset,
 			    vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "unable to read the metaslab "
 				    "array [error=%d]", error);
 				return (error);
 			}
 		}
 
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &(vd->vdev_ms[m]));
 		if (error != 0) {
 			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
 			    error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Find the emptiest metaslab on the vdev and mark it for use for
 	 * embedded slog by moving it from the regular to the log metaslab
 	 * group.
 	 *
 	 * This is done only for normal-class vdevs with more than
 	 * zfs_embedded_slog_min_ms metaslabs whose log group is still empty.
 	 */
 	if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
 	    vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
 	    avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
 		uint64_t slog_msid = 0;
 		uint64_t smallest = UINT64_MAX;
 
 		/*
 		 * Note, we only search the new metaslabs, because the old
 		 * (pre-existing) ones may be active (e.g. have non-empty
 		 * range_tree's), and we don't move them to the new
 		 * metaslab_t.
 		 */
 		for (uint64_t m = oldc; m < newc; m++) {
 			uint64_t alloc =
 			    space_map_allocated(vd->vdev_ms[m]->ms_sm);
 			if (alloc < smallest) {
 				slog_msid = m;
 				smallest = alloc;
 			}
 		}
 		metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
 		/*
 		 * The metaslab was marked as dirty at the end of
 		 * metaslab_init(). Remove it from the dirty list so that we
 		 * can uninitialize and reinitialize it to the new class.
 		 */
 		if (txg != 0) {
 			(void) txg_list_remove_this(&vd->vdev_ms_list,
 			    slog_ms, txg);
 		}
 		/* Remember the space map object before tearing it down. */
 		uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
 		metaslab_fini(slog_ms);
 		VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
 		    &vd->vdev_ms[slog_msid]));
 	}
 
 	/* With a non-zero txg the caller already holds SCL_ALLOC. */
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is marked as non-allocating then don't
 	 * activate the metaslabs since we want to ensure that
 	 * no allocations are performed on this device.
 	 */
 	if (vd->vdev_noalloc) {
 		/* track non-allocating vdev space */
 		vdev_update_nonallocating_space(vd, B_TRUE);
 	} else if (!expanding) {
 		metaslab_group_activate(vd->vdev_mg);
 		if (vd->vdev_log_mg != NULL)
 			metaslab_group_activate(vd->vdev_log_mg);
 	}
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	return (0);
 }
 
 /*
  * Tear down the metaslab state of a vdev: close the checkpoint space map,
  * passivate the metaslab group(s), destroy every metaslab and free the
  * metaslab array.  Safe to call more than once on the same vdev.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	if (vd->vdev_checkpoint_sm != NULL) {
 		ASSERT(spa_feature_is_active(vd->vdev_spa,
 		    SPA_FEATURE_POOL_CHECKPOINT));
 		space_map_close(vd->vdev_checkpoint_sm);
 		/*
 		 * Closing the space map is not enough: the pointer has to be
 		 * cleared as well.  Some operations (destroying a pool, for
 		 * instance) call vdev_metaslab_fini() several times, and this
 		 * clause must not run twice.  The vdev_ms handling below
 		 * follows the same pattern.
 		 */
 		vd->vdev_checkpoint_sm = NULL;
 	}
 
 	if (vd->vdev_ms == NULL) {
 		ASSERT0(vd->vdev_ms_count);
 		return;
 	}
 
 	metaslab_group_t *mg = vd->vdev_mg;
 
 	/* Stop allocations from the group(s) before destroying metaslabs. */
 	metaslab_group_passivate(mg);
 	if (vd->vdev_log_mg != NULL) {
 		ASSERT(!vd->vdev_islog);
 		metaslab_group_passivate(vd->vdev_log_mg);
 	}
 
 	uint64_t nms = vd->vdev_ms_count;
 	uint64_t idx = 0;
 
 	while (idx < nms) {
 		metaslab_t *ms = vd->vdev_ms[idx];
 
 		/* Slots may be NULL; skip those. */
 		if (ms != NULL)
 			metaslab_fini(ms);
 		idx++;
 	}
 
 	vmem_free(vd->vdev_ms, nms * sizeof (metaslab_t *));
 	vd->vdev_ms = NULL;
 	vd->vdev_ms_count = 0;
 
 	/* With all metaslabs gone the group histograms must be empty. */
 	for (int bucket = 0; bucket < ZFS_RANGE_TREE_HISTOGRAM_SIZE;
 	    bucket++) {
 		ASSERT0(mg->mg_histogram[bucket]);
 		if (vd->vdev_log_mg != NULL)
 			ASSERT0(vd->vdev_log_mg->mg_histogram[bucket]);
 	}
 
 	ASSERT0(vd->vdev_ms_count);
 }
 
 /*
  * Per-probe bookkeeping shared by all zios of one vdev probe.  It is
  * allocated in vdev_probe() and freed in vdev_probe_done() when the
  * probe's null zio completes.
  */
 typedef struct vdev_probe_stats {
 	/* set once a probe read completes without error */
 	boolean_t	vps_readable;
 	/* set once a probe write completes without error */
 	boolean_t	vps_writeable;
 	/* probe was requested on behalf of a zio (not a bare vdev_probe()) */
 	boolean_t	vps_zio_done_probe;
 	/* zio flags used for the probe's read and write zios */
 	int		vps_flags;
 } vdev_probe_stats_t;
 
 /*
  * Completion callback shared by every zio that takes part in a probe: the
  * pad-region reads, the follow-up writes, and the null zio that parents
  * them.  The zio type selects the action:
  *
  *  READ  - record readability; if the read succeeded and the pool is
  *          writeable, write the same buffer back to the same offset,
  *          otherwise free the buffer.
  *  WRITE - record writeability and free the buffer.
  *  NULL  - fold the results into the vdev's cant_read/cant_write flags,
  *          set the probe's final error, propagate ENXIO to the zios that
  *          asked for the probe, and free the probe state.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			/* The buffer is handed on to the write zio. */
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_abd,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
 			abd_free(zio->io_abd);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		abd_free(zio->io_abd);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 		zio_link_t *zl;
 
 		/* These flags are only ever set here, never cleared. */
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 		vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
 		    vd->vdev_cant_read, vd->vdev_cant_write);
 
 		/*
 		 * The probe passes if the vdev is readable and either
 		 * writeable or part of a pool that is not writeable.
 		 */
 		if (vdev_readable(vd) &&
 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			vdev_dbgmsg(vd, "failed probe");
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, NULL, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 
 			/*
 			 * If this probe was initiated from zio pipeline, then
 			 * change the state in a spa_async_request. Probes that
 			 * were initiated from a vdev_open can change the state
 			 * as part of the open call.
 			 */
 			if (vps->vps_zio_done_probe) {
 				vd->vdev_fault_wanted = B_TRUE;
 				spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
 			}
 		}
 
 		/* The probe is over; allow vdev_probe() to start a new one. */
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		/* Fail every waiting parent that can't use the vdev now. */
 		zl = NULL;
 		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  * 'vd' must be a leaf vdev.  'zio', if not NULL, is the zio on whose
  * behalf the probe is made; it becomes a parent of the probe zio.
  *
  * Returns the probe zio, not yet issued, when 'zio' is NULL, so the caller
  * is responsible for issuing it.  Returns NULL in all other cases: when
  * 'zio' is itself a probe I/O, when a probe is already in flight and 'zio'
  * was merely attached to it, or when a new probe was started and issued
  * here on behalf of 'zio'.
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	/* Stays NULL unless this call creates the probe. */
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
 		vps->vps_zio_done_probe = (zio != NULL);
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		/* The null zio parents the reads/writes issued below. */
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 	}
 
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/*
 	 * Somebody else's probe is already in flight.  We only get here
 	 * on behalf of a zio, which has been attached to that probe above.
 	 */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/* Read the pad region of every label except label 0. */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
 		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	/* Without a requesting zio, hand the unissued probe to the caller. */
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * Callback taking an opaque vdev pointer: load the vdev and store the
  * result of vdev_load() in its vdev_load_error field.
  */
 static void
 vdev_load_child(void *arg)
 {
 	vdev_t *child = arg;
 	int err = vdev_load(child);
 
 	child->vdev_load_error = err;
 }
 
 /*
  * Taskq callback used when opening children in parallel: open the vdev
  * passed in 'arg' and record the result in vdev_open_error.  While the open
  * is in progress vdev_open_thread identifies the thread performing it.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;
 	int err;
 
 	child->vdev_open_thread = curthread;
 	err = vdev_open(child);
 	child->vdev_open_error = err;
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Returns B_TRUE if 'vd' or any vdev beneath it is backed by a zvol.  The
  * check on the vdev's own path exists only in kernel builds; elsewhere
  * only the children are examined.
  */
 static boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 	boolean_t found = B_FALSE;
 
 #ifdef _KERNEL
 	if (zvol_is_zvol(vd->vdev_path))
 		return (B_TRUE);
 #endif
 
 	/* Stop at the first subtree that reports a zvol. */
 	for (int idx = 0; !found && idx < vd->vdev_children; idx++) {
 		if (vdev_uses_zvols(vd->vdev_child[idx]))
 			found = B_TRUE;
 	}
 
 	return (found);
 }
 
 /*
  * Default child filter for vdev_open_children_impl(): every child is to be
  * opened, so the argument is ignored and B_TRUE is always returned.
  */
 static boolean_t
 vdev_default_open_children_func(vdev_t *vd)
 {
 	const boolean_t should_open = B_TRUE;
 
 	(void) vd;
 
 	return (should_open);
 }
 
 /*
  * Open each child of 'vd' for which 'open_func' returns B_TRUE.  Opens are
  * normally dispatched to a taskq so that they run in parallel.  When any
  * leaf vdev sits on a ZFS volume (or no taskq could be created) they are
  * performed one after another in the calling thread instead, which avoids
  * a deadlock when the current thread is holding the spa_namespace_lock.
  *
  * On return vd->vdev_nonrot is B_TRUE only if every child is non-rotating.
  */
 static void
 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	int nchildren = vd->vdev_children;
 	taskq_t *open_tq;
 	int i;
 
 	open_tq = taskq_create("vdev_open", nchildren, minclsyspri,
 	    nchildren, nchildren, TASKQ_PREPOPULATE);
 	vd->vdev_nonrot = B_TRUE;
 
 	for (i = 0; i < nchildren; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (open_func(child) != B_FALSE) {
 			if (open_tq != NULL && !vdev_uses_zvols(vd)) {
 				VERIFY(taskq_dispatch(open_tq, vdev_open_child,
 				    child, TQ_SLEEP) != TASKQID_INVALID);
 			} else {
 				child->vdev_open_error = vdev_open(child);
 			}
 		}
 	}
 
 	/* Let all dispatched opens finish before reading their results. */
 	if (open_tq != NULL)
 		taskq_wait(open_tq);
 
 	for (i = 0; i < nchildren; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		vd->vdev_nonrot &= child->vdev_nonrot;
 	}
 
 	if (open_tq != NULL)
 		taskq_destroy(open_tq);
 }
 
 /*
  * Open every child of 'vd', using the default filter that accepts all
  * children.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	vdev_open_children_impl(vd, vdev_default_open_children_func);
 }
 
 /*
  * Open only those children of 'vd' for which the caller-supplied
  * 'open_func' returns B_TRUE.
  */
 void
 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
 {
 	vdev_open_children_impl(vd, open_func);
 }
 
 /*
  * Compute the raidz-deflation ratio for a top-level vdev.
  *
  * 128k (1 << 17) is hard-coded because it is the "typical" blocksize.  Even
  * though SPA_MAXBLOCKSIZE changed, this algorithm can not change, otherwise
  * it would inconsistently account for existing bp's.  Txg 0 is hard-coded
  * for the same reason, since expanded RAIDZ vdevs can use a different asize
  * for different birth txg's.
  *
  * Nothing is done for non-top-level vdevs, holes, or vdevs whose ashift has
  * not been set yet.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
 	if (vd != vd->vdev_top || vd->vdev_ishole || vd->vdev_ashift == 0)
 		return;
 
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT);
 }
 
 /*
  * Choose the best of two ashifts.  A candidate is "in range" when it is
  * strictly above the logical ashift (absolute minimum) and no larger than
  * the administrator defined maximum (zfs_vdev_max_auto_ashift).  If exactly
  * one candidate is in range it wins; otherwise the bigger of the two is
  * returned.
  */
 uint64_t
 vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
 {
 	const int a_in_range =
 	    (a > logical && a <= zfs_vdev_max_auto_ashift);
 	const int b_in_range =
 	    (b > logical && b <= zfs_vdev_max_auto_ashift);
 
 	if (a_in_range && !b_in_range)
 		return (a);
 	if (b_in_range && !a_in_range)
 		return (b);
 
 	/* Both in range, or neither: take the biggest. */
 	return (MAX(a, b));
 }
 
 /*
  * Maximize performance by inflating the configured ashift for top level
  * vdevs to be as close to the physical ashift as possible while maintaining
  * administrator defined limits and ensuring it doesn't go below the
  * logical ashift.  Must only be called on a top-level vdev.
  */
 static void
 vdev_ashift_optimize(vdev_t *vd)
 {
 	ASSERT(vd == vd->vdev_top);
 
 	if (vd->vdev_ashift >= vd->vdev_physical_ashift ||
 	    vd->vdev_physical_ashift > zfs_vdev_max_auto_ashift) {
 		/*
 		 * If the logical and physical ashifts are the same, then
 		 * we ensure that the top-level vdev's ashift is not smaller
 		 * than our minimum ashift value. For the unusual case
 		 * where logical ashift > physical ashift, we can't cap
 		 * the calculated ashift based on max ashift as that
 		 * would cause failures.
 		 * We still check if we need to increase it to match
 		 * the min ashift.
 		 */
 		vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
 		    vd->vdev_ashift);
 		return;
 	}
 
 	/*
 	 * The physical ashift is larger than the current ashift and within
 	 * the administrator's maximum: take the smaller of the two clamped
 	 * candidates below.
 	 */
 	uint64_t capped = MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift);
 	uint64_t wanted = MAX(zfs_vdev_min_auto_ashift,
 	    vd->vdev_physical_ashift);
 
 	vd->vdev_ashift = MIN(capped, wanted);
 }
 
 /*
  * Prepare a virtual device for access.
  *
  * Invokes the type-specific vdev_op_open() and then derives and validates
  * the vdev's physical size, allocatable size and ashift, updates the vdev
  * state accordingly and, for leaf vdevs, issues a probe before declaring
  * the vdev usable.
  *
  * Returns 0 on success.  On failure the vdev state has been updated via
  * vdev_set_state() and an errno is returned: either the error reported by
  * vdev_op_open(), device injection or the probe, or one of ENXIO,
  * EOVERFLOW, EINVAL or EDOM from the checks below.
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t logical_ashift = 0;
 	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	/* Reset the per-open status before attempting the open. */
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_fault_wanted = B_FALSE;
 	vd->vdev_remove_wanted = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 	    &logical_ashift, &physical_ashift);
 
 	/* Keep the device in removed state if unplugged */
 	if (error == ENOENT && vd->vdev_removed) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
 		    VDEV_AUX_NONE);
 		return (error);
 	}
 
 	/*
 	 * Physical volume size should never be larger than its max size, unless
 	 * the disk has shrunk while we were reading it or the device is buggy
 	 * or damaged: either way it's not safe for use, bail out of the open.
 	 */
 	if (osize > max_osize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_OPEN_FAILED);
 		return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
 			    vd->vdev_stat.vs_aux);
 		} else {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    vd->vdev_stat.vs_aux);
 		}
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	/* Any child that is not healthy degrades this vdev. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	/* Round the reported sizes down to a multiple of the label size. */
 	osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
 	max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
 
 	if (vd->vdev_children == 0) {
 		/* Leaf: the labels are carved out of the device itself. */
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		/* Interior vdev: no physical size, osize is allocatable. */
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	/*
 	 * If the vdev was expanded, record this so that we can re-create the
 	 * uberblock rings in labels {2,3}, during the next sync.
 	 */
 	if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
 		vd->vdev_copy_uberblocks = B_TRUE;
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk too much.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * We can always set the logical/physical ashift members since
 	 * their values are only used to calculate the vdev_ashift when
 	 * the device is first added to the config. These values should
 	 * not be used for anything else since they may change whenever
 	 * the device is reopened and we don't store them in the label.
 	 */
 	vd->vdev_physical_ashift =
 	    MAX(physical_ashift, vd->vdev_physical_ashift);
 	vd->vdev_logical_ashift = MAX(logical_ashift,
 	    vd->vdev_logical_ashift);
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 * For compatibility, a different ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 
 		/*
 		 * If the vdev_ashift was not overridden at creation time
 		 * (0) or the override value is impossible for the device,
 		 * then set it to the logical ashift and optimize the ashift.
 		 */
 		if (vd->vdev_ashift < vd->vdev_logical_ashift) {
 			vd->vdev_ashift = vd->vdev_logical_ashift;
 
 			if (vd->vdev_logical_ashift > ASHIFT_MAX) {
 				vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 				    VDEV_AUX_ASHIFT_TOO_BIG);
 				return (SET_ERROR(EDOM));
 			}
 
 			if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
 				vdev_ashift_optimize(vd);
 			vd->vdev_attaching = B_FALSE;
 		}
 		/* A non-zero ashift must lie within [ASHIFT_MIN, ASHIFT_MAX]. */
 		if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
 		    vd->vdev_ashift > ASHIFT_MAX)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_ASHIFT);
 			return (SET_ERROR(EDOM));
 		}
 	} else {
 		/*
 		 * Make sure the alignment required hasn't increased.
 		 */
 		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			(void) zfs_ereport_post(
 			    FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
 			    spa, vd, NULL, NULL, 0);
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (SET_ERROR(EDOM));
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy we update asize if either:
 	 * The asize has increased, due to a device expansion caused by dynamic
 	 * LUN growth or vdev replacement, and automatic expansion is enabled;
 	 * making the additional space available.
 	 *
 	 * The asize has decreased, due to a device shrink usually caused by a
 	 * vdev replace with a smaller device. This ensures that calculations
 	 * based on max_asize and asize e.g. esize are always valid. It's safe
 	 * to do this as we've already validated that asize is greater than
 	 * vdev_min_asize.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    ((asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
 	    (asize < vd->vdev_asize)))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * Track the minimum allocation size.
 	 */
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
 		uint64_t min_alloc = vdev_get_min_alloc(vd);
 		vdev_spa_set_alloc(spa, min_alloc);
 	}
 
 	/*
 	 * If this is a leaf vdev, assess whether a resilver is needed.
 	 * But don't do this if we are doing a reopen for a scrub, since
 	 * this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
 		dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
 
 	return (0);
 }
 
 /*
  * Callback wrapper around vdev_validate(): 'arg' is the vdev to validate.
  * The calling thread is published in vdev_validate_thread while the
  * validation runs, and the result is recorded in vdev_validate_error.
  */
 static void
 vdev_validate_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_validate_thread = curthread;
 	child->vdev_validate_error = vdev_validate(child);
 	child->vdev_validate_thread = NULL;
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents. This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  *
  * Children are validated first (recursively); only readable leaf vdevs have
  * their on-disk label inspected.  The only error returned is EBADF, either
  * because a child's validation failed or because the label's pool state is
  * not POOL_STATE_ACTIVE during a non-verbatim SPA_LOAD_OPEN.
  */
 int
 vdev_validate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	taskq_t *tq = NULL;
 	nvlist_t *label;
 	uint64_t guid = 0, aux_guid = 0, top_guid;
 	uint64_t state;
 	nvlist_t *nvl;
 	uint64_t txg;
 	int children = vd->vdev_children;
 
 	/* Validation can be disabled wholesale via vdev_validate_skip. */
 	if (vdev_validate_skip)
 		return (0);
 
 	if (children > 0) {
 		tq = taskq_create("vdev_validate", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	/*
 	 * Validate each child, inline when there is no taskq or the child
 	 * uses zvols, otherwise by dispatching to the taskq.
 	 */
 	for (uint64_t c = 0; c < children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			vdev_validate_child(cvd);
 		} else {
 			VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
 			    TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 	/* Any child failure is reported to our caller as EBADF. */
 	for (int c = 0; c < children; c++) {
 		int error = vd->vdev_child[c]->vdev_validate_error;
 
 		if (error != 0)
 			return (SET_ERROR(EBADF));
 	}
 
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
 		return (0);
 
 	/*
 	 * If we are performing an extreme rewind, we allow for a label that
 	 * was modified at a point after the current txg.
 	 * If config lock is not held do not check for the txg. spa_sync could
 	 * be updating the vdev's label before updating spa_last_synced_txg.
 	 */
 	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
 	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
 		txg = UINT64_MAX;
 	else
 		txg = spa_last_synced_txg(spa);
 
 	/*
 	 * From here on 'label' is owned by this function and must be released
 	 * with nvlist_free() on every return path.
 	 */
 	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
 		    "txg %llu", (u_longlong_t)txg);
 		return (0);
 	}
 
 	/*
 	 * Determine if this vdev has been split off into another
 	 * pool.  If so, then refuse to open it.
 	 */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_SPLIT_POOL);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (0);
 	}
 
 	/*
 	 * If config is not trusted then ignore the spa guid check. This is
 	 * necessary because if the machine crashed during a re-guid the new
 	 * guid might have been written to all of the vdev labels, but not the
 	 * cached config. The check will be performed again once we have the
 	 * trusted config from the MOS.
 	 */
 	if (spa->spa_trust_config && guid != spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
 		    "match config (%llu != %llu)", (u_longlong_t)guid,
 		    (u_longlong_t)spa_guid(spa));
 		return (0);
 	}
 
 	/* The original guid is optional; treat a missing one as 0. */
 	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 	    &aux_guid) != 0)
 		aux_guid = 0;
 
 	/* Note: 'guid' is reused here and now holds the vdev guid. */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_GUID);
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
 	    != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_TOP_GUID);
 		return (0);
 	}
 
 	/*
 	 * If this vdev just became a top-level vdev because its sibling was
 	 * detached, it will have adopted the parent's vdev guid -- but the
 	 * label may or may not be on disk yet. Fortunately, either version
 	 * of the label will have the same top guid, so if we're a top-level
 	 * vdev, we can safely compare to that instead.
 	 * However, if the config comes from a cachefile that failed to update
 	 * after the detach, a top-level vdev will appear as a non top-level
 	 * vdev in the config. Also relax the constraints if we perform an
 	 * extreme rewind.
 	 *
 	 * If we split this vdev off instead, then we also check the
 	 * original pool's guid. We don't want to consider the vdev
 	 * corrupt if it is partway through a split operation.
 	 */
 	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
 		boolean_t mismatch = B_FALSE;
 		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
 			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
 				mismatch = B_TRUE;
 		} else {
 			if (vd->vdev_guid != top_guid &&
 			    vd->vdev_top->vdev_guid != guid)
 				mismatch = B_TRUE;
 		}
 
 		if (mismatch) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			vdev_dbgmsg(vd, "vdev_validate: config guid "
 			    "doesn't match label guid");
 			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
 			    (u_longlong_t)vd->vdev_guid,
 			    (u_longlong_t)vd->vdev_top->vdev_guid);
 			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
 			    "aux_guid %llu", (u_longlong_t)guid,
 			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
 			return (0);
 		}
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_STATE);
 		return (0);
 	}
 
 	/* Everything needed has been copied out of the label. */
 	nvlist_free(label);
 
 	/*
 	 * If this is a verbatim import, no need to check the
 	 * state of the pool.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 	    spa_load_state(spa) == SPA_LOAD_OPEN &&
 	    state != POOL_STATE_ACTIVE) {
 		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
 		    "for spa %s", (u_longlong_t)state, spa->spa_name);
 		return (SET_ERROR(EBADF));
 	}
 
 	/*
 	 * If we were able to open and validate a vdev that was
 	 * previously marked permanently unavailable, clear that state
 	 * now.
 	 */
 	if (vd->vdev_not_present)
 		vd->vdev_not_present = 0;
 
 	return (0);
 }
 
 /*
  * Bring the destination string '*dvd' in line with the source string 'svd'.
  *
  * A NULL source leaves the destination untouched.  If the destination is
  * unset it receives a copy of the source; if it is set but differs, the old
  * string is freed and replaced by a copy.  'prefix' names the field and
  * 'guid' identifies the vdev in the debug messages.
  */
 static void
 vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
 {
 	if (svd == NULL)
 		return;
 
 	if (*dvd == NULL) {
 		*dvd = spa_strdup(svd);
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
 		    (u_longlong_t)guid, *dvd);
 		return;
 	}
 
 	if (strcmp(svd, *dvd) == 0)
 		return;
 
 	/* Log before freeing: the message still references the old string. */
 	zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
 	    "from '%s' to '%s'", (u_longlong_t)guid, prefix,
 	    *dvd, svd);
 	spa_strfree(*dvd);
 	*dvd = spa_strdup(svd);
 }
 
 /*
  * Copy the path-like attributes (path, devid, physpath and enclosure sysfs
  * path) of the source vdev 'svd' onto the destination vdev 'dvd'.
  */
 static void
 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
 {
 	char *cur, *src;
 
 	vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
 	    dvd->vdev_guid);
 
 	vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
 	    dvd->vdev_guid);
 
 	vdev_update_path("vdev_physpath", svd->vdev_physpath,
 	    &dvd->vdev_physpath, dvd->vdev_guid);
 
 	/*
 	 * Our enclosure sysfs path may have changed between imports.
 	 * Unlike the fields above, a NULL source clears the destination.
 	 */
 	cur = dvd->vdev_enc_sysfs_path;
 	src = svd->vdev_enc_sysfs_path;
 
 	/* Both unset: nothing to do. */
 	if (cur == NULL && src == NULL)
 		return;
 
 	/* Both set and identical: nothing to do. */
 	if (cur != NULL && src != NULL && strcmp(src, cur) == 0)
 		return;
 
 	zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
 	    "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
 	    cur, src);
 
 	if (cur != NULL)
 		spa_strfree(cur);
 
 	dvd->vdev_enc_sysfs_path = (src != NULL) ? spa_strdup(src) : NULL;
 }
 
 /*
  * Walk two vdev trees in lock-step and copy the leaf paths of the source
  * ('svd') onto the destination ('dvd').  The trees must agree on vdev type,
  * guid and child count at every level; EINVAL is returned at the first
  * mismatch.  Missing source vdevs, matching holes and indirect destination
  * vdevs are silently accepted.  Intended to copy paths from userland config
  * into MOS config.
  */
 int
 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
 {
 	/* Nothing to compare or copy for these vdev types. */
 	if (svd->vdev_ops == &vdev_missing_ops)
 		return (0);
 	if (svd->vdev_ishole && dvd->vdev_ishole)
 		return (0);
 	if (dvd->vdev_ops == &vdev_indirect_ops)
 		return (0);
 
 	if (svd->vdev_ops != dvd->vdev_ops) {
 		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
 		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_guid != dvd->vdev_guid) {
 		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
 		    "%llu)", (u_longlong_t)svd->vdev_guid,
 		    (u_longlong_t)dvd->vdev_guid);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_children != dvd->vdev_children) {
 		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
 		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
 		    (u_longlong_t)dvd->vdev_children);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Recurse into the children first; stop at the first failure. */
 	for (uint64_t c = 0; c < svd->vdev_children; c++) {
 		int err;
 
 		err = vdev_copy_path_strict(svd->vdev_child[c],
 		    dvd->vdev_child[c]);
 		if (err != 0)
 			return (err);
 	}
 
 	/* Only leaves carry paths worth copying. */
 	if (svd->vdev_ops->vdev_op_leaf)
 		vdev_copy_path_impl(svd, dvd);
 
 	return (0);
 }
 
 /*
  * For every concrete leaf in the destination subtree 'dvd', look for a vdev
  * with the same guid underneath the source top-level vdev 'stvd' and, if
  * one of the same type is found, copy its paths over.
  */
 static void
 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
 {
 	vdev_t *match;
 
 	ASSERT(stvd->vdev_top == stvd);
 	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
 
 	for (uint64_t c = 0; c < dvd->vdev_children; c++)
 		vdev_copy_path_search(stvd, dvd->vdev_child[c]);
 
 	if (!dvd->vdev_ops->vdev_op_leaf)
 		return;
 	if (!vdev_is_concrete(dvd))
 		return;
 
 	/*
 	 * The idea here is that while a vdev can shift positions within
 	 * a top vdev (when replacing, attaching mirror, etc.) it cannot
 	 * step outside of it.
 	 */
 	match = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
 	if (match == NULL)
 		return;
 	if (match->vdev_ops != dvd->vdev_ops)
 		return;
 
 	ASSERT(match->vdev_ops->vdev_op_leaf);
 
 	vdev_copy_path_impl(match, dvd);
 }
 
 /*
  * Recursively copy vdev paths from one root vdev to another. Source and
  * destination vdev trees may differ in geometry. For each destination leaf
  * vdev, search a vdev with the same guid and top vdev id in the source.
  * Only the top-level vdevs present in both trees are visited.
  * Intended to copy paths from userland config into MOS config.
  */
 void
 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
 {
 	ASSERT(srvd->vdev_ops == &vdev_root_ops);
 	ASSERT(drvd->vdev_ops == &vdev_root_ops);
 
 	uint64_t common = MIN(srvd->vdev_children, drvd->vdev_children);
 
 	for (uint64_t c = 0; c < common; c++)
 		vdev_copy_path_search(srvd->vdev_child[c], drvd->vdev_child[c]);
 }
 
 /*
  * Close a virtual device.
  *
  * Calls the type-specific vdev_op_close(), remembers the state the vdev
  * was in, and leaves it either OFFLINE (if marked offline) or CLOSED with
  * the aux status cleared.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	spa_t *spa __maybe_unused = vd->vdev_spa;
 
 	ASSERT(vd != NULL);
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/*
 	 * If our parent is reopening, then we are as well, unless we are
 	 * going offline.
 	 */
 	if (parent != NULL && parent->vdev_reopening)
 		vd->vdev_reopening = !vd->vdev_offline;
 
 	vd->vdev_ops->vdev_op_close(vd);
 
 	/*
 	 * We record the previous state before we close it, so that if we are
 	 * doing a reopen(), we don't generate FMA ereports if we notice that
 	 * it's still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	vd->vdev_state = vd->vdev_offline ?
 	    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Recursively invoke the optional vdev_op_hold() callback on every leaf
  * beneath 'vd'.  Nothing is done while the pool is still in
  * POOL_STATE_UNINITIALIZED.  Must only be called for the root pool.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_is_root(spa));
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
 		return;
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_hold(vd->vdev_child[i]);
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 	if (vd->vdev_ops->vdev_op_hold != NULL)
 		vd->vdev_ops->vdev_op_hold(vd);
 }
 
 /*
  * Recursively invoke the optional vdev_op_rele() callback on every leaf
  * beneath 'vd'.  Must only be called for the root pool.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	ASSERT(spa_is_root(vd->vdev_spa));
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_rele(vd->vdev_child[i]);
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 	if (vd->vdev_ops->vdev_op_rele != NULL)
 		vd->vdev_ops->vdev_op_rele(vd);
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
  * If the leaf has never been opened then open it, as usual.
  *
  * The caller must hold all SCL_STATE_ALL config locks as writer.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* set the reopening flag unless we're taking the vdev offline */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Call vdev_validate() here to make sure we have the same device.
 	 * Otherwise, a device with an invalid label could be successfully
 	 * opened in response to vdev_reopen().
 	 */
 	if (vd->vdev_aux == NULL) {
 		(void) vdev_validate(vd);
 	} else {
 		(void) vdev_validate_aux(vd);
 
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
 		    vd->vdev_aux == &spa->spa_l2cache) {
 			/*
 			 * In case the vdev is present we should evict all ARC
 			 * buffers and pointers to log blocks and reclaim their
 			 * space before restoring its contents to L2ARC.
 			 */
 			if (l2arc_vdev_present(vd))
 				l2arc_rebuild_vdev(vd, B_TRUE);
 			else
 				l2arc_add_vdev(spa, vd);
 
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	}
 
 	/*
 	 * Recheck if resilver is still needed and cancel any
 	 * scheduled resilver if resilver is unneeded.
 	 */
 	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
 	    (spa->spa_async_tasks & SPA_ASYNC_RESILVER)) {
 		mutex_enter(&spa->spa_async_lock);
 		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
 		mutex_exit(&spa->spa_async_lock);
 	}
 
 	/*
 	 * Reassess parent vdev's health.
 	 */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open a newly created vdev, load its DTLs and initialize its labels.
  *
  * 'txg' is passed through to vdev_label_init(); 'isreplacing' selects
  * VDEV_LABEL_REPLACE rather than VDEV_LABEL_CREATE.  Returns 0 on success.
  * On any failure the vdev is closed again and an errno is returned (ENXIO
  * if the open "succeeded" but the vdev is not healthy).
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	/*
 	 * Normally, partial opens (e.g. of a mirror) are allowed.
 	 * For a create, however, we want to fail the request if
 	 * there are any components we can't open.
 	 */
 	int error = vdev_open(vd);
 
 	if (error != 0 || vd->vdev_state != VDEV_STATE_HEALTHY) {
 		vdev_close(vd);
 		return (error ? error : SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Recursively load DTLs and initialize all labels.
 	 */
 	error = vdev_dtl_load(vd);
 	if (error == 0) {
 		error = vdev_label_init(vd, txg, isreplacing ?
 		    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE);
 	}
 	if (error != 0) {
 		vdev_close(vd);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Choose the metaslab shift (log2 of the metaslab size) for 'vd' based on
  * its current vdev_asize and the metaslab sizing tunables, and store it in
  * vd->vdev_ms_shift.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
 	uint64_t default_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
 	 *
 	 * The default values used below are a good balance between memory
 	 * usage (larger metaslab size means more memory needed for loaded
 	 * metaslabs; more metaslabs means more memory needed for the
 	 * metaslab_t structs), metaslab load time (larger metaslabs take
 	 * longer to load), and metaslab sync time (more metaslabs means
 	 * more time spent syncing all of them).
 	 *
 	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
 	 * The range of the dimensions are as follows:
 	 *
 	 *	2^29 <= ms_size  <= 2^34
 	 *	  16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
 	 * at least 512MB (2^29) to minimize fragmentation effects when
 	 * testing with smaller devices.  However, the count constraint
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
 	 * size of 16GB.  However, we will cap the total count to 2^17
 	 * metaslabs to keep our memory footprint in check and let the
 	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constraints is summarized below.
 	 *
 	 *   vdev size       metaslab count
 	 *  --------------|-----------------
 	 *      < 8GB        ~16
 	 *  8GB   - 100GB   one per 512MB
 	 *  100GB - 3TB     ~200
 	 *  3TB   - 2PB     one per 16GB
 	 *      > 2PB       ~131,072
 	 *  --------------------------------
 	 *
 	 *  Finally, note that all of the above calculate the initial
 	 *  number of metaslabs. Expanding a top-level vdev will result
 	 *  in additional metaslabs being allocated making it possible
 	 *  to exceed the zfs_vdev_ms_count_limit.
 	 */
 
 	/* Start from the default shift, adjusted to respect the counts. */
 	if (default_count < zfs_vdev_min_ms_count) {
 		shift = highbit64(asize / zfs_vdev_min_ms_count);
 	} else if (default_count > zfs_vdev_default_ms_count) {
 		shift = highbit64(asize / zfs_vdev_default_ms_count);
 	} else {
 		shift = zfs_vdev_default_ms_shift;
 	}
 
 	/* Then clamp the shift itself. */
 	if (shift < SPA_MAXBLOCKSHIFT) {
 		shift = SPA_MAXBLOCKSHIFT;
 	} else if (shift > zfs_vdev_max_ms_shift) {
 		shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
 		if ((asize >> shift) > zfs_vdev_ms_count_limit)
 			shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = shift;
 	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Mark the top-level vdev 'vd' dirty in 'txg'.
  *
  * 'flags' must be zero or a single VDD_* bit.  With VDD_METASLAB, 'arg' is
  * added to the vdev's metaslab txg list; with VDD_DTL, 'arg' is added to
  * its DTL txg list.  In all cases the vdev itself is added to the pool's
  * vdev txg list.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	ASSERT(vd == vd->vdev_top);
 	/* indirect vdevs don't have metaslabs or dtls */
 	ASSERT(vdev_is_concrete(vd) || flags == 0);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if ((flags & VDD_METASLAB) != 0) {
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 	}
 
 	if ((flags & VDD_DTL) != 0) {
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 	}
 
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Recursively visit every leaf beneath 'vd' and dirty the leaf's top-level
  * vdev in 'txg' with the given 'flags', passing the leaf itself as the
  * argument to vdev_dirty().
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_dirty_leaves(vd->vdev_child[i], flags, txg);
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vdev_dirty(vd->vdev_top, flags, vd, txg);
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 /*
  * Add the range described by (txg, size) to the vdev's DTL of type 't',
  * unless the range tree already reports that range as contained.  The
  * check and the insertion are both done under vdev_dtl_lock.
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	zfs_range_tree_t *dtl;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	dtl = vd->vdev_dtl[t];
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (!zfs_range_tree_contains(dtl, txg, size))
 		zfs_range_tree_add(dtl, txg, size);
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Report whether the vdev's DTL of type 't' contains the range described
  * by (txg, size).  An empty DTL never contains anything.
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	boolean_t found = B_FALSE;
 	zfs_range_tree_t *dtl;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	dtl = vd->vdev_dtl[t];
 
 	/*
 	 * While we are loading the pool, the DTLs have not been loaded yet.
 	 * This isn't a problem but it can result in devices being tried
 	 * which are known to not have the data.  In which case, the import
 	 * is relying on the checksum to ensure that we get the right data.
 	 * Note that while importing we are only reading the MOS, which is
 	 * always checksummed.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (!zfs_range_tree_is_empty(dtl)) {
 		found = zfs_range_tree_contains(dtl, txg, size);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (found);
 }
 
 /*
  * Report whether the vdev's DTL of type 't' is empty, sampling the range
  * tree under vdev_dtl_lock.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	boolean_t is_empty;
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	is_empty = zfs_range_tree_is_empty(vd->vdev_dtl[t]);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (is_empty);
 }
 
 /*
  * Default need-resilver check: the block must be resilvered when its
  * physical birth txg is covered by the vdev's DTL_PARTIAL.  DVAs whose
  * birth txg lies outside that range can always be skipped.  The 'dva' and
  * 'psize' arguments are unused here.
  */
 boolean_t
 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	(void) dva, (void) psize;
 
 	/* TXG_UNKNOWN is set by sequential resilver; always resilver it. */
 	if (phys_birth != TXG_UNKNOWN)
 		return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
 
 	return (B_TRUE);
 }
 
 /*
  * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
  * Leaf vdevs, and vdevs whose ops vector supplies no need-resilver
  * callback, always answer B_TRUE; otherwise the callback decides.
  */
 boolean_t
 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	if (!vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops->vdev_op_need_resilver != NULL) {
 		return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
 		    phys_birth));
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Returns the txg immediately preceding the lowest txg recorded in the
  * leaf's DTL_MISSING (i.e. the range tree minimum minus one).  The caller
  * must hold vdev_dtl_lock and the DTL must not be empty.
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	zfs_range_tree_t *missing = vd->vdev_dtl[DTL_MISSING];
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(zfs_range_tree_space(missing), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	return (zfs_range_tree_min(missing) - 1);
 }
 
 /*
  * Returns the range tree maximum of the leaf's DTL_MISSING.  The caller
  * must hold vdev_dtl_lock and the DTL must not be empty.
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	zfs_range_tree_t *missing = vd->vdev_dtl[DTL_MISSING];
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(zfs_range_tree_space(missing), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	return (zfs_range_tree_max(missing));
 }
 
 /*
  * Determine if a resilvering vdev should remove any DTL entries from
  * its range. If the vdev was resilvering for the entire duration of the
  * scan then it should excise that range from its DTLs. Otherwise, this
  * vdev is considered partially resilvered and should leave its DTL
  * entries intact. The comment in vdev_dtl_reassess() describes how we
  * excise the DTLs.
  *
  * 'vd' must be a leaf (it is asserted to have no children).  When
  * 'rebuild_done' is set the decision is made against the top-level vdev's
  * rebuild state; otherwise it is made against the pool's scan state.
  * Because vdev_dtl_min()/vdev_dtl_max() are called here, the caller is
  * expected to hold vdev_dtl_lock.
  *
  * Returns B_TRUE if the DTL may be excised, B_FALSE otherwise.
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
 {
 	ASSERT0(vd->vdev_children);
 
 	/* A vdev in a state below DEGRADED is never excised. */
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_FALSE);
 
 	/* Nor is one whose resilver has been deferred. */
 	if (vd->vdev_resilver_deferred)
 		return (B_FALSE);
 
 	/* Nothing is missing, so there is nothing to preserve. */
 	if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
 		return (B_TRUE);
 
 	if (rebuild_done) {
 		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 		/* Rebuild not initiated by attach */
 		if (vd->vdev_rebuild_txg == 0)
 			return (B_TRUE);
 
 		/*
 		 * When a rebuild completes without error then all missing data
 		 * up to the rebuild max txg has been reconstructed and the DTL
 		 * is eligible for excision.
 		 */
 		if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
 		    vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
 			ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
 			ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
 			ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
 			return (B_TRUE);
 		}
 	} else {
 		dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 		/* scnp is referenced only from the assertions below. */
 		dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
 
 		/* Resilver not initiated by attach */
 		if (vd->vdev_resilver_txg == 0)
 			return (B_TRUE);
 
 		/*
 		 * When a resilver is initiated the scan will assign the
 		 * scn_max_txg value to the highest txg value that exists
 		 * in all DTLs. If this device's max DTL is not part of this
 		 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg])
 		 * then it is not eligible for excision.
 		 */
 		if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
 			ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
 			ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
 			ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
 			return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion. If txg == 0 no
  * write operations will be issued to the pool.
  *
  * The subtree rooted at 'vd' is processed children first.  The root vdev,
  * non-concrete vdevs and aux vdevs are skipped after their children have
  * been visited.
  *
  * For a leaf, DTL_MISSING is optionally excised (see below), and then
  * DTL_PARTIAL and DTL_OUTAGE are regenerated from it.  For an interior
  * vdev, every DTL except DTL_SCRUB is regenerated from the children's
  * maps using a reference tree.
  *
  *	txg		txg in which to dirty the vdev; 0 means do not dirty
  *			the config or the vdev
  *	scrub_txg	if non-zero, the upper bound of the range [0, scrub_txg)
  *			that may be excised from a leaf's DTL_MISSING
  *	scrub_done	if set, each leaf's DTL_SCRUB is vacated
  *	rebuild_done	if set, the rebuild error count (rather than the scan
  *			error count) gates excision
  *	faulting	if set, leaves whose parent is a replacing vdev are
  *			treated as unreadable when computing DTL_OUTAGE
  */
 static void
 vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess_impl(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done, rebuild_done, faulting);
 
 	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		/*
 		 * NOTE(review): vr is the address of a member of the
 		 * top-level vdev, so the NULL checks on it below look
 		 * redundant -- confirm before relying on them.
 		 */
 		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
 		boolean_t check_excise = B_FALSE;
 		boolean_t wasempty = B_TRUE;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If requested, pretend the scan or rebuild completed cleanly.
 		 */
 		if (zfs_scan_ignore_errors) {
 			if (scn != NULL)
 				scn->scn_phys.scn_errors = 0;
 			if (vr != NULL)
 				vr->vr_rebuild_phys.vrp_errors = 0;
 		}
 
 		/* Log the pre-excision state of a non-empty DTL_MISSING. */
 		if (scrub_txg != 0 &&
 		    !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
 			wasempty = B_FALSE;
 			zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
 			    "dtl:%llu/%llu errors:%llu",
 			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
 			    (u_longlong_t)scrub_txg, spa->spa_scrub_started,
 			    (u_longlong_t)vdev_dtl_min(vd),
 			    (u_longlong_t)vdev_dtl_max(vd),
 			    (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
 		}
 
 		/*
 		 * If we've completed a scrub/resilver or a rebuild cleanly
 		 * then determine if this vdev should remove any DTLs. We
 		 * only want to excise regions on vdevs that were available
 		 * during the entire duration of this scan.
 		 */
 		if (rebuild_done &&
 		    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
 			check_excise = B_TRUE;
 		} else {
 			if (spa->spa_scrub_started ||
 			    (scn != NULL && scn->scn_phys.scn_errors == 0)) {
 				check_excise = B_TRUE;
 			}
 		}
 
 		if (scrub_txg && check_excise &&
 		    vdev_dtl_should_excise(vd, rebuild_done)) {
 			/*
 			 * We completed a scrub, resilver or rebuild up to
 			 * scrub_txg.  If we did it without rebooting, then
 			 * the scrub dtl will be valid, so excise the old
 			 * region and fold in the scrub dtl.  Otherwise,
 			 * leave the dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the beginning
 			 * of the DTL_MISSING map, we put it into a reference
 			 * tree and then add a segment with refcnt -1 that
 			 * covers the range [0, scrub_txg).  This means
 			 * that each txg in that range has refcnt -1 or 0.
 			 * We then add DTL_SCRUB with a refcnt of 2, so that
 			 * entries in the range [0, scrub_txg) will have a
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 
 			if (!zfs_range_tree_is_empty(
 			    vd->vdev_dtl[DTL_MISSING])) {
 				zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
 				    (u_longlong_t)vdev_dtl_min(vd),
 				    (u_longlong_t)vdev_dtl_max(vd));
 			} else if (!wasempty) {
 				zfs_dbgmsg("DTL_MISSING is now empty");
 			}
 		}
 		/* For a leaf, DTL_PARTIAL is rebuilt as a copy of DTL_MISSING. */
 		zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL,
 			    NULL);
 		zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 
 		/*
 		 * For the faulting case, treat members of a replacing vdev
 		 * as if they are not available. It's more likely than not that
 		 * a vdev in a replacing vdev could encounter read errors so
 		 * treat it as not being able to contribute.
 		 */
 		if (!vdev_readable(vd) ||
 		    (faulting && vd->vdev_parent != NULL &&
 		    vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) {
 			/* Unreadable: the outage covers every txg. */
 			zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		} else {
 			zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 		}
 
 		/*
 		 * If the vdev was resilvering or rebuilding and no longer
 		 * has any DTLs then reset the appropriate flag and dirty
 		 * the top level so that we persist the change.
 		 */
 		if (txg != 0 &&
 		    zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
 			if (vd->vdev_rebuild_txg != 0) {
 				vd->vdev_rebuild_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			} else if (vd->vdev_resilver_txg != 0) {
 				vd->vdev_resilver_txg = 0;
 				vdev_config_dirty(vd->vdev_top);
 			}
 		}
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 	} else {
 		/*
 		 * Interior vdev: the parent's lock is held across the whole
 		 * loop, and each child's lock is taken only while its map is
 		 * added to the reference tree.
 		 */
 		mutex_enter(&vd->vdev_dtl_lock);
 		for (int t = 0; t < DTL_TYPES; t++) {
 			/* account for child's outage in parent's missing map */
 			int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 			if (t == DTL_SCRUB) {
 				/* leaf vdevs only */
 				continue;
 			}
 			/*
 			 * minref is the number of children that must list a
 			 * txg for the parent to list it as well.
 			 */
 			if (t == DTL_PARTIAL) {
 				/* i.e. non-zero */
 				minref = 1;
 			} else if (vdev_get_nparity(vd) != 0) {
 				/* RAIDZ, DRAID */
 				minref = vdev_get_nparity(vd) + 1;
 			} else {
 				/* any kind of mirror */
 				minref = vd->vdev_children;
 			}
 			space_reftree_create(&reftree);
 			for (int c = 0; c < vd->vdev_children; c++) {
 				vdev_t *cvd = vd->vdev_child[c];
 				mutex_enter(&cvd->vdev_dtl_lock);
 				space_reftree_add_map(&reftree,
 				    cvd->vdev_dtl[s], 1);
 				mutex_exit(&cvd->vdev_dtl_lock);
 			}
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[t], minref);
 			space_reftree_destroy(&reftree);
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	/* Notify the raidz code for every vdev under a raidz top-level. */
 	if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
 		raidz_dtl_reassessed(vd);
 	}
 }
 
 /*
  * Reassess the DTLs of the subtree rooted at 'vd'.  This is
  * vdev_dtl_reassess_impl() with 'faulting' fixed to B_FALSE; see that
  * function for the meaning of the remaining arguments.
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     boolean_t scrub_done, boolean_t rebuild_done)
 {
 	/*
 	 * Call rather than "return (call)": ISO C does not permit a return
 	 * statement with an expression in a function returning void.
 	 */
 	vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done,
 	    rebuild_done, B_FALSE);
 }
 
 /*
  * Post a kobj event for 'vd' and every vdev beneath it.  A vdev is only
  * posted if its ops vector supplies the callback and its kobj flag is
  * not already set; the flag is set before the callback runs.
  * NOTE(review): the previous comment said spares are excluded; nothing in
  * this function filters them, so that presumably depends on the caller.
  */
 void
 vdev_post_kobj_evt(vdev_t *vd)
 {
 	uint64_t child;
 
 	if (vd->vdev_ops->vdev_op_kobj_evt_post != NULL &&
 	    !vd->vdev_kobj_flag) {
 		vd->vdev_kobj_flag = B_TRUE;
 		vd->vdev_ops->vdev_op_kobj_evt_post(vd);
 	}
 
 	for (child = 0; child < vd->vdev_children; child++) {
 		vdev_post_kobj_evt(vd->vdev_child[child]);
 	}
 }
 
 /*
  * Clear the kobj flag on 'vd' and on every vdev beneath it.
  * NOTE(review): the previous comment said spares are excluded; nothing in
  * this function filters them, so that presumably depends on the caller.
  */
 void
 vdev_clear_kobj_evt(vdev_t *vd)
 {
 	uint64_t child;
 
 	vd->vdev_kobj_flag = B_FALSE;
 
 	for (child = 0; child < vd->vdev_children; child++) {
 		vdev_clear_kobj_evt(vd->vdev_child[child]);
 	}
 }
 
 /*
  * Load the DTL of every leaf beneath 'vd' that has a DTL object.
  *
  * For such a leaf the DTL space map is opened and loaded into a temporary
  * range tree, whose segments are then added to the leaf's DTL_MISSING.
  * Any other vdev simply recurses into its children, stopping at the first
  * failure.  Returns 0 on success or the first error encountered.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	zfs_range_tree_t *loaded;
 	int error = 0;
 
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (int c = 0; c < vd->vdev_children && error == 0; c++)
 			error = vdev_dtl_load(vd->vdev_child[c]);
 
 		return (error);
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * If the dtl cannot be sync'd there is no need to open it.
 	 */
 	if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
 		return (0);
 
 	error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object,
 	    0, -1ULL, 0);
 	if (error != 0)
 		return (error);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	loaded = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 	error = space_map_load(vd->vdev_dtl_sm, loaded, SM_ALLOC);
 	if (error == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		zfs_range_tree_walk(loaded, zfs_range_tree_add,
 		    vd->vdev_dtl[DTL_MISSING]);
 		mutex_exit(&vd->vdev_dtl_lock);
 	}
 
 	/* The temporary tree is emptied and destroyed on every path. */
 	zfs_range_tree_vacate(loaded, NULL, NULL);
 	zfs_range_tree_destroy(loaded);
 
 	return (error);
 }
 
 /*
  * Record the vdev's allocation bias as a string in its top-level ZAP.
  * The bias must not be VDEV_BIAS_NONE.  For the special and dedup biases
  * the pool's allocation classes are activated as well.
  */
 static void
 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
 	const char *string;
 
 	ASSERT(alloc_bias != VDEV_BIAS_NONE);
 
 	switch (alloc_bias) {
 	case VDEV_BIAS_LOG:
 		string = VDEV_ALLOC_BIAS_LOG;
 		break;
 	case VDEV_BIAS_SPECIAL:
 		string = VDEV_ALLOC_BIAS_SPECIAL;
 		break;
 	case VDEV_BIAS_DEDUP:
 		string = VDEV_ALLOC_BIAS_DEDUP;
 		break;
 	default:
 		string = NULL;
 		break;
 	}
 
 	ASSERT(string != NULL);
 	VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
 	    1, strlen(string) + 1, string, tx));
 
 	switch (alloc_bias) {
 	case VDEV_BIAS_SPECIAL:
 	case VDEV_BIAS_DEDUP:
 		spa_activate_allocation_classes(spa, tx);
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Destroy the ZAP object 'zapobj' in the MOS and then remove it from the
  * pool's all-vdev-zaps object.  Both steps must succeed.
  */
 void
 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 
 	VERIFY0(zap_destroy(mos, zapobj, tx));
 	VERIFY0(zap_remove_int(mos, spa->spa_all_vdev_zaps, zapobj, tx));
 }
 
 /*
  * Create a new ZAP object in the MOS, add it to the pool's all-vdev-zaps
  * object, and return its object number.
  */
 uint64_t
 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t new_zap;
 
 	new_zap = zap_create(mos, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 	ASSERT(new_zap != 0);
 
 	VERIFY0(zap_add_int(mos, spa->spa_all_vdev_zaps, new_zap, tx));
 
 	return (new_zap);
 }
 
 /*
  * Create any per-vdev ZAPs that 'vd' and its descendants are missing.
  *
  * Hole, missing and root vdevs, and vdevs whose top-level is being
  * removed, get neither a leaf nor a top-level ZAP.  Otherwise a leaf
  * without a leaf ZAP gets one, and a top-level vdev without a top ZAP
  * gets one (plus its allocation bias entry when a bias is set).  The root
  * vdev gets a root ZAP when the AVZ_V2 feature is enabled, activating the
  * feature first if necessary.
  */
 void
 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	boolean_t is_root = (vd->vdev_ops == &vdev_root_ops);
 	/* vdev_top is only examined once the ops checks have all failed. */
 	boolean_t no_zaps = (vd->vdev_ops == &vdev_hole_ops ||
 	    vd->vdev_ops == &vdev_missing_ops || is_root ||
 	    vd->vdev_top->vdev_removing);
 
 	if (!no_zaps) {
 		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0)
 			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
 
 		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
 			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
 			if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
 				vdev_zap_allocation_data(vd, tx);
 		}
 	}
 
 	if (is_root && vd->vdev_root_zap == 0 &&
 	    spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
 		if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
 			spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
 		vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_construct_zaps(vd->vdev_child[c], tx);
 }
 
 /*
  * Write a leaf vdev's DTL_MISSING out to its DTL space map in 'txg'.
  *
  * If the vdev has been detached, or its top-level vdev is being removed,
  * the space map is freed instead (and, for detached leaves and log
  * devices, the leaf ZAP is destroyed).  Otherwise the space map is created
  * if it does not exist yet, truncated, and rewritten from a snapshot of
  * DTL_MISSING taken under vdev_dtl_lock.  If the space map object number
  * changed as a result, the top-level vdev's config is dirtied.
  */
 static void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	zfs_range_tree_t *rtsync;
 	dmu_tx_t *tx;
 	/* Object number on entry; compared against the final one below. */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/*
 		 * We only destroy the leaf ZAP for detached leaves or for
 		 * removed log devices. Removed data devices handle leaf ZAP
 		 * cleanup later, once cancellation is no longer possible.
 		 */
 		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
 		    vd->vdev_top->vdev_islog)) {
 			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
 			vd->vdev_leaf_zap = 0;
 		}
 
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	/* First sync for this leaf: allocate and open a new space map. */
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0);
 
 	/* Copy DTL_MISSING under the lock; the write happens without it. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
 	zfs_range_tree_vacate(rtsync, NULL, NULL);
 
 	zfs_range_tree_destroy(rtsync);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
 		    (u_longlong_t)object,
 		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Determine whether the specified vdev can be
  * - offlined
  * - detached
  * - removed
  * - faulted
  * without losing data.
  *
  * Returns B_TRUE when the vdev is required (it cannot be taken away), and
  * B_FALSE otherwise.  The root vdev and top-level vdevs are always
  * required.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 	boolean_t faulting = (vd->vdev_state == VDEV_STATE_FAULTED);
 	boolean_t required;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == spa->spa_root_vdev || vd == top)
 		return (B_TRUE);
 
 	/*
 	 * Temporarily mark the device as unreadable, and then determine
 	 * whether this results in any DTL outages in the top-level vdev.
 	 * If not, we can safely offline/detach/remove the device.
 	 */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess_impl(top, 0, 0, B_FALSE, B_FALSE, faulting);
 	required = !vdev_dtl_empty(top, DTL_OUTAGE);
 
 	/* Put the original readability back and recompute the DTLs. */
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess_impl(top, 0, 0, B_FALSE, B_FALSE, faulting);
 
 	if (required || !zio_injection_enabled)
 		return (required);
 
 	/* Fault injection may still declare the device required. */
 	return (!!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)));
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  *
  * A leaf needs resilvering when its DTL_MISSING is non-empty and the leaf
  * is writeable; an interior vdev needs it when any child does, and its
  * range spans all such children.
  *
  * On a B_TRUE return, *minp and *maxp (each only if non-NULL) receive the
  * lower and upper txg bounds as reported by vdev_dtl_min() and
  * vdev_dtl_max().  On a B_FALSE return neither is written.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	/*
 	 * Check each out-parameter on its own so that a caller interested
 	 * in only one bound cannot trigger a NULL dereference.
 	 */
 	if (needed) {
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Gets the checkpoint space map object from the vdev's ZAP.  On success sm_obj
  * will contain either the checkpoint spacemap object or zero if none exists
  * (no top-level ZAP, or no checkpoint entry in it).  All other errors are
  * returned to the caller.
  */
 int
 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
 {
 	int err;
 
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		err = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, sm_obj);
 		if (err != ENOENT)
 			return (err);
 	}
 
 	/* Either there is no ZAP or it has no checkpoint entry. */
 	*sm_obj = 0;
 	return (0);
 }
 
 /*
  * Load the persistent state of 'vd' and of every vdev beneath it.
  *
  * Children are loaded first (through a taskq when 'vd' is the root vdev);
  * the first child error found is returned.  Then, as applicable to this
  * vdev: the deflate ratio is set, raidz state is loaded, the allocation
  * bias, failfast property and rebuild state are read from the top-level
  * ZAP, the checksum/io/slow-io threshold properties are read, metaslabs
  * and the checkpoint space map are set up for a concrete top-level vdev,
  * the DTL is loaded for a leaf, and the obsolete space map is opened.
  *
  * Returns 0 on success or an error number.  Most (not all) failure paths
  * also put the vdev into VDEV_STATE_CANT_OPEN with VDEV_AUX_CORRUPT_DATA.
  */
 int
 vdev_load(vdev_t *vd)
 {
 	int children = vd->vdev_children;
 	int error = 0;
 	taskq_t *tq = NULL;
 
 	/*
 	 * It's only worthwhile to use the taskq for the root vdev, because the
 	 * slow part is metaslab_init, and that only happens for top-level
 	 * vdevs.
 	 */
 	if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
 		tq = taskq_create("vdev_load", children, minclsyspri,
 		    children, children, TASKQ_PREPOPULATE);
 	}
 
 	/*
 	 * Recursively load all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		/* Children that use zvols are always loaded synchronously. */
 		if (tq == NULL || vdev_uses_zvols(cvd)) {
 			cvd->vdev_load_error = vdev_load(cvd);
 		} else {
 			/*
 			 * vdev_load_child presumably records its result in
 			 * cvd->vdev_load_error -- it is defined elsewhere.
 			 */
 			VERIFY(taskq_dispatch(tq, vdev_load_child,
 			    cvd, TQ_SLEEP) != TASKQID_INVALID);
 		}
 	}
 
 	if (tq != NULL) {
 		taskq_wait(tq);
 		taskq_destroy(tq);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		/* NOTE(review): shadows the function-scope 'error'. */
 		int error = vd->vdev_child[c]->vdev_load_error;
 
 		if (error != 0)
 			return (error);
 	}
 
 	vdev_set_deflate_ratio(vd);
 
 	if (vd->vdev_ops == &vdev_raidz_ops) {
 		error = vdev_raidz_load(vd);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * On spa_load path, grab the allocation bias from our zap
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		char bias_str[64];
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
 		    bias_str);
 		if (error == 0) {
 			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
 			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
 		} else if (error != ENOENT) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
 			    "failed [error=%d]",
 			    (u_longlong_t)vd->vdev_top_zap, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Read the failfast property.  A missing entry selects the property
 	 * default; any other lookup error is only logged, not returned.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		uint64_t failfast;
 
 		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
 		    1, &failfast);
 		if (error == 0) {
 			vd->vdev_failfast = failfast & 1;
 		} else if (error == ENOENT) {
 			vd->vdev_failfast = vdev_prop_default_numeric(
 			    VDEV_PROP_FAILFAST);
 		} else {
 			vdev_dbgmsg(vd,
 			    "vdev_load: zap_lookup(top_zap=%llu) "
 			    "failed [error=%d]",
 			    (u_longlong_t)vd->vdev_top_zap, error);
 		}
 	}
 
 	/*
 	 * Load any rebuild state from the top-level vdev zap.
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		error = vdev_rebuild_load(vd);
 		if (error && error != ENOTSUP) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
 			    "failed [error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Read the error-threshold properties.  Failures other than ENOENT
 	 * are logged but never returned; zapobj is used only in those log
 	 * messages.
 	 */
 	if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
 		uint64_t zapobj;
 
 		if (vd->vdev_top_zap != 0)
 			zapobj = vd->vdev_top_zap;
 		else
 			zapobj = vd->vdev_leaf_zap;
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
 		    &vd->vdev_checksum_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
 		    &vd->vdev_checksum_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
 		    &vd->vdev_io_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
 		    &vd->vdev_io_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
 		    &vd->vdev_slow_io_n);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 
 		error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
 		    &vd->vdev_slow_io_t);
 		if (error && error != ENOENT)
 			vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
 			    "failed [error=%d]", (u_longlong_t)zapobj, error);
 	}
 
 	/*
 	 * If this is a top-level vdev, initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
 		vdev_metaslab_group_create(vd);
 
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
 			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
 			    (u_longlong_t)vd->vdev_asize);
 			return (SET_ERROR(ENXIO));
 		}
 
 		error = vdev_metaslab_init(vd, 0);
 		if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
 			    "[error=%d]", error);
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			return (error);
 		}
 
 		uint64_t checkpoint_sm_obj;
 		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
 		if (error == 0 && checkpoint_sm_obj != 0) {
 			objset_t *mos = spa_meta_objset(vd->vdev_spa);
 			ASSERT(vd->vdev_asize != 0);
 			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
 
 			error = space_map_open(&vd->vdev_checkpoint_sm,
 			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
 			    vd->vdev_ashift);
 			if (error != 0) {
 				/* The vdev state is left untouched here. */
 				vdev_dbgmsg(vd, "vdev_load: space_map_open "
 				    "failed for checkpoint spacemap (obj %llu) "
 				    "[error=%d]",
 				    (u_longlong_t)checkpoint_sm_obj, error);
 				return (error);
 			}
 			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 			/*
 			 * Since the checkpoint_sm contains free entries
 			 * exclusively we can use space_map_allocated() to
 			 * indicate the cumulative checkpointed space that
 			 * has been freed.
 			 */
 			vd->vdev_stat.vs_checkpoint_space =
 			    -space_map_allocated(vd->vdev_checkpoint_sm);
 			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
 			    vd->vdev_stat.vs_checkpoint_space;
 		} else if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
 			    "checkpoint space map object from vdev ZAP "
 			    "[error=%d]", error);
 			return (error);
 		}
 	}
 
 	/*
 	 * If this is a leaf vdev, load its DTL.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
 		    "[error=%d]", error);
 		return (error);
 	}
 
 	/* Open the obsolete space map, if the vdev has one. */
 	uint64_t obsolete_sm_object;
 	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
 	if (error == 0 && obsolete_sm_object != 0) {
 		objset_t *mos = vd->vdev_spa->spa_meta_objset;
 		ASSERT(vd->vdev_asize != 0);
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 
 		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
 		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
 			    "obsolete spacemap (obj %llu) [error=%d]",
 			    (u_longlong_t)obsolete_sm_object, error);
 			return (error);
 		}
 	} else if (error != 0) {
 		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
 		    "space map object from vdev ZAP [error=%d]", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * The special vdev case is used for hot spares and l2cache devices.  Its
  * sole purpose is to set the vdev state for the associated vdev.  To do this,
  * we make sure that we can open the underlying device, then try to read the
  * label, and make sure that the label is sane and that it hasn't been
  * repurposed to another pool.
  *
  * Returns 0 if the vdev is not readable or its label checks out, and -1
  * (after marking the vdev CANT_OPEN / CORRUPT_DATA) otherwise.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	nvlist_t *label;
 	uint64_t guid, version;
 	uint64_t state;
 
 	if (!vdev_readable(vd))
 		return (0);
 
 	label = vdev_label_read_config(vd, -1ULL);
 	if (label == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(version) &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0 &&
 	    guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) == 0) {
 		/*
 		 * We don't actually check the pool state here.  If it's in
 		 * fact in use by another pool, we update this fact on the fly
 		 * when requested.
 		 */
 		nvlist_free(label);
 		return (0);
 	}
 
 	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 	    VDEV_AUX_CORRUPT_DATA);
 	nvlist_free(label);
 	return (-1);
 }
 
 /*
  * Free the object referenced by the VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS
  * entry of the vdev's top-level ZAP, and remove that entry.  Nothing is
  * done if the vdev has no top-level ZAP or the entry does not exist.
  */
 static void
 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos;
 	uint64_t flush_obj = 0;
 	int error;
 
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	mos = spa_meta_objset(vd->vdev_spa);
 	error = zap_lookup(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
 	    &flush_obj);
 
 	if (error != ENOENT) {
 		VERIFY0(error);
 
 		VERIFY0(dmu_object_free(mos, flush_obj, tx));
 		VERIFY0(zap_remove(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
 	}
 }
 
 /*
  * Free the objects used to store this vdev's spacemaps, and the array
  * that points to them.  The metaslab flush data is destroyed as well and
  * vdev_ms_array is reset to zero.  Nothing is done if the vdev has no
  * metaslab array.
  */
 void
 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos;
 	uint64_t *sm_objs;
 	uint64_t ms_count;
 	size_t nbytes;
 
 	if (vd->vdev_ms_array == 0)
 		return;
 
 	mos = vd->vdev_spa->spa_meta_objset;
 	ms_count = vd->vdev_asize >> vd->vdev_ms_shift;
 	nbytes = ms_count * sizeof (uint64_t);
 
 	/* Pull the whole array of space map object numbers into memory. */
 	sm_objs = kmem_alloc(nbytes, KM_SLEEP);
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, nbytes, sm_objs, 0));
 
 	for (uint64_t m = 0; m < ms_count; m++) {
 		/* A zero entry means no space map object to free. */
 		if (sm_objs[m] != 0)
 			space_map_free_obj(mos, sm_objs[m], tx);
 	}
 
 	kmem_free(sm_objs, nbytes);
 	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
 	vdev_destroy_ms_flush_data(vd, tx);
 	vd->vdev_ms_array = 0;
 }
 
 /*
  * Destroy the metadata kept for a log top-level vdev: its space maps and,
  * when present, its top-level ZAP (via vdev_destroy_unlink_zap()).  The work
  * is done in a transaction assigned to 'txg', which is asserted to be the
  * currently syncing txg.
  */
 static void
 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	vdev_destroy_spacemaps(vd, tx);
 	if (vd->vdev_top_zap != 0) {
 		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
 		/* Forget the ZAP so it is not looked up again. */
 		vd->vdev_top_zap = 0;
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Drain the TXG_CLEAN(txg) slot of this vdev's metaslab list, calling
  * metaslab_sync_done() on each entry.  If that slot held anything, the
  * vdev's metaslab group (and its embedded log group, when there is one)
  * is reassessed afterwards.
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	/* Must be sampled before the list is emptied below. */
 	boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
 	ASSERT(vdev_is_concrete(vd));
 
 	for (;;) {
 		metaslab_t *ms = txg_list_remove(&vd->vdev_ms_list,
 		    TXG_CLEAN(txg));
 		if (ms == NULL)
 			break;
 		metaslab_sync_done(ms, txg);
 	}
 
 	if (!reassess)
 		return;
 
 	metaslab_sync_reassess(vd->vdev_mg);
 	if (vd->vdev_log_mg != NULL)
 		metaslab_sync_reassess(vd->vdev_log_mg);
 }
 
 /*
  * Sync this vdev's dirty state for 'txg', which is asserted to be the
  * currently syncing txg.  In order: obsolete segments are synced, the
  * metaslab array object is allocated if it is still missing, every dirty
  * metaslab and DTL on the txg lists is synced, and an empty log device that
  * is being removed has its metadata destroyed.  The vdev is then queued on
  * the spa's vdev list for TXG_CLEAN(txg).
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *lvd;
 	metaslab_t *msp;
 
 	ASSERT3U(txg, ==, spa->spa_syncing_txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 
 		vdev_indirect_sync_obsolete(vd, tx);
 
 		/*
 		 * If the vdev is indirect, it can't have dirty
 		 * metaslabs or DTLs.
 		 */
 		if (vd->vdev_ops == &vdev_indirect_ops) {
 			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
 			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
 			dmu_tx_commit(tx);
 			return;
 		}
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * Allocate the metaslab array object the first time through, unless
 	 * the vdev has no metaslab shift yet or is being removed.
 	 */
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
 	    !vd->vdev_removing) {
 		ASSERT(vd == vd->vdev_top);
 		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 	}
 
 	/*
 	 * Each synced metaslab is moved to the TXG_CLEAN(txg) slot, which is
 	 * the slot vdev_sync_done() drains.
 	 */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
 		metaslab_sync(msp, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
 	}
 
 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
 		vdev_dtl_sync(lvd, txg);
 
 	/*
 	 * If this is an empty log device being removed, destroy the
 	 * metadata associated with it.
 	 */
 	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove_empty_log(vd, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 	dmu_tx_commit(tx);
 }
+/*
+ * Ask the vdev type's vdev_op_asize_to_psize callback for the psize that
+ * corresponds to 'asize' in the given TXG.
+ * NOTE(review): presumably the counterpart of vdev_psize_to_asize_txg();
+ * confirm against the per-type implementations.
+ */
+uint64_t
+vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg));
+}
 
 /*
  * Return the amount of space that should be (or was) allocated for the given
  * psize (compressed block size) in the given TXG. Note that for expanded
  * RAIDZ vdevs, the size allocated for older BP's may be larger. See
- * vdev_raidz_asize().
+ * vdev_raidz_psize_to_asize().
+ *
+ * The computation itself is delegated to the vdev type's
+ * vdev_op_psize_to_asize callback.
  */
 uint64_t
 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
-	return (vd->vdev_ops->vdev_op_asize(vd, psize, txg));
+	return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg));
 }
 
 /*
  * Convenience form of vdev_psize_to_asize_txg() that passes 0 as the txg.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize = vdev_psize_to_asize_txg(vd, psize, 0);
 
 	return (asize);
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
  * not be opened, and no I/O is attempted.
  *
  * 'guid' selects the vdev and 'aux' records why it is being faulted.  The
  * return value is whatever spa_vdev_state_exit() returns; ENODEV is passed
  * to it when no vdev has that guid and ENOTSUP when the vdev is not a leaf.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd, *tvd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 
 	/*
 	 * If user did a 'zpool offline -f' then make the fault persist across
 	 * reboots.
 	 */
 	if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
 		/*
 		 * There are two kinds of forced faults: temporary and
 		 * persistent.  Temporary faults go away at pool import, while
 		 * persistent faults stay set.  Both types of faults can be
 		 * cleared with a zpool clear.
 		 *
 		 * We tell if a vdev is persistently faulted by looking at the
 		 * ZPOOL_CONFIG_AUX_STATE nvpair.  If it's set to "external" at
 		 * import then it's a persistent fault.  Otherwise, it's
 		 * temporary.  We get ZPOOL_CONFIG_AUX_STATE set to "external"
 		 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL.  This
 		 * tells vdev_config_generate() (which gets run later) to set
 		 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
 		 */
 		vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
 		vd->vdev_tmpoffline = B_FALSE;
 		aux = VDEV_AUX_EXTERNAL;
 	} else {
 		vd->vdev_tmpoffline = B_TRUE;
 	}
 
 	/*
 	 * We don't directly use the aux state here, but if we do a
 	 * vdev_reopen(), we need this value to be present to remember why we
 	 * were faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/*
 	 * Faulted state takes precedence over degraded.
 	 */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * If this device has the only valid copy of the data, then
 	 * back off and simply mark the vdev as degraded instead.
 	 */
 	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
 		/*
 		 * If we reopen the device and it's not dead, only then do we
 		 * mark it degraded.
 		 */
 		vdev_reopen(tvd);
 
 		if (vdev_readable(vd))
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Flag the given leaf vdev as degraded.  Degraded is only a signal to the
  * user that something is wrong; as far as I/O is concerned the vdev keeps
  * operating normally.  A vdev that is already faulted or already degraded
  * is left as it is.
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/* Nothing to change if the vdev is already faulted or degraded. */
 	if (vd->vdev_faulted || vd->vdev_degraded)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	vd->vdev_degraded = 1ULL;
 
 	/* The visible state is only updated for a vdev that is not dead. */
 	if (!vdev_is_dead(vd)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Request asynchronous removal of the vdev identified by 'guid'.  The
  * request is dropped when the vdev is already removed or is expanding, and
  * it is refused with EEXIST when a probe of a leaf vdev succeeds.
  */
 int
 vdev_remove_wanted(spa_t *spa, uint64_t guid)
 {
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (vd == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/*
 	 * An expanding vdev can trigger repartition add/remove events, so
 	 * it is ignored here just like a vdev that is already removed.
 	 */
 	if (vd->vdev_removed || vd->vdev_expanding)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	/*
 	 * Only leaves are probed.  A probe that completes without error
 	 * means the device is still there, so the request is rejected.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		int probe_err = zio_wait(vdev_probe(vd, NULL));
 
 		if (probe_err == 0) {
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EEXIST)));
 		}
 	}
 
 	vd->vdev_remove_wanted = B_TRUE;
 	spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  *
  * If 'newstate' is not NULL it receives the vdev's state as seen right
  * after the top-level vdev has been reopened.  The return value is whatever
  * spa_vdev_state_exit() returns; ENODEV is passed to it when no vdev has
  * the given guid, and ENOTSUP when expansion is requested (by flag or by
  * autoexpand) on an auxiliary vdev.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 	boolean_t wasoffline;
 	vdev_state_t oldstate;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	/* Remember the prior condition; it decides the notification below. */
 	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
 	oldstate = vd->vdev_state;
 
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
 			    spa->spa_autoexpand);
 		vd->vdev_expansion_time = gethrestime_sec();
 	}
 
 	vdev_reopen(tvd);
 	/* These two flags only apply to the reopen just performed. */
 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
 
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate)
 		*newstate = vd->vdev_state;
 	if ((flags & ZFS_ONLINE_UNSPARE) &&
 	    !vdev_is_dead(vd) && vd->vdev_parent &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
 
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux) {
 			/*
 			 * Wrapped in SET_ERROR() like every other error
 			 * return in this function.
 			 */
 			return (spa_vdev_state_exit(spa, vd,
 			    SET_ERROR(ENOTSUP)));
 		}
 		spa->spa_ccw_fail_time = 0;
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	/* Restart initializing if necessary */
 	mutex_enter(&vd->vdev_initialize_lock);
 	if (vdev_writeable(vd) &&
 	    vd->vdev_initialize_thread == NULL &&
 	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
 		(void) vdev_initialize(vd);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	/*
 	 * Restart trimming if necessary. We do not restart trimming for cache
 	 * devices here. This is triggered by l2arc_rebuild_vdev()
 	 * asynchronously for the whole device or in l2arc_evict() as it evicts
 	 * space for upcoming writes.
 	 */
 	mutex_enter(&vd->vdev_trim_lock);
 	if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
 	    vd->vdev_trim_thread == NULL &&
 	    vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
 		(void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
 		    vd->vdev_trim_secure);
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	/*
 	 * Notify when the vdev was offline before, or when its state moved
 	 * from below DEGRADED to DEGRADED or better.
 	 */
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED)) {
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
 		/*
 		 * Asynchronously detach spare vdev if resilver or
 		 * rebuild is not required
 		 */
 		if (vd->vdev_unspare &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
 		    !vdev_rebuild_active(tvd))
 			spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
 	}
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Take the leaf vdev identified by 'guid' offline.  Per the only caller
  * visible here, vdev_offline(), this runs with spa_vdev_top_lock held.
  *
  * The request is refused with EBUSY when the device holds the only valid
  * copy of some data, or when offlining it would leave a non-log top-level
  * vdev unusable.  For a log top-level vdev with a metaslab group, the group
  * is passivated and the logs are reset first; if the config generation
  * changed while the state lock was dropped for that, the whole operation
  * restarts from 'top'.
  *
  * 'flags' may carry ZFS_OFFLINE_TEMPORARY, which is recorded in
  * vdev_tmpoffline.  The return value is whatever spa_vdev_state_exit()
  * returns.
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	/*
 	 * dRAID spares are rejected.  The error is wrapped in SET_ERROR()
 	 * like the other error returns in this function.
 	 */
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/* The generation expected after the one state exit done below. */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			error = spa_reset_logs(spa);
 
 			/*
 			 * If the log device was successfully reset but has
 			 * checkpointed data, do not offline it.
 			 */
 			if (error == 0 &&
 			    tvd->vdev_checkpoint_sm != NULL) {
 				ASSERT3U(space_map_allocated(
 				    tvd->vdev_checkpoint_sm), !=, 0);
 				error = ZFS_ERR_CHECKPOINT_EXISTS;
 			}
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL,
 			    SET_ERROR(EBUSY)));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Offline the vdev identified by 'guid'.  This is vdev_offline_locked()
  * run with spa_vdev_top_lock held; its result is returned unchanged.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	mutex_enter(&spa->spa_vdev_top_lock);
 	int rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  *
  * Besides zeroing the counters, a vdev that is faulted, degraded, unreadable
  * or unwriteable has its persistent fault state reset and is reopened.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 	vd->vdev_stat.vs_dio_verify_errors = 0;
 	vd->vdev_stat.vs_slow_ios = 0;
 
 	/* Children are cleared before this vdev's own state is handled. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * It makes no sense to "clear" an indirect or removed vdev.
 	 */
 	if (!vdev_is_concrete(vd) || vd->vdev_removed)
 		return;
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 		/*
 		 * When reopening in response to a clear event, it may be due to
 		 * a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 		vd->vdev_stat.vs_aux = 0;
 
 		/* The root is reopened as itself, anything else via its top. */
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* If a resilver isn't required, check if vdevs can be culled */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
 		    !dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	/* Clear recent error events cache (i.e. duplicate events tracking) */
 	zfs_ereport_clear(spa, vd);
 }
 
 /*
  * Report whether a vdev counts as "dead": its state is below
  * VDEV_STATE_DEGRADED, or it is a hole or a missing device.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_TRUE);
 
 	/*
 	 * Holes and missing devices are always treated as dead, so the
 	 * various code paths need no special checks for them; they rely
 	 * on dead devices being skipped before any I/O is issued.
 	 */
 	if (vd->vdev_ops == &vdev_hole_ops)
 		return (B_TRUE);
 	if (vd->vdev_ops == &vdev_missing_ops)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * A vdev is readable when it is not dead and vdev_cant_read is clear.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_read);
 }
 
 /*
  * A vdev is writeable when it is not dead, vdev_cant_write is clear, and
  * it is a concrete vdev.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd) || vd->vdev_cant_write)
 		return (B_FALSE);
 
 	return (vdev_is_concrete(vd) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether new allocations may be placed on this vdev.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	/*
 	 * The state is copied into a local because, although it changes
 	 * atomically, two separate questions are asked about it below.
 	 */
 	uint64_t snap = vd->vdev_state;
 
 	/*
 	 * Allocations are currently allowed from vdevs that may be in the
 	 * middle of reopening (i.e. VDEV_STATE_CLOSED).  Should the reopen
 	 * fail, that is caught later while the proper locks are held.
 	 */
 	boolean_t state_ok = (snap >= VDEV_STATE_DEGRADED ||
 	    snap == VDEV_STATE_CLOSED);
 
 	return (state_ok && !vd->vdev_cant_write && vdev_is_concrete(vd) &&
 	    vd->vdev_mg->mg_initialized);
 }
 
 /*
  * Report whether 'zio' may be issued to 'vd'.  Dead vdevs and vdevs with a
  * pending removal request are never accessible.  Otherwise reads depend on
  * vdev_cant_read, writes on vdev_cant_write, and every other I/O type is
  * allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		return (!vd->vdev_cant_read);
 	case ZIO_TYPE_WRITE:
 		return (!vd->vdev_cant_write);
 	default:
 		return (B_TRUE);
 	}
 }
 
 /*
  * Add child 'cvd's op and byte counters (held in 'cvs') onto the parent's
  * totals in 'vs', and copy the child's vdev_removing flag into
  * cvs->vs_scan_removing.
  */
 static void
 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
 {
 	/*
 	 * The dRAID spare is left out of the aggregation so that ops and
 	 * bytes are not counted twice; the physical leaves already account
 	 * for these IOs.
 	 */
 	if (cvd->vdev_ops != &vdev_draid_spare_ops) {
 		for (int type = 0; type < VS_ZIO_TYPES; type++) {
 			vs->vs_ops[type] += cvs->vs_ops[type];
 			vs->vs_bytes[type] += cvs->vs_bytes[type];
 		}
 
 		cvs->vs_scan_removing = cvd->vdev_removing;
 	}
 }
 
 /*
  * Add a child's extended stats ('cvsx') onto the parent's totals ('vsx'):
  * the per-I/O-type latency histograms, and the per-priority queue
  * histograms, queue depths and request-size histograms.
  */
 static void
 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
 {
 	(void) cvd;
 
 	/* Histograms indexed by I/O type. */
 	for (int type = 0; type < ZIO_TYPES; type++) {
 		for (int bkt = 0; bkt < ARRAY_SIZE(vsx->vsx_disk_histo[0]);
 		    bkt++) {
 			vsx->vsx_disk_histo[type][bkt] +=
 			    cvsx->vsx_disk_histo[type][bkt];
 		}
 
 		for (int bkt = 0; bkt < ARRAY_SIZE(vsx->vsx_total_histo[0]);
 		    bkt++) {
 			vsx->vsx_total_histo[type][bkt] +=
 			    cvsx->vsx_total_histo[type][bkt];
 		}
 	}
 
 	/* Counters and histograms indexed by queueable priority. */
 	for (int prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		for (int bkt = 0; bkt < ARRAY_SIZE(vsx->vsx_queue_histo[0]);
 		    bkt++) {
 			vsx->vsx_queue_histo[prio][bkt] +=
 			    cvsx->vsx_queue_histo[prio][bkt];
 		}
 
 		vsx->vsx_active_queue[prio] += cvsx->vsx_active_queue[prio];
 		vsx->vsx_pend_queue[prio] += cvsx->vsx_pend_queue[prio];
 
 		for (int bkt = 0; bkt < ARRAY_SIZE(vsx->vsx_ind_histo[0]);
 		    bkt++) {
 			vsx->vsx_ind_histo[prio][bkt] +=
 			    cvsx->vsx_ind_histo[prio][bkt];
 		}
 
 		for (int bkt = 0; bkt < ARRAY_SIZE(vsx->vsx_agg_histo[0]);
 		    bkt++) {
 			vsx->vsx_agg_histo[prio][bkt] +=
 			    cvsx->vsx_agg_histo[prio][bkt];
 		}
 	}
 }
 
 /*
  * Report whether every offset on this vdev can be described by a space map
  * entry.  This is always true once SPA_FEATURE_SPACEMAP_V2 is active.
  */
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
 		return (B_TRUE);
 
 	/*
 	 * Without double-word space map entries we assume 47 bits of each
 	 * entry hold the offset (see SM_OFFSET_BITS in space_map.h).  Combined
 	 * with the vdev's ashift, that gives the largest address a space map
 	 * entry can describe for this device.
 	 */
 	uint64_t addr_bits = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	/* A width of 63 bits or more is accepted to avoid overflow below. */
 	return (addr_bits >= 63 || vd->vdev_asize < (1ULL << addr_bits));
 }
 
 /*
  * Get statistics for the given vdev.
  *
  * For a non-leaf vdev the op/byte counters in 'vs' and all of 'vsx' are
  * rebuilt by summing over its children; each child's own vdev_stat and
  * vdev_stat_ex are refreshed in place by the recursive call.  For a leaf,
  * only 'vsx' is filled in.  Either output pointer may be NULL.
  */
 static void
 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	int t;
 	/*
 	 * If we're getting stats on a non-leaf vdev (the root included),
 	 * aggregate the I/O counts over all of its children.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		if (vs) {
 			memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
 			memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
 		}
 		if (vsx)
 			memset(vsx, 0, sizeof (*vsx));
 
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 			vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
 
 			/* Bring the child's own totals up to date first. */
 			vdev_get_stats_ex_impl(cvd, cvs, cvsx);
 			if (vs)
 				vdev_get_child_stat(cvd, vs, cvs);
 			if (vsx)
 				vdev_get_child_stat_ex(cvd, vsx, cvsx);
 		}
 	} else {
 		/*
 		 * We're a leaf.  Just copy our ZIO active queue stats in.  The
 		 * other leaf stats are updated in vdev_stat_update().
 		 */
 		if (!vsx)
 			return;
 
 		memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
 		for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
 			vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
 			vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 		}
 	}
 }
 
 /*
  * Fill in '*vs' (basic stats) and '*vsx' (extended stats) for 'vd'.  Either
  * pointer may be NULL.  The whole operation runs under vd->vdev_stat_lock.
  *
  * When 'vs' is given it starts as a copy of vd->vdev_stat and is then
  * completed with values derived from the vdev itself: state, sizes,
  * initialize/TRIM progress for leaves, expandable space, ashift values,
  * fragmentation and the noalloc flag.
  */
 void
 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 {
 	vdev_t *tvd = vd->vdev_top;
 	mutex_enter(&vd->vdev_stat_lock);
 	if (vs) {
 		memcpy(vs, &vd->vdev_stat, sizeof (*vs));
 		/* Replace the copied timestamp with the time elapsed since. */
 		vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 		vs->vs_state = vd->vdev_state;
 		vs->vs_rsize = vdev_get_min_asize(vd);
 
 		if (vd->vdev_ops->vdev_op_leaf) {
 			vs->vs_pspace = vd->vdev_psize;
 			vs->vs_rsize += VDEV_LABEL_START_SIZE +
 			    VDEV_LABEL_END_SIZE;
 			/*
 			 * Report initializing progress. Since we don't
 			 * have the initializing locks held, this is only
 			 * an estimate (although a fairly accurate one).
 			 */
 			vs->vs_initialize_bytes_done =
 			    vd->vdev_initialize_bytes_done;
 			vs->vs_initialize_bytes_est =
 			    vd->vdev_initialize_bytes_est;
 			vs->vs_initialize_state = vd->vdev_initialize_state;
 			vs->vs_initialize_action_time =
 			    vd->vdev_initialize_action_time;
 
 			/*
 			 * Report manual TRIM progress. Since we don't have
 			 * the manual TRIM locks held, this is only an
 			 * estimate (although fairly accurate one).
 			 */
 			vs->vs_trim_notsup = !vd->vdev_has_trim;
 			vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
 			vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
 			vs->vs_trim_state = vd->vdev_trim_state;
 			vs->vs_trim_action_time = vd->vdev_trim_action_time;
 
 			/* Set when there is a deferred resilver. */
 			vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
 		}
 
 		/*
 		 * Report expandable space on top-level, non-auxiliary devices
 		 * only. The expandable space is reported in terms of metaslab
 		 * sized units since that determines how much space the pool
 		 * can expand.
 		 */
 		if (vd->vdev_aux == NULL && tvd != NULL) {
 			vs->vs_esize = P2ALIGN_TYPED(
 			    vd->vdev_max_asize - vd->vdev_asize,
 			    1ULL << tvd->vdev_ms_shift, uint64_t);
 		}
 
 		/* Use the top-level vdev's ashift when there is a top-level. */
 		vs->vs_configured_ashift = vd->vdev_top != NULL
 		    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
 		vs->vs_logical_ashift = vd->vdev_logical_ashift;
 		/* A physical ashift above ASHIFT_MAX is reported as 0. */
 		if (vd->vdev_physical_ashift <= ASHIFT_MAX)
 			vs->vs_physical_ashift = vd->vdev_physical_ashift;
 		else
 			vs->vs_physical_ashift = 0;
 
 		/*
 		 * Report fragmentation and rebuild progress for top-level,
 		 * non-auxiliary, concrete devices.
 		 */
 		if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
 		    vdev_is_concrete(vd)) {
 			/*
 			 * The vdev fragmentation rating doesn't take into
 			 * account the embedded slog metaslab (vdev_log_mg).
 			 * Since it's only one metaslab, it would have a tiny
 			 * impact on the overall fragmentation.
 			 */
 			vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
 			    vd->vdev_mg->mg_fragmentation : 0;
 		}
 		/* noalloc is reported if set on the vdev or on its top-level. */
 		vs->vs_noalloc = MAX(vd->vdev_noalloc,
 		    tvd ? tvd->vdev_noalloc : 0);
 	}
 
 	vdev_get_stats_ex_impl(vd, vs, vsx);
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Fill in '*vs' with the basic statistics for 'vd'.  This is
  * vdev_get_stats_ex() without the extended statistics.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	/*
 	 * Plain call: ISO C does not allow a return statement with an
 	 * expression in a function whose return type is void.
 	 */
 	vdev_get_stats_ex(vd, vs, NULL);
 }
 
 /*
  * Zero the space accounting fields (vs_space, vs_dspace, vs_alloc) of this
  * vdev's stats while holding vdev_stat_lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	vdev_stat_t *vs = &vd->vdev_stat;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vs->vs_alloc = 0;
 	vs->vs_dspace = 0;
 	vs->vs_space = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Reset vs_scan_processed to zero on this vdev and, recursively, on all of
  * its descendants.  Children are handled before the vdev itself.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	int child = 0;
 
 	while (child < vd->vdev_children) {
 		vdev_scan_stat_init(vd->vdev_child[child]);
 		child++;
 	}
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Account a completed zio in the vdev statistics.
  *
  * For a successful zio (io_error == 0) the scan/rebuild/self-heal byte
  * counters and, on leaves, the per-type op/byte counters and histograms are
  * updated, with 'psize' as the byte count credited.  For a failed write
  * that carries a txg, the affected txg is recorded in the DTLs instead.
  * Gang leaders, speculative I/O, EIO failures that are not yet a retry, and
  * non-propagating I/O without a vdev are ignored.
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	/* A zio with no vdev is treated as targeting the root vdev. */
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 /* Suppress ASAN false positive */
 #ifdef __SANITIZE_ADDRESS__
 	vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
 	vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
 #else
 	vdev_stat_t *vs = &vd->vdev_stat;
 	vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
 #endif
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			/*
 			 * Repair is the result of a resilver issued by the
 			 * scan thread (spa_sync).
 			 */
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 				dsl_scan_phys_t *scn_phys = &scn->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/* The pool-wide total only counts leaves. */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			/*
 			 * Repair is the result of a rebuild issued by the
 			 * rebuild thread (vdev_rebuild_thread).  To avoid
 			 * double counting repaired bytes the virtual dRAID
 			 * spare vdev is excluded from the processed bytes.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				vdev_t *tvd = vd->vdev_top;
 				vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
 				vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 				uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
 
 				if (vd->vdev_ops->vdev_op_leaf &&
 				    vd->vdev_ops != &vdev_draid_spare_ops) {
 					atomic_add_64(rebuilt, psize);
 				}
 				vs->vs_rebuild_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		/*
 		 * The bytes/ops/histograms are recorded at the leaf level and
 		 * aggregated into the higher level vdevs in vdev_get_stats().
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
 			zio_type_t vs_type = type;
 			zio_priority_t priority = zio->io_priority;
 
 			/*
 			 * TRIM ops and bytes are reported to user space as
 			 * ZIO_TYPE_FLUSH.  This is done to preserve the
 			 * vdev_stat_t structure layout for user space.
 			 */
 			if (type == ZIO_TYPE_TRIM)
 				vs_type = ZIO_TYPE_FLUSH;
 
 			/*
 			 * Solely for the purposes of 'zpool iostat -lqrw'
 			 * reporting use the priority to categorize the IO.
 			 * Only the following are reported to user space:
 			 *
 			 *   ZIO_PRIORITY_SYNC_READ,
 			 *   ZIO_PRIORITY_SYNC_WRITE,
 			 *   ZIO_PRIORITY_ASYNC_READ,
 			 *   ZIO_PRIORITY_ASYNC_WRITE,
 			 *   ZIO_PRIORITY_SCRUB,
 			 *   ZIO_PRIORITY_TRIM,
 			 *   ZIO_PRIORITY_REBUILD.
 			 */
 			if (priority == ZIO_PRIORITY_INITIALIZING) {
 				ASSERT3U(type, ==, ZIO_TYPE_WRITE);
 				priority = ZIO_PRIORITY_ASYNC_WRITE;
 			} else if (priority == ZIO_PRIORITY_REMOVAL) {
 				priority = ((type == ZIO_TYPE_WRITE) ?
 				    ZIO_PRIORITY_ASYNC_WRITE :
 				    ZIO_PRIORITY_ASYNC_READ);
 			}
 
 			vs->vs_ops[vs_type]++;
 			vs->vs_bytes[vs_type] += psize;
 
 			/*
 			 * Request-size histograms: delegated I/Os go to the
 			 * aggregate histogram, all others to the individual
 			 * one.
 			 */
 			if (flags & ZIO_FLAG_DELEGATED) {
 				vsx->vsx_agg_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			} else {
 				vsx->vsx_ind_histo[priority]
 				    [RQ_HISTO(zio->io_size)]++;
 			}
 
 			/*
 			 * Latency histograms are only updated when both
 			 * io_delta and io_delay are non-zero.  Note the
 			 * queue histogram is indexed by the remapped
 			 * priority, the other two by the original I/O type.
 			 */
 			if (zio->io_delta && zio->io_delay) {
 				vsx->vsx_queue_histo[priority]
 				    [L_HISTO(zio->io_delta - zio->io_delay)]++;
 				vsx->vsx_disk_histo[type]
 				    [L_HISTO(zio->io_delay)]++;
 				vsx->vsx_total_histo[type]
 				    [L_HISTO(zio->io_delta)]++;
 			}
 		}
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* Everything below handles a zio that completed with an error. */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent logs writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Already recorded as missing: nothing more to add. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark the txg partial on the leaf and its ancestors. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Return 'space', counted in SPA_MINBLOCKSIZE units, multiplied by this
  * vdev's deflate ratio.
  */
 int64_t
 vdev_deflated_space(vdev_t *vd, int64_t space)
 {
 	int64_t min_blocks;
 
 	/* 'space' is expected to be a multiple of SPA_MINBLOCKSIZE. */
 	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
 	/* A zero deflate ratio is only tolerated on l2cache vdevs. */
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 
 	min_blocks = space >> SPA_MINBLOCKSHIFT;
 
 	return (min_blocks * vd->vdev_deflate_ratio);
 }
 
 /*
  * Update the in-core space usage stats for this top-level vdev and, unless
  * it is a log device or has no metaslab group, for the root vdev as well.
  * Metaslab class accounting is not performed here.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	(void) defer_delta;
 	vdev_t *root = vd->vdev_spa->spa_root_vdev;
 	int64_t deflated_delta;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Deflate the space delta using this vdev's own ratio, i.e. apply the
 	 * inverse of the psize-to-asize (RAID-Z) space-expansion factor.  This
 	 * cannot be left to the root vdev: its psize-to-asize is merely the
 	 * max of its children's, which is not accurate enough for us.
 	 */
 	deflated_delta = vdev_deflated_space(vd, space_delta);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	if (alloc_delta < 0) {
 		/* a negative delta must not underflow vs_alloc */
 		ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
 	}
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_dspace += deflated_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	/* log vdevs, and vdevs without a metaslab group, stop here */
 	if (vd->vdev_mg == NULL || vd->vdev_islog)
 		return;
 
 	/* every class but log contributes to root space stats */
 	ASSERT(!vd->vdev_isl2cache);
 	mutex_enter(&root->vdev_stat_lock);
 	root->vdev_stat.vs_alloc += alloc_delta;
 	root->vdev_stat.vs_space += space_delta;
 	root->vdev_stat.vs_dspace += deflated_delta;
 	mutex_exit(&root->vdev_stat_lock);
 	/* Note: metaslab_class_space_update moved to metaslab_space_update */
 }
 
 /*
  * Mark a top-level vdev's config as dirty by putting it on the pool's
  * config dirty list, so that it is written out the next time the vdev
  * configuration is synced.  Passing the root vdev dirties every top-level
  * vdev.  Aux vdevs (l2cache and spares) are handled separately: their
  * config nvlist is regenerated in place and the aux sync flag is set.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int idx;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * If this is an aux vdev (as with l2cache and spare devices), then we
 	 * update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;
 
 		/* Find this vdev's slot in the aux device array. */
 		idx = 0;
 		while (idx < sav->sav_count && sav->sav_vdevs[idx] != vd)
 			idx++;
 
 		if (idx == sav->sav_count) {
 			/*
 			 * We're being removed.  There's nothing more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}
 
 		sav->sav_sync = B_TRUE;
 
 		/* The array lives under the l2cache key, else under spares. */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}
 
 		ASSERT(idx < naux);
 
 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[idx]);
 		aux[idx] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
 
 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	/* The root vdev stands for "all top-level vdevs". */
 	if (vd == rvd) {
 		for (idx = 0; idx < rvd->vdev_children; idx++)
 			vdev_config_dirty(rvd->vdev_child[idx]);
 		return;
 	}
 
 	ASSERT(vd == vd->vdev_top);
 
 	/* Only concrete vdevs not already on the list get queued. */
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		return;
 	if (vdev_is_concrete(vd))
 		list_insert_head(&spa->spa_config_dirty_list, vd);
 }
 
 /*
  * Take a vdev off the pool's config dirty list.  The vdev must currently
  * be on that list.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	/*
 	 * Same locking rule as for dirtying: SCL_CONFIG held as writer, or
 	 * held as reader from sync context.
 	 */
 	ASSERT(spa_config_held(pool, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_CONFIG, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
 
 	list_remove(&pool->spa_config_dirty_list, vd);
 }
 
 /*
  * Mark a top-level vdev's state as dirty, so that the next pass of
  * spa_sync() can convert this into vdev_config_dirty().  State changes are
  * tracked separately from larger config changes because they require much
  * less locking, and are often needed for administrative actions.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(spa_writeable(pool));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * The state list is protected by the SCL_STATE lock.  The caller
 	 * must either hold SCL_STATE as writer, or must be the sync thread
 	 * (which holds SCL_STATE as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 
 	/* Already queued: nothing to do. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		return;
 
 	/* Non-concrete vdevs are never placed on the list. */
 	if (!vdev_is_concrete(vd))
 		return;
 
 	list_insert_head(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Take a vdev off the pool's state dirty list.  The vdev must currently
  * be on that list.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	/*
 	 * Same locking rule as for dirtying: SCL_STATE held as writer, or
 	 * held as reader from sync context.
 	 */
 	ASSERT(spa_config_held(pool, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(pool)) &&
 	    spa_config_held(pool, SCL_STATE, RW_READER)));
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
 
 	list_remove(&pool->spa_state_dirty_list, vd);
 }
 
 /*
  * Propagate vdev state up from children to parent: tally the faulted,
  * degraded and corrupt children of 'vd', hand the counts to the vdev's
  * state-change op, and then repeat for each ancestor.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int nfaulted = 0;
 	int ndegraded = 0;
 	int ncorrupt = 0;
 
 	if (vd->vdev_children > 0) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			/*
 			 * Don't factor holes or indirect vdevs into the
 			 * decision.
 			 */
 			if (!vdev_is_concrete(cvd))
 				continue;
 
 			if (!vdev_readable(cvd) ||
 			    (!vdev_writeable(cvd) && spa_writeable(spa))) {
 				/*
 				 * Root special: if there is a top-level log
 				 * device, treat the root vdev as if it were
 				 * degraded.
 				 */
 				if (cvd->vdev_islog && vd == rvd)
 					ndegraded++;
 				else
 					nfaulted++;
 			} else if (cvd->vdev_state <= VDEV_STATE_DEGRADED) {
 				ndegraded++;
 			}
 
 			if (cvd->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
 				ncorrupt++;
 		}
 
 		vd->vdev_ops->vdev_op_state_change(vd, nfaulted, ndegraded);
 
 		/*
 		 * Root special: if there is a top-level vdev that cannot be
 		 * opened due to corrupted metadata, then propagate the root
 		 * vdev's aux state as 'corrupt' rather than 'insufficient
 		 * replicas'.
 		 */
 		if (ncorrupt && vd == rvd &&
 		    rvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 		}
 	}
 
 	/* Keep climbing until we run out of parents. */
 	if (vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  *	vd	vdev whose state is being changed
  *	isopen	B_TRUE when called as part of an open; suppresses the
  *		propagation of state to the parent
  *	state	new vdev state
  *	aux	auxiliary state, stored in vd->vdev_stat.vs_aux
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	/* No state transition: only refresh the aux value. */
 	if (state == vd->vdev_state) {
 		/*
 		 * Since vdev_offline() code path is already in an offline
 		 * state we can miss a statechange event to OFFLINE. Check
 		 * the previous state to catch this condition.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf &&
 		    (state == VDEV_STATE_OFFLINE) &&
 		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
 			/* post an offline state change */
 			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
 		}
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/*
 	 * Remember the state we are leaving; it is passed to the ereport
 	 * and to the state-change notification below.
 	 */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Map the aux reason onto an ereport class. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			case VDEV_AUX_BAD_ASHIFT:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
 				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
 			    save_state);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	/*
 	 * Notify ZED of any significant state-change on a leaf vdev.
 	 *
 	 */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/* preserve original state from a vdev_reopen() */
 		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
 		    (vd->vdev_prevstate != vd->vdev_state) &&
 		    (save_state <= VDEV_STATE_CLOSED))
 			save_state = vd->vdev_prevstate;
 
 		/* filter out state change due to initial vdev_open */
 		if (save_state > VDEV_STATE_CLOSED)
 			zfs_post_state_change(spa, vd, save_state);
 	}
 
 	/* Outside of an open, let the ancestors recompute their state. */
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Return B_TRUE when every child of this interior vdev is in the
  * VDEV_STATE_OFFLINE state, B_FALSE as soon as one is not.
  */
 boolean_t
 vdev_children_are_offline(vdev_t *vd)
 {
 	uint64_t idx = 0;
 
 	ASSERT(!vd->vdev_ops->vdev_op_leaf);
 
 	while (idx < vd->vdev_children) {
 		vdev_t *cvd = vd->vdev_child[idx];
 
 		if (cvd->vdev_state != VDEV_STATE_OFFLINE)
 			return (B_FALSE);
 		idx++;
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Check the vdev configuration to ensure that it's capable of supporting
  * a root pool.  Partial configurations are not supported: any interior
  * vdev of type VDEV_TYPE_MISSING anywhere in the subtree disqualifies it.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 	/* An interior "missing" vdev makes the whole tree unbootable. */
 	if (!vd->vdev_ops->vdev_op_leaf &&
 	    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_MISSING) == 0)
 		return (B_FALSE);
 
 	/* Every child subtree has to pass the same check. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (!vdev_is_bootable(cvd))
 			return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * A vdev is "concrete" unless its ops vector is one of the indirect, hole,
  * missing or root ops.
  */
 boolean_t
 vdev_is_concrete(vdev_t *vd)
 {
 	vdev_ops_t *vops = vd->vdev_ops;
 
 	if (vops == &vdev_indirect_ops)
 		return (B_FALSE);
 	if (vops == &vdev_hole_ops)
 		return (B_FALSE);
 	if (vops == &vdev_missing_ops)
 		return (B_FALSE);
 	if (vops == &vdev_root_ops)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Determine if a log device has valid content.  If the vdev was
  * removed or faulted in the MOS config then we know that
  * the content on the log device has already been written to the pool.
  * The result is B_TRUE if 'vd' itself, or any vdev below it, is a leaf
  * that is neither faulted nor removed.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	if (vd->vdev_ops->vdev_op_leaf) {
 		if (!vd->vdev_faulted && !vd->vdev_removed)
 			return (B_TRUE);
 	}
 
 	/* Otherwise a single valid descendant is enough. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (vdev_log_state_valid(cvd))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Expand a vdev if possible: refresh its deflate ratio and, when its asize
  * now covers more metaslabs than it currently has, create the additional
  * metaslabs and dirty the vdev's config.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vdev_is_concrete(vd));
 
 	vdev_set_deflate_ratio(vd);
 
 	/* Leave the vdev alone while a raidz expansion is targeting it. */
 	if (vd->vdev_spa->spa_raidz_expand != NULL &&
 	    vd->vdev_spa->spa_raidz_expand->vre_vdev_id == vd->vdev_id)
 		return;
 
 	/* No room for even one more metaslab. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
 		return;
 
 	if (!vdev_is_concrete(vd))
 		return;
 
 	vdev_metaslab_group_create(vd);
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev: detach 'vd' from its parent and compact the parent's child
  * array.  If that leaves the parent with a single child, the parent is
  * removed from above that child and the child is flagged as splitting.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	vdev_t *sibling;
 
 	VERIFY3U(parent->vdev_children, >, 1);
 
 	vdev_remove_child(parent, vd);
 	vdev_compact_children(parent);
 
 	ASSERT3P(parent->vdev_child, !=, NULL);
 
 	sibling = parent->vdev_child[0];
 	if (parent->vdev_children == 1) {
 		vdev_remove_parent(sibling);
 		sibling->vdev_splitting = B_TRUE;
 	}
 
 	vdev_propagate_state(sibling);
 }
 
 /*
  * Walk the vdev tree below 'vd' and, on each leaf with active I/Os, invoke
  * zio_deadman() on the I/O at the head of the active list if it has been
  * outstanding for longer than spa_deadman_synctime().
  */
 void
 vdev_deadman(vdev_t *vd, const char *tag)
 {
 	/* Descend first; only leaves carry a queue worth inspecting. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_deadman(vd->vdev_child[c], tag);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_enter(&vq->vq_lock);
 	if (vq->vq_active > 0) {
 		spa_t *spa = vd->vdev_spa;
 		zio_t *head_zio;
 		uint64_t elapsed;
 
 		zfs_dbgmsg("slow vdev: %s has %u active IOs",
 		    vd->vdev_path, vq->vq_active);
 
 		/*
 		 * Look at the head of all the pending queues,
 		 * if any I/O has been outstanding for longer than
 		 * the spa_deadman_synctime invoke the deadman logic.
 		 */
 		head_zio = list_head(&vq->vq_active_list);
 		elapsed = gethrtime() - head_zio->io_timestamp;
 		if (elapsed > spa_deadman_synctime(spa))
 			zio_deadman(head_zio, tag);
 	}
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * Flag a leaf vdev, and the pool it belongs to, as having a deferred
  * resilver.
  */
 void
 vdev_defer_resilver(vdev_t *vd)
 {
 	spa_t *pool = vd->vdev_spa;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	pool->spa_resilver_deferred = B_TRUE;
 	vd->vdev_resilver_deferred = B_TRUE;
 }
 
 /*
  * Clears the resilver deferred flag on all leaf devs under vd. Returns
  * B_TRUE if we have devices that need to be resilvered and are available to
  * accept resilver I/Os.  When called on the root vdev with the
  * resilver_defer feature active, the feature refcount is also dropped and
  * the pool-wide deferred flag is cleared.
  */
 boolean_t
 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	boolean_t subtree_needs = B_FALSE;
 
 	/* Clear the flag throughout the subtree, gathering the answers. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		subtree_needs |=
 		    vdev_clear_resilver_deferred(vd->vdev_child[c], tx);
 	}
 
 	if (vd == spa->spa_root_vdev &&
 	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
 		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
 		vdev_config_dirty(vd);
 		spa->spa_resilver_deferred = B_FALSE;
 		return (subtree_needs);
 	}
 
 	/* Only concrete, non-aux leaves carry a flag of their own. */
 	if (!vdev_is_concrete(vd))
 		return (subtree_needs);
 	if (vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf)
 		return (subtree_needs);
 
 	vd->vdev_resilver_deferred = B_FALSE;
 
 	return (!vdev_is_dead(vd) && !vd->vdev_offline &&
 	    vdev_resilver_needed(vd, NULL, NULL));
 }
 
 /*
  * A range segment is empty when its start and end coincide.
  */
 boolean_t
 vdev_xlate_is_empty(zfs_range_seg64_t *rs)
 {
 	return (rs->rs_end == rs->rs_start);
 }
 
 /*
  * Translate a logical range to the first contiguous physical range for the
  * specified vdev_t.  This function is initially called with a leaf vdev and
  * recurses through each parent until it reaches the top-level vdev, where
  * the physical range is seeded from the logical range and the remaining
  * range is set empty.  On the way back down, each parent's vdev specific
  * translation function performs the real conversion for its child.
  */
 void
 vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	if (vd == vd->vdev_top) {
 		/*
 		 * We've reached the top-level vdev, initialize the physical
 		 * range to the logical range and set an empty remaining
 		 * range then start to unwind.
 		 */
 		physical_rs->rs_start = logical_rs->rs_start;
 		physical_rs->rs_end = logical_rs->rs_end;
 
 		remain_rs->rs_start = logical_rs->rs_start;
 		remain_rs->rs_end = logical_rs->rs_start;
 
 		return;
 	}
 
 	/*
 	 * Walk up the vdev tree
 	 */
 	vdev_t *pvd = vd->vdev_parent;
 	vdev_xlate(pvd, logical_rs, physical_rs, remain_rs);
 
 	ASSERT3P(pvd, !=, NULL);
 	ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
 
 	/*
 	 * As this recursive function unwinds, translate the logical
 	 * range into its physical and any remaining components by calling
 	 * the vdev specific translate function.  The result is staged in a
 	 * scratch segment and then copied over the caller's physical range.
 	 */
 	zfs_range_seg64_t xlated = { 0 };
 	pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &xlated, remain_rs);
 
 	physical_rs->rs_start = xlated.rs_start;
 	physical_rs->rs_end = xlated.rs_end;
 }
 
 /*
  * Repeatedly translate 'logical_rs' for 'vd', calling 'func' with 'arg' on
  * each non-empty physical range produced, until the remaining range
  * reported by vdev_xlate() is empty.
  */
 void
 vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
     vdev_xlate_func_t *func, void *arg)
 {
 	zfs_range_seg64_t physical_rs;
 	zfs_range_seg64_t remain_rs;
 
 	for (zfs_range_seg64_t cur_rs = *logical_rs;
 	    !vdev_xlate_is_empty(&cur_rs); cur_rs = remain_rs) {
 
 		vdev_xlate(vd, &cur_rs, &physical_rs, &remain_rs);
 
 		/*
 		 * With raidz and dRAID, it's possible that the logical range
 		 * does not live on this leaf vdev. Only when there is a non-
 		 * zero physical size call the provided function.
 		 */
 		if (vdev_xlate_is_empty(&physical_rs))
 			continue;
 
 		func(arg, &physical_rs);
 	}
 }
 
 /*
  * Write a display name for 'vd' into 'buf' (at most 'buflen' bytes) and
  * return 'buf':
  *
  *	- the vdev's path, when it has one;
  *	- otherwise the pool name, for a vdev whose type is "root";
  *	- otherwise "<type>-<id>", for any other interior vdev;
  *	- otherwise (a leaf without a path) the empty string.
  *
  * The last case used to leave 'buf' untouched, so a caller that reuses one
  * buffer across several calls would see the name produced by an earlier
  * call.  The buffer is now always overwritten.
  */
 static char *
 vdev_name(vdev_t *vd, char *buf, int buflen)
 {
 	if (vd->vdev_path != NULL) {
 		strlcpy(buf, vd->vdev_path, buflen);
 	} else if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
 		strlcpy(buf, vd->vdev_spa->spa_name, buflen);
 	} else if (!vd->vdev_ops->vdev_op_leaf) {
 		snprintf(buf, buflen, "%s-%llu",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id);
 	} else if (buflen > 0) {
 		/* Pathless leaf: never hand back stale buffer contents. */
 		buf[0] = '\0';
 	}
 	return (buf);
 }
 
 /*
  * Look at the vdev tree rooted at 'vdev' and determine whether any devices
  * are currently being replaced.
  */
 boolean_t
 vdev_replace_in_progress(vdev_t *vdev)
 {
 	ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	/*
 	 * A 'spare' vdev indicates that we have a replace in progress, unless
 	 * it has exactly two children, and the second, the hot spare, has
 	 * finished being resilvered.
 	 */
 	if (vdev->vdev_ops == &vdev_spare_ops) {
 		if (vdev->vdev_children > 2)
 			return (B_TRUE);
 		if (!vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))
 			return (B_TRUE);
 	}
 
 	/* Otherwise the answer comes from the children. */
 	for (int c = 0; c < vdev->vdev_children; c++) {
 		vdev_t *cvd = vdev->vdev_child[c];
 
 		if (vdev_replace_in_progress(cvd))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Add a (source=src, propname=propval) list to an nvlist.  The value is
  * stored as a string when 'strval' is non-NULL and as the uint64 'intval'
  * otherwise.
  */
 static void
 vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	nvlist_t *entry = fnvlist_alloc();
 
 	fnvlist_add_uint64(entry, ZPROP_SOURCE, src);
 
 	if (strval == NULL) {
 		fnvlist_add_uint64(entry, ZPROP_VALUE, intval);
 	} else {
 		fnvlist_add_string(entry, ZPROP_VALUE, strval);
 	}
 
 	/* Attach the entry under the property name, then drop our copy. */
 	fnvlist_add_nvlist(nvl, propname, entry);
 	nvlist_free(entry);
 }
 
 /*
  * Sync-task callback that stores vdev property values.
  *
  * 'arg' is an nvlist holding the target vdev guid under
  * ZPOOL_VDEV_PROPS_SET_VDEV and the properties to set, as an nvlist, under
  * ZPOOL_VDEV_PROPS_SET_PROPS.  Each property is written to the vdev's ZAP
  * object (root, top or leaf, whichever is set, in that order of
  * preference) in the MOS, and each change is recorded in the pool history.
  * If the vdev can no longer be found, nothing is done.
  */
 static void
 vdev_props_set_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *vd;
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
 	uint64_t objid;
 	nvlist_t *nvprops;
 
 	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
 	nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
 	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
 
 	/* this vdev could get removed while waiting for this sync task */
 	if (vd == NULL)
 		return;
 
 	/*
 	 * Set vdev property values in the vdev props mos object.
 	 */
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		panic("unexpected vdev type");
 	}
 
 	/* All ZAP updates below happen under spa_props_lock. */
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 		uint64_t intval;
 		const char *strval;
 		vdev_prop_t prop;
 		const char *propname = nvpair_name(elem);
 		zprop_type_t proptype;
 
 		switch (prop = vdev_name_to_prop(propname)) {
 		case VDEV_PROP_USERPROP:
 			/*
 			 * User properties are stored as strings under the
 			 * name supplied by the caller; names rejected by
 			 * vdev_prop_user() are skipped.
 			 */
 			if (vdev_prop_user(propname)) {
 				strval = fnvpair_value_string(elem);
 				if (strlen(strval) == 0) {
 					/* remove the property if value == "" */
 					(void) zap_remove(mos, objid, propname,
 					    tx);
 				} else {
 					VERIFY0(zap_update(mos, objid, propname,
 					    1, strlen(strval) + 1, strval, tx));
 				}
 				spa_history_log_internal(spa, "vdev set", tx,
 				    "vdev_guid=%llu: %s=%s",
 				    (u_longlong_t)vdev_guid, nvpair_name(elem),
 				    strval);
 			}
 			break;
 		default:
 			/* normalize the property name */
 			propname = vdev_prop_to_name(prop);
 			proptype = vdev_prop_get_type(prop);
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				VERIFY0(zap_update(mos, objid, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "vdev set", tx,
 				    "vdev_guid=%llu: %s=%s",
 				    (u_longlong_t)vdev_guid, nvpair_name(elem),
 				    strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				/*
 				 * For index properties, verify that the
 				 * value maps to a known index string; the
 				 * string itself is not used.
 				 */
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(vdev_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				VERIFY0(zap_update(mos, objid, propname,
 				    sizeof (uint64_t), 1, &intval, tx));
 				spa_history_log_internal(spa, "vdev set", tx,
 				    "vdev_guid=%llu: %s=%lld",
 				    (u_longlong_t)vdev_guid,
 				    nvpair_name(elem), (longlong_t)intval);
 			} else {
 				panic("invalid vdev property type %u",
 				    nvpair_type(elem));
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Validate and apply a request to set vdev properties.
  *
  * 'innvl' must carry the target vdev guid (ZPOOL_VDEV_PROPS_SET_VDEV) and
  * an nvlist of properties (ZPOOL_VDEV_PROPS_SET_PROPS).  Each property is
  * checked in turn; a handful are applied immediately to the in-core vdev
  * or via the corresponding spa_vdev_*() call.  On the first failure the
  * error is recorded in 'outnvl' under the property's name and returned.
  * If every property passes, vdev_props_set_sync() is dispatched as a sync
  * task with 'innvl' and its result is returned.
  *
  * Returns 0 on success; EINVAL or EROFS for the checks made here;
  * otherwise whatever the called routines return.
  */
 int
 vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvpair_t *elem = NULL;
 	uint64_t vdev_guid;
 	nvlist_t *nvprops;
 	int error = 0;
 
 	/*
 	 * NOTE(review): 'vd' has already been dereferenced by the 'spa'
 	 * initializer above, so this assertion cannot catch a NULL vd.
 	 */
 	ASSERT(vd != NULL);
 
 	/* Check that vdev has a zap we can use */
 	if (vd->vdev_root_zap == 0 &&
 	    vd->vdev_top_zap == 0 &&
 	    vd->vdev_leaf_zap == 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
 	    &nvprops) != 0)
 		return (SET_ERROR(EINVAL));
 
 	/* From here on 'vd' is the vdev named by the guid in the request. */
 	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
 		vdev_prop_t prop = vdev_name_to_prop(propname);
 		uint64_t intval = 0;
 		const char *strval = NULL;
 
 		/* An unrecognized name must at least be a valid user prop. */
 		if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
 			error = EINVAL;
 			goto end;
 		}
 
 		if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) {
 			error = EROFS;
 			goto end;
 		}
 
 		/* Special Processing */
 		switch (prop) {
 		case VDEV_PROP_PATH:
 			/* A vdev without a path cannot be given one here. */
 			if (vd->vdev_path == NULL) {
 				error = EROFS;
 				break;
 			}
 			if (nvpair_value_string(elem, &strval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/* New path must start with /dev/ */
 			if (strncmp(strval, "/dev/", 5)) {
 				error = EINVAL;
 				break;
 			}
 			error = spa_vdev_setpath(spa, vdev_guid, strval);
 			break;
 		case VDEV_PROP_ALLOCATING:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * 'allocating' is the inverse of vdev_noalloc, so
 			 * the two being unequal means the vdev is already
 			 * in the requested state.
 			 */
 			if (intval != vd->vdev_noalloc)
 				break;
 			if (intval == 0)
 				error = spa_vdev_noalloc(spa, vdev_guid);
 			else
 				error = spa_vdev_alloc(spa, vdev_guid);
 			break;
 		case VDEV_PROP_FAILFAST:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			/* only the low bit is kept */
 			vd->vdev_failfast = intval & 1;
 			break;
 		case VDEV_PROP_CHECKSUM_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_checksum_n = intval;
 			break;
 		case VDEV_PROP_CHECKSUM_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_checksum_t = intval;
 			break;
 		case VDEV_PROP_IO_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_io_n = intval;
 			break;
 		case VDEV_PROP_IO_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_io_t = intval;
 			break;
 		case VDEV_PROP_SLOW_IO_N:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_slow_io_n = intval;
 			break;
 		case VDEV_PROP_SLOW_IO_T:
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = EINVAL;
 				break;
 			}
 			vd->vdev_slow_io_t = intval;
 			break;
 		default:
 			/* Most processing is done in vdev_props_set_sync */
 			break;
 		}
 end:
 		/*
 		 * Report the first failure under the property's name and
 		 * stop.  If a string value was already fetched it is what
 		 * gets recorded; otherwise the error number is.
 		 */
 		if (error != 0) {
 			intval = error;
 			vdev_prop_add_list(outnvl, propname, strval, intval, 0);
 			return (error);
 		}
 	}
 
 	/*
 	 * NOTE(review): the literal 6 is presumably the sync task's
 	 * blocks-modified estimate -- confirm against dsl_sync_task().
 	 */
 	return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
 	    innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 int
 vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	int err = 0;
 	uint64_t objid;
 	uint64_t vdev_guid;
 	nvpair_t *elem = NULL;
 	nvlist_t *nvprops = NULL;
 	uint64_t intval = 0;
 	char *strval = NULL;
 	const char *propname = NULL;
 	vdev_prop_t prop;
 
 	ASSERT(vd != NULL);
 	ASSERT(mos != NULL);
 
 	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
 	    &vdev_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
 
 	if (vd->vdev_root_zap != 0) {
 		objid = vd->vdev_root_zap;
 	} else if (vd->vdev_top_zap != 0) {
 		objid = vd->vdev_top_zap;
 	} else if (vd->vdev_leaf_zap != 0) {
 		objid = vd->vdev_leaf_zap;
 	} else {
 		return (SET_ERROR(EINVAL));
 	}
 	ASSERT(objid != 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	if (nvprops != NULL) {
 		char namebuf[64] = { 0 };
 
 		while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
 			intval = 0;
 			strval = NULL;
 			propname = nvpair_name(elem);
 			prop = vdev_name_to_prop(propname);
 			zprop_source_t src = ZPROP_SRC_DEFAULT;
 			uint64_t integer_size, num_integers;
 
 			switch (prop) {
 			/* Special Read-only Properties */
 			case VDEV_PROP_NAME:
 				strval = vdev_name(vd, namebuf,
 				    sizeof (namebuf));
 				if (strval == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname, strval, 0,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_CAPACITY:
 				/* percent used */
 				intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
 				    (vd->vdev_stat.vs_alloc * 100 /
 				    vd->vdev_stat.vs_dspace);
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_STATE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_state, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_GUID:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_guid, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ASIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_asize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PSIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_psize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ASHIFT:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_ashift, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_SIZE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_dspace -
 				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ALLOCATED:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_EXPANDSZ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FRAGMENTATION:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_fragmentation,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PARITY:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vdev_get_nparity(vd), ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PATH:
 				if (vd->vdev_path == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_path, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_DEVID:
 				if (vd->vdev_devid == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_devid, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PHYS_PATH:
 				if (vd->vdev_physpath == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_physpath, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_ENC_PATH:
 				if (vd->vdev_enc_sysfs_path == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_FRU:
 				if (vd->vdev_fru == NULL)
 					continue;
 				vdev_prop_add_list(outnvl, propname,
 				    vd->vdev_fru, 0, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_PARENT:
 				if (vd->vdev_parent != NULL) {
 					strval = vdev_name(vd->vdev_parent,
 					    namebuf, sizeof (namebuf));
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, ZPROP_SRC_NONE);
 				}
 				continue;
 			case VDEV_PROP_CHILDREN:
 				if (vd->vdev_children > 0)
 					strval = kmem_zalloc(ZAP_MAXVALUELEN,
 					    KM_SLEEP);
 				for (uint64_t i = 0; i < vd->vdev_children;
 				    i++) {
 					const char *vname;
 
 					vname = vdev_name(vd->vdev_child[i],
 					    namebuf, sizeof (namebuf));
 					if (vname == NULL)
 						vname = "(unknown)";
 					if (strlen(strval) > 0)
 						strlcat(strval, ",",
 						    ZAP_MAXVALUELEN);
 					strlcat(strval, vname, ZAP_MAXVALUELEN);
 				}
 				if (strval != NULL) {
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, ZPROP_SRC_NONE);
 					kmem_free(strval, ZAP_MAXVALUELEN);
 				}
 				continue;
 			case VDEV_PROP_NUMCHILDREN:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_children, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_READ_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_read_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_WRITE_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_write_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_CHECKSUM_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_checksum_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_INITIALIZE_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_initialize_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_TRIM_ERRORS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_trim_errors,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_SLOW_IOS:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_slow_ios,
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_NULL:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_READ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_WRITE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_CLAIM:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_OPS_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
 				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_NULL:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_READ:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_WRITE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_FREE:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_CLAIM:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_BYTES_TRIM:
 				/*
 				 * TRIM ops and bytes are reported to user
 				 * space as ZIO_TYPE_FLUSH.  This is done to
 				 * preserve the vdev_stat_t structure layout
 				 * for user space.
 				 */
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
 				    ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_REMOVING:
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    vd->vdev_removing, ZPROP_SRC_NONE);
 				continue;
 			case VDEV_PROP_RAIDZ_EXPANDING:
 				/* Only expose this for raidz */
 				if (vd->vdev_ops == &vdev_raidz_ops) {
 					vdev_prop_add_list(outnvl, propname,
 					    NULL, vd->vdev_rz_expanding,
 					    ZPROP_SRC_NONE);
 				}
 				continue;
 			case VDEV_PROP_TRIM_SUPPORT:
 				/* only valid for leaf vdevs */
 				if (vd->vdev_ops->vdev_op_leaf) {
 					vdev_prop_add_list(outnvl, propname,
 					    NULL, vd->vdev_has_trim,
 					    ZPROP_SRC_NONE);
 				}
 				continue;
 			/* Numeric Properties */
 			case VDEV_PROP_ALLOCATING:
 				/* Leaf vdevs cannot have this property */
 				if (vd->vdev_mg == NULL &&
 				    vd->vdev_top != NULL) {
 					src = ZPROP_SRC_NONE;
 					intval = ZPROP_BOOLEAN_NA;
 				} else {
 					err = vdev_prop_get_int(vd, prop,
 					    &intval);
 					if (err && err != ENOENT)
 						break;
 
 					if (intval ==
 					    vdev_prop_default_numeric(prop))
 						src = ZPROP_SRC_DEFAULT;
 					else
 						src = ZPROP_SRC_LOCAL;
 				}
 
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, src);
 				break;
 			case VDEV_PROP_FAILFAST:
 				src = ZPROP_SRC_LOCAL;
 				strval = NULL;
 
 				err = zap_lookup(mos, objid, nvpair_name(elem),
 				    sizeof (uint64_t), 1, &intval);
 				if (err == ENOENT) {
 					intval = vdev_prop_default_numeric(
 					    prop);
 					err = 0;
 				} else if (err) {
 					break;
 				}
 				if (intval == vdev_prop_default_numeric(prop))
 					src = ZPROP_SRC_DEFAULT;
 
 				vdev_prop_add_list(outnvl, propname, strval,
 				    intval, src);
 				break;
 			case VDEV_PROP_CHECKSUM_N:
 			case VDEV_PROP_CHECKSUM_T:
 			case VDEV_PROP_IO_N:
 			case VDEV_PROP_IO_T:
 			case VDEV_PROP_SLOW_IO_N:
 			case VDEV_PROP_SLOW_IO_T:
 				err = vdev_prop_get_int(vd, prop, &intval);
 				if (err && err != ENOENT)
 					break;
 
 				if (intval == vdev_prop_default_numeric(prop))
 					src = ZPROP_SRC_DEFAULT;
 				else
 					src = ZPROP_SRC_LOCAL;
 
 				vdev_prop_add_list(outnvl, propname, NULL,
 				    intval, src);
 				break;
 			/* Text Properties */
 			case VDEV_PROP_COMMENT:
 				/* Exists in the ZAP below */
 				/* FALLTHRU */
 			case VDEV_PROP_USERPROP:
 				/* User Properties */
 				src = ZPROP_SRC_LOCAL;
 
 				err = zap_length(mos, objid, nvpair_name(elem),
 				    &integer_size, &num_integers);
 				if (err)
 					break;
 
 				switch (integer_size) {
 				case 8:
 					/* User properties cannot be integers */
 					err = EINVAL;
 					break;
 				case 1:
 					/* string property */
 					strval = kmem_alloc(num_integers,
 					    KM_SLEEP);
 					err = zap_lookup(mos, objid,
 					    nvpair_name(elem), 1,
 					    num_integers, strval);
 					if (err) {
 						kmem_free(strval,
 						    num_integers);
 						break;
 					}
 					vdev_prop_add_list(outnvl, propname,
 					    strval, 0, src);
 					kmem_free(strval, num_integers);
 					break;
 				}
 				break;
 			default:
 				err = ENOENT;
 				break;
 			}
 			if (err)
 				break;
 		}
 	} else {
 		/*
 		 * Get all properties from the MOS vdev property object.
 		 */
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 		for (zap_cursor_init(&zc, mos, objid);
 		    (err = zap_cursor_retrieve(&zc, za)) == 0;
 		    zap_cursor_advance(&zc)) {
 			intval = 0;
 			strval = NULL;
 			zprop_source_t src = ZPROP_SRC_DEFAULT;
 			propname = za->za_name;
 
 			switch (za->za_integer_length) {
 			case 8:
 				/* We do not allow integer user properties */
 				/* This is likely an internal value */
 				break;
 			case 1:
 				/* string property */
 				strval = kmem_alloc(za->za_num_integers,
 				    KM_SLEEP);
 				err = zap_lookup(mos, objid, za->za_name, 1,
 				    za->za_num_integers, strval);
 				if (err) {
 					kmem_free(strval, za->za_num_integers);
 					break;
 				}
 				vdev_prop_add_list(outnvl, propname, strval, 0,
 				    src);
 				kmem_free(strval, za->za_num_integers);
 				break;
 
 			default:
 				break;
 			}
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 	if (err && err != ENOENT) {
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Symbols published through EXPORT_SYMBOL().
  * NOTE(review): the macro is defined outside this file; presumably it makes
  * these vdev routines callable from other modules -- confirm against the
  * platform compatibility headers.
  */
 EXPORT_SYMBOL(vdev_fault);
 EXPORT_SYMBOL(vdev_degrade);
 EXPORT_SYMBOL(vdev_online);
 EXPORT_SYMBOL(vdev_offline);
 EXPORT_SYMBOL(vdev_clear);
 
 /*
  * Module tunables.
  *
  * Each ZFS_MODULE_PARAM() invocation below passes, in order: a scope, a name
  * prefix, the tunable's name, its type, its access mode and a one-line
  * description string.  ZFS_MODULE_PARAM_CALL() takes an explicit setter and
  * getter in place of the type.
  * NOTE(review): both macros are defined outside this file; the argument
  * meanings above are inferred from how they are used here -- confirm against
  * the macro definitions.
  */
 
 /* Metaslab sizing and count. */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
 	"Target number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
 	"Default lower limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
 	"Default upper limit for metaslab size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
 	"Minimum number of metaslabs per top-level vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
 	"Practical upper limit of total metaslabs per top-level vdev");
 
 /* Event rate limits; the descriptions are kept parallel on purpose. */
 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
 	"Rate limit slow IO (delay) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
 	"Rate limit hung IO (deadman) events to this many per second");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
 	"Rate limit Direct I/O write verify events to this many per second");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
 	"Direct I/O writes will perform checksum verification before "
 	"committing write");
 
 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
 	"Rate limit checksum events to this many checksum errors per second "
 	"(do not set below ZED threshold).");
 
 /* Miscellaneous behaviour switches. */
 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
 	"Ignore errors during resilver/scrub");
 
 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
 	"Bypass vdev_validate()");
 
 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
 	"Disable cache flushes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
 	"Minimum number of metaslabs required to dedicate one for log blocks");
 
 /*
  * The remaining tunables are registered with explicit set/get callbacks
  * rather than a plain typed parameter.
  */
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
 	param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
 	"Minimum ashift used when creating new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
 	param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
 	"Maximum ashift used when optimizing for logical -> physical sector "
 	"size on new top-level vdevs");
 
 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl,
 	param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW,
 	"RAIDZ implementation");
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index d39a05458fe7..e0fafd0da2d9 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -1,2822 +1,2825 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2018 Intel Corporation.
  * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/abd.h>
 #include <sys/zio.h>
 #include <sys/nvpair.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <zfs_fletcher.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_draid_io_verify() */
 #endif
 
 /*
  * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is
  * comprised of multiple raidz redundancy groups which are spread over the
  * dRAID children. To ensure an even distribution, and avoid hot spots, a
  * permutation mapping is applied to the order of the dRAID children.
  * This mixing effectively distributes the parity columns evenly over all
  * of the disks in the dRAID.
  *
  * This is beneficial because it means when resilvering all of the disks
  * can participate thereby increasing the available IOPs and bandwidth.
  * Furthermore, by reserving a small fraction of each child's total capacity
  * virtual distributed spare disks can be created. These spares similarly
  * benefit from the performance gains of spanning all of the children. The
  * consequence of which is that resilvering to a distributed spare can
  * substantially reduce the time required to restore full parity to a pool
  * with a failed disk.
  *
  * === dRAID group layout ===
  *
  * First, let's define a "row" in the configuration to be a 16M chunk from
  * each physical drive at the same offset. This is the minimum allowable
  * size since it must be possible to store a full 16M block when there is
  * only a single data column. Next, we define a "group" to be a set of
  * sequential disks containing both the parity and data columns. We allow
  * groups to span multiple rows in order to align any group size to any
  * number of physical drives. Finally, a "slice" is comprised of the rows
  * which contain the target number of groups. The permutation mappings
  * are applied in a round robin fashion to each slice.
  *
  * Given D+P drives in a group (including parity drives) and C-S physical
  * drives (not including the spare drives), we can distribute the groups
  * across R rows without remainder by choosing the number of groups so that
  * a slice holds exactly LCM(D+P, C-S) columns, the least common multiple of
  * D+P and C-S; i.e. ngroups = LCM(D+P, C-S) / (D+P) and
  * R = LCM(D+P, C-S) / (C-S).
  *
  * In the example below, there are C=14 physical drives in the configuration
  * with S=2 drives worth of spare capacity. Each group has a width of 9
  * which includes D=8 data and P=1 parity drive. There are 4 groups and
  * 3 rows per slice.  Each group has a size of 144M (16M * 9) and a slice
  * size is 576M (144M * 4). When allocating from a dRAID each group is
  * filled before moving on to the next as shown in slice 0 below.
  *
  *             data disks (8 data + 1 parity)          spares (2)
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  ^  | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0
  *  |  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  |  |              group 0              |  group 1..|       |
  *  |  +-----------------------------------+-----------+-------|
  *  |  | 0   1   2   3   4   5   6   7   8 | 36  37  38|       |  r
  *  |  | 9   10  11  12  13  14  15  16  17| 45  46  47|       |  o
  *  |  | 18  19  20  21  22  23  24  25  26| 54  55  56|       |  w
  *     | 27  28  29  30  31  32  33  34  35| 63  64  65|       |  0
  *  s  +-----------------------+-----------------------+-------+
  *  l  |       ..group 1       |        group 2..      |       |
  *  i  +-----------------------+-----------------------+-------+
  *  c  | 39  40  41  42  43  44| 72  73  74  75  76  77|       |  r
  *  e  | 48  49  50  51  52  53| 81  82  83  84  85  86|       |  o
  *  0  | 57  58  59  60  61  62| 90  91  92  93  94  95|       |  w
  *     | 66  67  68  69  70  71| 99 100 101 102 103 104|       |  1
  *  |  +-----------+-----------+-----------------------+-------+
  *  |  |..group 2  |            group 3                |       |
  *  |  +-----------+-----------+-----------------------+-------+
  *  |  | 78  79  80|108 109 110 111 112 113 114 115 116|       |  r
  *  |  | 87  88  89|117 118 119 120 121 122 123 124 125|       |  o
  *  |  | 96  97  98|126 127 128 129 130 131 132 133 134|       |  w
  *  v  |105 106 107|135 136 137 138 139 140 141 142 143|       |  2
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *     | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1
  *  s  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  l  |              group 4              |  group 5..|       | row 3
  *  i  +-----------------------+-----------+-----------+-------|
  *  c  |       ..group 5       |        group 6..      |       | row 4
  *  e  +-----------+-----------+-----------------------+-------+
  *  1  |..group 6  |            group 7                |       | row 5
  *     +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *     | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2
  *  s  +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
  *  l  |              group 8              |  group 9..|       | row 6
  *  i  +-----------------------------------------------+-------|
  *  c  |       ..group 9       |        group 10..     |       | row 7
  *  e  +-----------------------+-----------------------+-------+
  *  2  |..group 10 |            group 11               |       | row 8
  *     +-----------+-----------------------------------+-------+
  *
  * This layout has several advantages over requiring that each row contain
  * a whole number of groups.
  *
  * 1. The group count is not a relevant parameter when defining a dRAID
  *    layout. Only the group width is needed, and *all* groups will have
  *    the desired size.
  *
  * 2. All possible group widths (<= physical disk count) can be supported.
  *
  * 3. The logic within vdev_draid.c is simplified when the group width is
  *    the same for all groups (although some of the logic around computing
  *    permutation numbers and drive offsets is more complicated).
  *
  * N.B. The following array describes all valid dRAID permutation maps.
  * Each row is used to generate a permutation map for a different number
  * of children from a unique seed. The seeds were generated and carefully
  * evaluated by the 'draid' utility in order to provide balanced mappings.
  * In addition to the seed a checksum of the in-memory mapping is stored
  * for verification.
  *
  * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed,
  * with a given permutation map) is the ratio of the amounts of I/O that will
  * be sent to the least and most busy disks when resilvering. The average
  * imbalance ratio (of a given number of disks and permutation map) is the
  * average of the ratios of all possible single and double disk failures.
  *
  * In order to achieve a low imbalance ratio the number of permutations in
  * the mapping must be significantly larger than the number of children.
  * For dRAID the number of permutations has been limited to 512 to minimize
  * the map size. This does result in a gradually increasing imbalance ratio
  * as seen in the table below. Increasing the number of permutations for
  * larger child counts would reduce the imbalance ratio. However, in practice
  * when there are a large number of children each child is responsible for
  * fewer total IOs so it's less of a concern.
  *
  * Note these values are hard coded and must never be changed.  Existing
  * pools depend on the same mapping always being generated in order to
  * read and write from the correct locations.  Any change would make
  * existing pools completely inaccessible.
  */
 static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = {
 	{   2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d },	/* 1.000 */
 	{   3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 },	/* 1.000 */
 	{   4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 },	/* 1.000 */
 	{   5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 },	/* 1.010 */
 	{   6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 },	/* 1.031 */
 	{   7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee },	/* 1.043 */
 	{   8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 },	/* 1.059 */
 	{   9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 },	/* 1.056 */
 	{  10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 },	/* 1.072 */
 	{  11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c },	/* 1.083 */
 	{  12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e },	/* 1.097 */
 	{  13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 },	/* 1.100 */
 	{  14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 },	/* 1.121 */
 	{  15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 },	/* 1.103 */
 	{  16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 },	/* 1.111 */
 	{  17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe },	/* 1.133 */
 	{  18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 },	/* 1.131 */
 	{  19, 256, 0x892e343f2f31d690, 0x00000029eb392835 },	/* 1.130 */
 	{  20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c },	/* 1.141 */
 	{  21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 },	/* 1.139 */
 	{  22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 },	/* 1.150 */
 	{  23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f },	/* 1.174 */
 	{  24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 },	/* 1.168 */
 	{  25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 },	/* 1.180 */
 	{  26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba },	/* 1.226 */
 	{  27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 },	/* 1.228 */
 	{  28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c },	/* 1.217 */
 	{  29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c },	/* 1.239 */
 	{  30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 },	/* 1.238 */
 	{  31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f },	/* 1.273 */
 	{  32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 },	/* 1.191 */
 	{  33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 },	/* 1.199 */
 	{  34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 },	/* 1.195 */
 	{  35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 },	/* 1.201 */
 	{  36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef },	/* 1.194 */
 	{  37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 },	/* 1.237 */
 	{  38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 },	/* 1.242 */
 	{  39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd },	/* 1.231 */
 	{  40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 },	/* 1.233 */
 	{  41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 },	/* 1.271 */
 	{  42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 },	/* 1.263 */
 	{  43, 512, 0xbaa5125faa781854, 0x000001c76789e278 },	/* 1.270 */
 	{  44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb },	/* 1.281 */
 	{  45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 },	/* 1.282 */
 	{  46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b },	/* 1.286 */
 	{  47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 },	/* 1.329 */
 	{  48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b },	/* 1.286 */
 	{  49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 },	/* 1.322 */
 	{  50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 },	/* 1.335 */
 	{  51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 },	/* 1.305 */
 	{  52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf },	/* 1.330 */
 	{  53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 },	/* 1.365 */
 	{  54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 },	/* 1.334 */
 	{  55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 },	/* 1.364 */
 	{  56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e },	/* 1.374 */
 	{  57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 },	/* 1.363 */
 	{  58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 },	/* 1.401 */
 	{  59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c },	/* 1.392 */
 	{  60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 },	/* 1.360 */
 	{  61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd },	/* 1.396 */
 	{  62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c },	/* 1.453 */
 	{  63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 },	/* 1.437 */
 	{  64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 },	/* 1.402 */
 	{  65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 },	/* 1.459 */
 	{  66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 },	/* 1.423 */
 	{  67, 512, 0x910b9714f698a877, 0x00000451ea65d5db },	/* 1.447 */
 	{  68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 },	/* 1.450 */
 	{  69, 512, 0x836d4968fbaa3706, 0x000004954068a380 },	/* 1.455 */
 	{  70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d },	/* 1.463 */
 	{  71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 },	/* 1.463 */
 	{  72, 512, 0x42763a680d5bed8e, 0x000005084275c680 },	/* 1.452 */
 	{  73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab },	/* 1.498 */
 	{  74, 512, 0x9fa08548b1621a44, 0x0000054708019247 },	/* 1.526 */
 	{  75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 },	/* 1.491 */
 	{  76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 },	/* 1.470 */
 	{  77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 },	/* 1.527 */
 	{  78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 },	/* 1.509 */
 	{  79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e },	/* 1.569 */
 	{  80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c },	/* 1.555 */
 	{  81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 },	/* 1.509 */
 	{  82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 },	/* 1.596 */
 	{  83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e },	/* 1.568 */
 	{  84, 512, 0xba02545069ddc6dc, 0x000006d19861364f },	/* 1.541 */
 	{  85, 512, 0x447c73192c35073e, 0x000006fce315ce35 },	/* 1.623 */
 	{  86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b },	/* 1.620 */
 	{  87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 },	/* 1.597 */
 	{  88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b },	/* 1.575 */
 	{  89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc },	/* 1.627 */
 	{  90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb },	/* 1.596 */
 	{  91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 },	/* 1.622 */
 	{  92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e },	/* 1.695 */
 	{  93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c },	/* 1.605 */
 	{  94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc },	/* 1.625 */
 	{  95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 },	/* 1.687 */
 	{  96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a },	/* 1.621 */
 	{  97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 },	/* 1.699 */
 	{  98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b },	/* 1.688 */
 	{  99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce },	/* 1.642 */
 	{ 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc },	/* 1.683 */
 	{ 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 },	/* 1.755 */
 	{ 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 },	/* 1.692 */
 	{ 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 },	/* 1.747 */
 	{ 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 },	/* 1.751 */
 	{ 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 },	/* 1.751 */
 	{ 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f },	/* 1.726 */
 	{ 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d },	/* 1.788 */
 	{ 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 },	/* 1.740 */
 	{ 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 },	/* 1.780 */
 	{ 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 },	/* 1.836 */
 	{ 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 },	/* 1.778 */
 	{ 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 },	/* 1.831 */
 	{ 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df },	/* 1.825 */
 	{ 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 },	/* 1.826 */
 	{ 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 },	/* 1.843 */
 	{ 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d },	/* 1.826 */
 	{ 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b },	/* 1.803 */
 	{ 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 },	/* 1.857 */
 	{ 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 },	/* 1.877 */
 	{ 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 },	/* 1.849 */
 	{ 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d },	/* 1.867 */
 	{ 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 },	/* 1.978 */
 	{ 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d },	/* 1.947 */
 	{ 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea },	/* 1.865 */
 	{ 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f },	/* 1.881 */
 	{ 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b },	/* 1.882 */
 	{ 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e },	/* 1.867 */
 	{ 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e },	/* 1.972 */
 	{ 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 },	/* 1.896 */
 	{ 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d },	/* 1.965 */
 	{ 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 },	/* 1.963 */
 	{ 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 },	/* 1.925 */
 	{ 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 },	/* 1.862 */
 	{ 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 },	/* 2.042 */
 	{ 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 },	/* 1.935 */
 	{ 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 },	/* 2.005 */
 	{ 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c },	/* 2.041 */
 	{ 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 },	/* 1.997 */
 	{ 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 },	/* 1.996 */
 	{ 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d },	/* 2.053 */
 	{ 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a },	/* 1.971 */
 	{ 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 },	/* 2.018 */
 	{ 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd },	/* 1.961 */
 	{ 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 },	/* 2.046 */
 	{ 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb },	/* 1.968 */
 	{ 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 },	/* 2.143 */
 	{ 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 },	/* 2.064 */
 	{ 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 },	/* 2.023 */
 	{ 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c },	/* 2.136 */
 	{ 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 },	/* 2.063 */
 	{ 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 },	/* 1.974 */
 	{ 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 },	/* 2.210 */
 	{ 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a },	/* 2.006 */
 	{ 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 },	/* 2.193 */
 	{ 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 },	/* 2.163 */
 	{ 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc },	/* 2.046 */
 	{ 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 },	/* 2.084 */
 	{ 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 },	/* 2.264 */
 	{ 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 },	/* 2.074 */
 	{ 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 },	/* 2.282 */
 	{ 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf },	/* 2.148 */
 	{ 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 },	/* 2.355 */
 	{ 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 },	/* 2.164 */
 	{ 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a },	/* 2.393 */
 	{ 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 },	/* 2.178 */
 	{ 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc },	/* 2.334 */
 	{ 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b },	/* 2.266 */
 	{ 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 },	/* 2.304 */
 	{ 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d },	/* 2.218 */
 	{ 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff },	/* 2.377 */
 	{ 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 },	/* 2.155 */
 	{ 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 },	/* 2.404 */
 	{ 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 },	/* 2.205 */
 	{ 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d },	/* 2.359 */
 	{ 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 },	/* 2.158 */
 	{ 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b },	/* 2.614 */
 	{ 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc },	/* 2.239 */
 	{ 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc },	/* 2.493 */
 	{ 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c },	/* 2.327 */
 	{ 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 },	/* 2.231 */
 	{ 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c },	/* 2.237 */
 	{ 182, 512, 0xe6035defea48f933, 0x00002038e3346658 },	/* 2.691 */
 	{ 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e },	/* 2.170 */
 	{ 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 },	/* 2.600 */
 	{ 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc },	/* 2.391 */
 	{ 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 },	/* 2.677 */
 	{ 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c },	/* 2.410 */
 	{ 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 },	/* 2.776 */
 	{ 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 },	/* 2.266 */
 	{ 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 },	/* 2.717 */
 	{ 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c },	/* 2.474 */
 	{ 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 },	/* 2.673 */
 	{ 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 },	/* 2.420 */
 	{ 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 },	/* 2.898 */
 	{ 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c },	/* 2.363 */
 	{ 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e },	/* 2.747 */
 	{ 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 },	/* 2.531 */
 	{ 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 },	/* 2.707 */
 	{ 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 },	/* 2.315 */
 	{ 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf },	/* 3.012 */
 	{ 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 },	/* 2.378 */
 	{ 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 },	/* 2.969 */
 	{ 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d },	/* 2.594 */
 	{ 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd },	/* 2.763 */
 	{ 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 },	/* 2.457 */
 	{ 206, 512, 0xc02fc96684715a16, 0x0000297515608601 },	/* 3.057 */
 	{ 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 },	/* 2.590 */
 	{ 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b },	/* 3.047 */
 	{ 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 },	/* 2.676 */
 	{ 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 },	/* 2.993 */
 	{ 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 },	/* 2.457 */
 	{ 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 },	/* 3.182 */
 	{ 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 },	/* 2.563 */
 	{ 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 },	/* 3.025 */
 	{ 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f },	/* 2.730 */
 	{ 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 },	/* 3.036 */
 	{ 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 },	/* 2.722 */
 	{ 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 },	/* 3.356 */
 	{ 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 },	/* 2.697 */
 	{ 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 },	/* 2.979 */
 	{ 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 },	/* 2.858 */
 	{ 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e },	/* 3.258 */
 	{ 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 },	/* 2.693 */
 	{ 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 },	/* 3.259 */
 	{ 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c },	/* 2.733 */
 	{ 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 },	/* 3.235 */
 	{ 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 },	/* 2.983 */
 	{ 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e },	/* 3.308 */
 	{ 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 },	/* 2.715 */
 	{ 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f },	/* 3.540 */
 	{ 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 },	/* 2.779 */
 	{ 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c },	/* 3.084 */
 	{ 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc },	/* 2.987 */
 	{ 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae },	/* 3.341 */
 	{ 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 },	/* 2.793 */
 	{ 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 },	/* 3.518 */
 	{ 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 },	/* 2.962 */
 	{ 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 },	/* 3.196 */
 	{ 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 },	/* 2.914 */
 	{ 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 },	/* 3.408 */
 	{ 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 },	/* 2.903 */
 	{ 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 },	/* 3.778 */
 	{ 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c },	/* 3.026 */
 	{ 244, 512, 0xc740263f0301efa8, 0x00003a147146512d },	/* 3.347 */
 	{ 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d },	/* 3.212 */
 	{ 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 },	/* 3.482 */
 	{ 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 },	/* 3.146 */
 	{ 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f },	/* 3.626 */
 	{ 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 },	/* 2.952 */
 	{ 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e },	/* 3.463 */
 	{ 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 },	/* 3.131 */
 	{ 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c },	/* 3.538 */
 	{ 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac },	/* 2.974 */
 	{ 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 },	/* 3.843 */
 	{ 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 },	/* 3.088 */
 };
 
 /*
  * Verify the map is valid. Each device index must appear exactly
  * once in every row, and the permutation array checksum must match.
  */
 static int
 verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms,
     uint64_t checksum)
 {
 	int countssz = sizeof (uint16_t) * children;
 	uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP);
 
 	for (int i = 0; i < nperms; i++) {
 		for (int j = 0; j < children; j++) {
 			uint8_t val = perms[(i * children) + j];
 
 			if (val >= children || counts[val] != i) {
 				kmem_free(counts, countssz);
 				return (EINVAL);
 			}
 
 			counts[val]++;
 		}
 	}
 
 	if (checksum != 0) {
 		int permssz = sizeof (uint8_t) * children * nperms;
 		zio_cksum_t cksum;
 
 		fletcher_4_native_varsize(perms, permssz, &cksum);
 
 		if (checksum != cksum.zc_word[0]) {
 			kmem_free(counts, countssz);
 			return (ECKSUM);
 		}
 	}
 
 	kmem_free(counts, countssz);
 
 	return (0);
 }
 
 /*
  * Build the permutation array described by a draid_map_t.
  *
  * These permutations control where all data is placed in a dRAID, so a
  * given seed must always reproduce the same array.  For that reason the
  * shuffle is driven by our own pseudo-random generator, vdev_draid_rand(),
  * seeded with VDEV_DRAID_SEED and map->dm_seed.
  *
  * On success 0 is returned and *permsp points at a vmem_alloc()'d array of
  * dm_nperms rows by dm_children columns.  Otherwise the error reported by
  * verify_perms() is returned and *permsp is left untouched.
  */
 int
 vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
 {
 	VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN);
 	VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN);
 	VERIFY3U(map->dm_seed, !=, 0);
 	VERIFY3U(map->dm_nperms, !=, 0);
 	VERIFY3P(map->dm_perms, ==, NULL);
 
 #ifdef _KERNEL
 	/*
 	 * In the kernel both a seed and a checksum are always supplied.
 	 * A zero checksum is only passed by the draid utility under
 	 * tests/zfs-tests/cmd/draid/draid.c while it searches for new
 	 * candidate maps.
 	 */
 	VERIFY3U(map->dm_checksum, !=, 0);
 #endif
 	uint64_t children = map->dm_children;
 	uint64_t nperms = map->dm_nperms;
 	int rowsz = sizeof (uint8_t) * children;
 	int permssz = rowsz * nperms;
 
 	/* Storage for the complete set of rows. */
 	uint8_t *perms = vmem_alloc(permssz, KM_SLEEP);
 
 	/* Identity row (0, 1, 2, ...) used as the input to the first shuffle */
 	uint8_t *identity = kmem_alloc(rowsz, KM_SLEEP);
 	for (int dev = 0; dev < children; dev++)
 		identity[dev] = dev;
 
 	uint64_t rng_state[2] = { VDEV_DRAID_SEED, map->dm_seed };
 	uint8_t *source = identity;
 
 	/*
 	 * Each row starts as a copy of the row before it (the identity row
 	 * for row zero) and is then shuffled in place with Fisher-Yates.
 	 */
 	for (int p = 0; p < nperms; p++) {
 		uint8_t *row = &perms[p * children];
 
 		memcpy(row, source, rowsz);
 
 		for (int hi = children - 1; hi > 0; hi--) {
 			uint64_t lo = vdev_draid_rand(rng_state) % (hi + 1);
 			uint8_t tmp = row[hi];
 
 			row[hi] = row[lo];
 			row[lo] = tmp;
 		}
 
 		source = row;
 	}
 
 	kmem_free(identity, rowsz);
 
 	int error = verify_perms(perms, children, nperms, map->dm_checksum);
 	if (error == 0) {
 		*permsp = perms;
 		return (0);
 	}
 
 	/* The result failed validation, do not hand it to the caller. */
 	vmem_free(perms, permssz);
 
 	return (error);
 }
 
 /*
  * Find the built-in draid_map_t whose dm_children equals the requested
  * child count.  Stores a pointer to it in *mapp and returns 0, or returns
  * ENOENT (leaving *mapp untouched) when no such map exists.
  */
 int
 vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
 {
 	const draid_map_t *match = NULL;
 
 	for (int idx = 0; idx < VDEV_DRAID_MAX_MAPS && match == NULL; idx++) {
 		if (draid_maps[idx].dm_children == children)
 			match = &draid_maps[idx];
 	}
 
 	if (match == NULL)
 		return (ENOENT);
 
 	*mapp = match;
 
 	return (0);
 }
 
 /*
  * Resolve a permutation index into a row of the permutation array and an
  * iteration id.  pindex is first reduced modulo (vdc_nperms * vdc_children);
  * the quotient by vdc_children then selects the row returned in *base and
  * the remainder is returned in *iter.
  */
 static void
 vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex,
     uint8_t **base, uint64_t *iter)
 {
 	uint64_t width = vdc->vdc_children;
 	uint64_t wrapped = pindex % (vdc->vdc_nperms * width);
 	uint64_t row = wrapped / width;
 
 	*base = vdc->vdc_perms + row * width;
 	*iter = wrapped % width;
 }
 
 /*
  * Map a position within a permutation row to a child id: the row entry at
  * 'index' is offset by 'iter' and wrapped to the number of children.
  */
 static inline uint64_t
 vdev_draid_permute_id(vdev_draid_config_t *vdc,
     uint8_t *base, uint64_t iter, uint64_t index)
 {
 	uint64_t shifted = base[index] + iter;
 
 	return (shifted % vdc->vdc_children);
 }
 
 /*
  * Convert a psize to the asize it occupies on this dRAID.  The psize is
  * rounded up to a whole number of stripes, where each stripe holds
  * vdc_ndata sectors of data and occupies vdc_groupwidth sectors.
  *
  * The txg argument is accepted but not used.
  */
 static uint64_t
-vdev_draid_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
+vdev_draid_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	(void) txg;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_ashift;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	/*
 	 * Number of stripes needed, i.e. psize divided by the data bytes per
 	 * stripe, rounded up.
 	 * NOTE(review): psize == 0 would wrap in (psize - 1); presumably
 	 * callers never pass zero -- confirm.
 	 */
 	uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1;
 	/* Every stripe consumes groupwidth sectors of (1 << ashift) bytes. */
 	uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift;
 
 	ASSERT3U(asize, !=, 0);
 	ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0);
 
 	return (asize);
 }
 
 /*
  * Deflate an asize to the corresponding psize.  Only vdc_ndata out of
  * every vdc_groupwidth units of asize are counted, which strips the
  * non-data (parity) portion.  asize is asserted to be a multiple of
  * vdc_groupwidth.
  *
  * The txg argument is accepted but not used.
  */
 uint64_t
-vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
+vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
 {
+	(void) txg;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT0(asize % vdc->vdc_groupwidth);
 
 	return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata);
 }
 
 /*
  * Return the number of the redundancy group containing a logical offset,
  * which is the offset divided by the group size (vdc_groupsz).
  */
 static uint64_t
 vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset)
 {
 	vdev_draid_config_t *cfg = vd->vdev_tsd;
 	uint64_t groupsz = cfg->vdc_groupsz;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	return (offset / groupsz);
 }
 
 /*
  * Return the logical offset at which the given redundancy group begins,
  * which is the group number multiplied by the group size (vdc_groupsz).
  */
 static uint64_t
 vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
 {
 	vdev_draid_config_t *cfg = vd->vdev_tsd;
 	uint64_t groupsz = cfg->vdc_groupsz;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	return (group * groupsz);
 }
 
 /*
  * Full stripe writes.  When writing, all columns (D+P) are required.  Parity
  * is calculated over all the columns, including empty zero filled sectors,
  * and each is written to disk.  While only the data columns are needed for
  * a normal read, all of the columns are required for reconstruction when
  * performing a sequential resilver.
  *
  * For "big columns" it's sufficient to map the correct range of the zio ABD.
  * Partial columns require allocating a gang ABD in order to zero fill the
  * empty sectors.  When the column is empty a zero filled sector must be
  * mapped.  In all cases the data ABDs must be the same size as the parity
  * ABDs (e.g. rc->rc_size == parity_size).
  *
  * abd_offset is the offset within zio->io_abd at which this row's data
  * begins.  On return every data column has rc_abd set and rc_size equal
  * to the parity column size.
  */
 static void
 vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	/* One sector; the size of the padding added to a short column. */
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	/* Column 0 is used as the reference size for every column. */
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	uint64_t abd_off = abd_offset;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small write), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			rc->rc_abd = abd_get_zeros(skip_size);
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column" */
 			rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 			    zio->io_abd, abd_off, rc->rc_size);
 		} else {
 			/* short data column, add a skip sector */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    zio->io_abd, abd_off, rc->rc_size), B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size),
 			    B_TRUE);
 		}
 
 		ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size);
 
 		/*
 		 * Advance by the amount of zio data this column consumed
 		 * (the original rc_size) before widening rc_size to the
 		 * padded size.  The order of these two statements matters.
 		 */
 		abd_off += rc->rc_size;
 		rc->rc_size = parity_size;
 	}
 
 	/* With a non-zero abd_offset this row must end at the zio's end. */
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 }
 
 /*
  * Scrub/resilver reads.  In order to store the contents of the skip sectors
  * an additional ABD is allocated.  The columns are handled in the same way
  * as a full stripe write except instead of using the zero ABD the newly
  * allocated skip ABD is used to back the skip sectors.  In all cases the
  * data ABD must be the same size as the parity ABDs.
  *
  * abd_offset is the offset within zio->io_abd at which this row's data
  * begins.  The skip ABD is stored in rr->rr_abd_empty and holds one sector
  * for each of the rr->rr_nempty skip sectors.
  */
 static void
 vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	uint64_t abd_off = abd_offset;
 	/* Next unused byte within rr->rr_abd_empty. */
 	uint64_t skip_off = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, ==, NULL);
 
 	if (rr->rr_nempty > 0) {
 		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
 		    B_FALSE);
 	}
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small read), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
 			    skip_off, skip_size);
 			skip_off += skip_size;
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column" */
 			rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 			    zio->io_abd, abd_off, rc->rc_size);
 		} else {
 			/* short data column, add a skip sector */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    zio->io_abd, abd_off, rc->rc_size), B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
 			skip_off += skip_size;
 		}
 
 		uint64_t abd_size = abd_get_size(rc->rc_abd);
 		ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
 
 		/*
 		 * Increase rc_size so the skip ABD is included in subsequent
 		 * parity calculations.  abd_off must be advanced first, while
 		 * rc_size still reflects only the zio data in this column.
 		 */
 		abd_off += rc->rc_size;
 		rc->rc_size = abd_size;
 	}
 
 	/* With a non-zero abd_offset this row must end at the zio's end. */
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 	/* Every sector of the skip ABD must have been handed out. */
 	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
 }
 
 /*
  * Normal reads.  This is the common case, where only the columns which
  * contain data are mapped onto the zio ABD.  Neither the parity columns
  * nor the empty skip sectors are read unless the checksum fails to verify,
  * in which case vdev_raidz_read_all() calls vdev_draid_map_alloc_empty()
  * to expand the raid map so the parity data and skip sectors can be used
  * for reconstruction.
  *
  * abd_offset is the offset within zio->io_abd at which this row's data
  * begins.  Columns with an rc_size of zero are left without an ABD.
  */
 static void
 vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
 {
 	uint64_t abd_off = abd_offset;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/* Nothing to map for an empty column. */
 		if (rc->rc_size == 0)
 			continue;
 
 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, abd_off, rc->rc_size);
 		abd_off += rc->rc_size;
 	}
 
 	IMPLY(abd_offset != 0, abd_off == zio->io_size);
 }
 
 /*
  * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
  * difference is that an ABD is allocated to back skip sectors so they may
  * be read in to memory, verified, and repaired if needed.
  *
  * The row must not already have a skip ABD (rr_abd_empty is asserted to be
  * NULL).  On return every data column's rc_size equals the size of the
  * first column in the row.
  */
 void
 vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* Next unused byte within rr->rr_abd_empty. */
 	uint64_t skip_off = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, ==, NULL);
 
 	if (rr->rr_nempty > 0) {
 		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
 		    B_FALSE);
 	}
 
 	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_size == 0) {
 			/* empty data column (small read), add a skip sector */
 			ASSERT3U(skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			ASSERT3P(rc->rc_abd, ==, NULL);
 			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
 			    skip_off, skip_size);
 			skip_off += skip_size;
 		} else if (rc->rc_size == parity_size) {
 			/* this is a "big column", nothing to add */
 			ASSERT3P(rc->rc_abd, !=, NULL);
 		} else {
 			/*
 			 * short data column, add a skip sector and clear
 			 * rc_tried to force the entire column to be re-read
 			 * thereby including the missing skip sector data
 			 * which is needed for reconstruction.
 			 */
 			ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
 			ASSERT3U(rr->rr_nempty, !=, 0);
 			ASSERT3P(rc->rc_abd, !=, NULL);
 			ASSERT(!abd_is_gang(rc->rc_abd));
 			/*
 			 * The existing data ABD becomes the first member of a
 			 * new gang ABD, followed by this column's skip sector.
 			 */
 			abd_t *read_abd = rc->rc_abd;
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
 			abd_gang_add(rc->rc_abd, abd_get_offset_size(
 			    rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
 			skip_off += skip_size;
 			rc->rc_tried = 0;
 		}
 
 		/*
 		 * Increase rc_size so the empty ABD is included in subsequent
 		 * parity calculations.
 		 */
 		rc->rc_size = parity_size;
 	}
 
 	/* Every sector of the skip ABD must have been handed out. */
 	ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
 }
 
 /*
  * Verify that all empty sectors are zero filled before using them to
  * calculate parity.  Otherwise, silent corruption in an empty sector will
  * result in bad parity being generated.  That bad parity will then be
  * considered authoritative and overwrite the good parity on disk.  This
  * is possible because the checksum is only calculated over the data,
  * thus it cannot be used to detect damage in empty sectors.
  *
  * A skip sector found to be non-zero is reported as a checksum error, is
  * zeroed in memory, and its column's rc_error is set to ECKSUM.  Returns
  * the number of columns for which this happened.
  */
 int
 vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr)
 {
 	uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
 	uint64_t parity_size = rr->rr_col[0].rc_size;
 	/* The sector examined is the last skip_size bytes of each column. */
 	uint64_t skip_off = parity_size - skip_size;
 	/* Running total of skip bytes visited; only read by the ASSERT. */
 	uint64_t empty_off = 0;
 	int ret = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 	ASSERT3P(rr->rr_abd_empty, !=, NULL);
 	ASSERT3U(rr->rr_bigcols, >, 0);
 
 	/* Reference sector of zeros to compare against. */
 	void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP);
 
 	/* Every column from rr_bigcols onward is treated as padded. */
 	for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		ASSERT3P(rc->rc_abd, !=, NULL);
 		ASSERT3U(rc->rc_size, ==, parity_size);
 
 		if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off,
 		    skip_size) != 0) {
 			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
 			abd_zero_off(rc->rc_abd, skip_off, skip_size);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
 		}
 
 		empty_off += skip_size;
 	}
 
 	/* The sectors visited must account for the whole skip ABD. */
 	ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty));
 
 	kmem_free(zero_buf, skip_size);
 
 	return (ret);
 }
 
 /*
  * Given a logical address within a dRAID configuration, return the physical
  * address on the first drive in the group that this address maps to
  * (at position 'start' in permutation number 'perm').
  *
  * Outputs:
  *   *perm  - the permutation number, i.e. the group number divided by
  *            vdc_ngroups.
  *   *start - the position at which the group begins within the
  *            permutation, in the range [0, vdc_ndisks).
  *
  * The return value is a byte offset.
  */
 static uint64_t
 vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
     uint64_t *perm, uint64_t *start)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	/* b is the dRAID (parent) sector offset. */
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t b_offset = logical_offset >> ashift;
 
 	/*
 	 * The height of a row in units of the vdev's minimum sector size.
 	 * This is the amount of data written to each disk of each group
 	 * in a given permutation.
 	 */
 	uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
 
 	/*
 	 * We cycle through a disk permutation every groupsz * ngroups chunk
 	 * of address space. Note that ngroups * groupsz must be a multiple
 	 * of the number of data drives (ndisks) in order to guarantee
 	 * alignment. So, for example, if our row height is 16MB, our group
 	 * size is 10, and there are 13 data drives in the draid, then ngroups
 	 * will be 13, we will change permutation every 2.08GB and each
 	 * disk will have 160MB of data per chunk.
 	 */
 	uint64_t groupwidth = vdc->vdc_groupwidth;
 	uint64_t ngroups = vdc->vdc_ngroups;
 	uint64_t ndisks = vdc->vdc_ndisks;
 
 	/*
 	 * groupstart is where the group this IO will land in "starts" in
 	 * the permutation array.
 	 */
 	uint64_t group = logical_offset / vdc->vdc_groupsz;
 	uint64_t groupstart = (group * groupwidth) % ndisks;
 	/* Algebraically this checks that groupwidth <= ndisks. */
 	ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
 	*start = groupstart;
 
 	/* b_offset is the sector offset within a group chunk */
 	b_offset = b_offset % (rowheight_sectors * groupwidth);
 	ASSERT0(b_offset % groupwidth);
 
 	/*
 	 * Find the starting byte offset on each child vdev:
 	 * - within a permutation there are ngroups groups spread over the
 	 *   rows, where each row covers a slice portion of the disk
 	 * - each permutation has (groupwidth * ngroups) / ndisks rows
 	 * - so each permutation covers rows * slice portion of the disk
 	 * - so we need to find the row where this IO group target begins
 	 */
 	*perm = group / ngroups;
 	uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
 	    (((group % ngroups) * groupwidth) / ndisks);
 
 	/*
 	 * Sector offset of the row plus the sector offset within the row,
 	 * converted back to bytes.
 	 */
 	return (((rowheight_sectors * row) +
 	    (b_offset / groupwidth)) << ashift);
 }
 
 /*
  * Allocate and fill in one raidz_row_t covering the part of the zio which
  * falls within the redundancy group containing io_offset.
  *
  *   zio        - the I/O being mapped
  *   rrp        - receives the newly allocated row
  *   io_offset  - logical offset at which this row begins
  *   abd_offset - offset within zio->io_abd of this row's data
  *   abd_size   - number of bytes of zio data still to be mapped
  *
  * Returns the number of bytes of zio data covered by the row.  This is
  * smaller than abd_size when the I/O would extend beyond the end of the
  * group, in which case the caller maps the remainder with a second row.
  */
 static uint64_t
 vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
     uint64_t abd_offset, uint64_t abd_size)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t io_size = abd_size;
-	uint64_t io_asize = vdev_draid_asize(vd, io_size, 0);
+	uint64_t io_asize = vdev_draid_psize_to_asize(vd, io_size, 0);
 	uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
 	/* Logical offset at which the following group begins. */
 	uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
 
 	/*
 	 * Limit the io_size to the space remaining in the group.  A second
 	 * row in the raidz_map_t is created for the remainder.
 	 */
 	if (io_offset + io_asize > start_offset) {
 		io_size = vdev_draid_asize_to_psize(vd,
-		    start_offset - io_offset);
+		    start_offset - io_offset, 0);
 	}
 
 	/*
 	 * At most a block may span the logical end of one group and the start
 	 * of the next group. Therefore, at the end of a group the io_size must
 	 * span the group width evenly and the remainder must be aligned to the
 	 * start of the next group.
 	 */
 	IMPLY(abd_offset == 0 && io_size < zio->io_size,
 	    (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
 	IMPLY(abd_offset != 0,
 	    vdev_draid_group_to_offset(vd, group) == io_offset);
 
 	/* Lookup starting byte offset on each child vdev */
 	uint64_t groupstart, perm;
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    io_offset, &perm, &groupstart);
 
 	/*
 	 * If there is less than groupwidth drives available after the group
 	 * start, the group is going to wrap onto the next row. 'wrap' is the
 	 * group disk number that starts on the next row.
 	 */
 	uint64_t ndisks = vdc->vdc_ndisks;
 	uint64_t groupwidth = vdc->vdc_groupwidth;
 	uint64_t wrap = groupwidth;
 
 	if (groupstart + groupwidth > ndisks)
 		wrap = ndisks - groupstart;
 
 	/* The io size in units of the vdev's minimum sector size. */
 	const uint64_t psize = io_size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = psize / vdc->vdc_ndata;
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = psize - q * vdc->vdc_ndata;
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
 	ASSERT3U(bc, <, groupwidth);
 
 	/* The total number of data and parity sectors for this I/O. */
 	uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
 
 	ASSERT3U(vdc->vdc_nparity, >, 0);
 
 	raidz_row_t *rr = vdev_raidz_row_alloc(groupwidth, zio);
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = vdc->vdc_nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = io_offset;
 	rr->rr_size = io_size;
 #endif
 	*rrp = rr;
 
 	uint8_t *base;
 	uint64_t iter, asize = 0;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 	/* Assign a child device, physical offset and size to every column. */
 	for (uint64_t i = 0; i < groupwidth; i++) {
 		raidz_col_t *rc = &rr->rr_col[i];
 		uint64_t c = (groupstart + i) % ndisks;
 
 		/* increment the offset if we wrap to the next row */
 		if (i == wrap)
 			physical_offset += VDEV_DRAID_ROWHEIGHT;
 
 		rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
 		rc->rc_offset = physical_offset;
 
 		/* Big columns carry one sector more than the remainder. */
 		if (q == 0 && i >= bc)
 			rc->rc_size = 0;
 		else if (i < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	/* Sectors needed to pad 'tot' up to a multiple of groupwidth. */
 	rr->rr_nempty = roundup(tot, groupwidth) - tot;
 	IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
 
 	/* Allocate buffers for the parity columns */
 	for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 	}
 
 	/*
 	 * Map buffers for data columns and allocate/map buffers for skip
 	 * sectors.  There are three distinct cases for dRAID which are
 	 * required to support sequential rebuild.
 	 */
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_draid_map_alloc_write(zio, abd_offset, rr);
 	} else if ((rr->rr_nempty > 0) &&
 	    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 		vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
 	} else {
 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 		vdev_draid_map_alloc_read(zio, abd_offset, rr);
 	}
 
 	return (io_size);
 }
 
 /*
  * Allocate the raidz mapping to be applied to the dRAID I/O.  The parity
  * calculations for dRAID are identical to raidz however there are a few
  * differences in the layout.
  *
  * - dRAID always allocates a full stripe width. Any extra sectors due to
  *   this padding are zero filled and written to disk. They will be read
  *   back during a scrub or repair operation since they are included in
  *   the parity calculation. This property enables sequential resilvering.
  *
  * - When the block at the logical offset spans redundancy groups then two
  *   rows are allocated in the raidz_map_t. One row resides at the end of
  *   the first group and the other at the start of the following group.
  *
  * Returns a newly allocated raidz_map_t holding one or two rows.
  */
 static raidz_map_t *
 vdev_draid_map_alloc(zio_t *zio)
 {
 	raidz_row_t *rr[2];
 	uint64_t abd_offset = 0;
 	uint64_t abd_size = zio->io_size;
 	uint64_t io_offset = zio->io_offset;
 	uint64_t size;
 	int nrows = 1;
 
 	size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset,
 	    abd_offset, abd_size);
 	/* The first row was cut short; map the remainder as a second row. */
 	if (size < abd_size) {
 		vdev_t *vd = zio->io_vd;
 
 		/* Step over the allocated size of the first row. */
-		io_offset += vdev_draid_asize(vd, size, 0);
+		io_offset += vdev_draid_psize_to_asize(vd, size, 0);
 		abd_offset += size;
 		abd_size -= size;
 		nrows++;
 
 		/* The second row must begin exactly on a group boundary. */
 		ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
 		    vd, vdev_draid_offset_to_group(vd, io_offset)));
 		ASSERT3U(abd_offset, <, zio->io_size);
 		ASSERT3U(abd_size, !=, 0);
 
 		size = vdev_draid_map_alloc_row(zio, &rr[1],
 		    io_offset, abd_offset, abd_size);
 		/* A third row is never needed. */
 		VERIFY3U(size, ==, abd_size);
 	}
 
 	raidz_map_t *rm;
 	/* Size the map for exactly nrows row pointers. */
 	rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP);
 	rm->rm_ops = vdev_raidz_math_get_ops();
 	rm->rm_nrows = nrows;
 	rm->rm_row[0] = rr[0];
 	if (nrows == 2)
 		rm->rm_row[1] = rr[1];
 	return (rm);
 }
 
 /*
  * Round an offset into a dRAID up to the next offset at which an
  * allocation may begin, i.e. the next multiple of the group width
  * expressed in bytes (vdc_groupwidth << vdev_ashift).
  */
 static uint64_t
 vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
 {
 	vdev_draid_config_t *cfg = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t stripe_bytes = cfg->vdc_groupwidth << vd->vdev_ashift;
 
 	return (roundup(start, stripe_bytes));
 }
 
 /*
  * Return the minimum asize required of each child.  The dRAID's
  * vdev_min_asize is divided evenly between vdc_ndisks children, rounding
  * up, and VDEV_DRAID_REFLOW_RESERVE is added on top of each share.
  */
 static uint64_t
 vdev_draid_min_asize(vdev_t *vd)
 {
 	vdev_draid_config_t *cfg = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t ndisks = cfg->vdc_ndisks;
 	uint64_t share = (vd->vdev_min_asize + ndisks - 1) / ndisks;
 
 	return (VDEV_DRAID_REFLOW_RESERVE + share);
 }
 
 /*
  * Return the minimum allocation size for a dRAID.  Full stripes are
  * always used, so this is the number of data disks in a redundancy group
  * (vdc_ndata) multiplied by the sector size (1 << vdev_ashift).
  */
 static uint64_t
 vdev_draid_min_alloc(vdev_t *vd)
 {
 	vdev_draid_config_t *cfg = vd->vdev_tsd;
 	uint64_t ashift = vd->vdev_ashift;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	return (cfg->vdc_ndata << ashift);
 }
 
 /*
  * Returns true if the txg range does not exist on any leaf vdev.
  *
  * A dRAID spare does not fit into the DTL model.  It has child vdevs but
  * no redundancy among them, and the effective child is determined by the
  * offset.  In effect a vdev_dtl_reassess() is performed on the fly by
  * substituting the child vdev found under the offset for the dRAID spare.
  * This is recursive since that child may itself be another dRAID spare.
  */
 boolean_t
 vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
     uint64_t size)
 {
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		/*
 		 * A sequential resilver has no proper txg range, so every
 		 * txg is presumed missing on this vdev until it completes.
 		 * Otherwise DTL_MISSING, which is set for all prior txgs
 		 * when a resilver is started in spa_vdev_attach(), decides.
 		 */
 		if (vd->vdev_rebuild_txg != 0 ||
 		    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 			return (B_TRUE);
 
 		/*
 		 * Consult the DTL of the vdev backing this offset.  It may
 		 * be a leaf or a spare/replacing mirror child, hence the
 		 * recursive call to vdev_draid_missing().
 		 */
 		vdev_t *cvd = vdev_draid_spare_get_child(vd, physical_offset);
 		if (cvd == NULL)
 			return (B_TRUE);
 
 		return (vdev_draid_missing(cvd, physical_offset, txg, size));
 	}
 
 	/* Any other vdev which is not a spare/replacing mirror. */
 	if (vd->vdev_ops != &vdev_spare_ops &&
 	    vd->vdev_ops != &vdev_replacing_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	/*
 	 * Spare or replacing vdev.  The data is not missing if any one of
 	 * the readable children contains the txg range.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (vdev_readable(cvd) &&
 		    !vdev_draid_missing(cvd, physical_offset, txg, size))
 			return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Returns true if the txg is only partially replicated on the leaf vdevs.
  */
 static boolean_t
 vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
     uint64_t size)
 {
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		/*
 		 * A sequential resilver has no proper txg range, so every
 		 * txg is presumed missing on this vdev until it completes.
 		 * Otherwise DTL_MISSING, which is set for all prior txgs
 		 * when a resilver is started in spa_vdev_attach(), decides.
 		 */
 		if (vd->vdev_rebuild_txg != 0 ||
 		    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
 			return (B_TRUE);
 
 		/*
 		 * Consult the DTL of the vdev backing this offset.  It may
 		 * be a leaf or a spare/replacing mirror child, hence the
 		 * recursive call to vdev_draid_partial().
 		 */
 		vdev_t *cvd = vdev_draid_spare_get_child(vd, physical_offset);
 		if (cvd == NULL)
 			return (B_TRUE);
 
 		return (vdev_draid_partial(cvd, physical_offset, txg, size));
 	}
 
 	/* Any other vdev which is not a spare/replacing mirror. */
 	if (vd->vdev_ops != &vdev_spare_ops &&
 	    vd->vdev_ops != &vdev_replacing_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	/*
 	 * Spare or replacing vdev.  The txg range is partially replicated
 	 * if any one of the readable children reports it as partial.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (vdev_readable(cvd) &&
 		    vdev_draid_partial(cvd, physical_offset, txg, size))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Determine if the vdev is readable at the given offset.
  *
  * A distributed spare is first resolved to the vdev backing the offset
  * (not readable if that fails).  A spare/replacing mirror is readable
  * when at least one of its readable children is readable at the offset.
  */
 boolean_t
 vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
 {
 	vdev_t *tvd = vd;
 
 	if (tvd->vdev_ops == &vdev_draid_spare_ops) {
 		tvd = vdev_draid_spare_get_child(tvd, physical_offset);
 		if (tvd == NULL)
 			return (B_FALSE);
 	}
 
 	/* Anything other than a spare/replacing mirror is checked as is. */
 	if (tvd->vdev_ops != &vdev_spare_ops &&
 	    tvd->vdev_ops != &vdev_replacing_ops)
 		return (vdev_readable(tvd));
 
 	for (int c = 0; c < tvd->vdev_children; c++) {
 		vdev_t *cvd = tvd->vdev_child[c];
 
 		if (vdev_readable(cvd) &&
 		    vdev_draid_readable(cvd, physical_offset))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Depth-first search of the vdev tree rooted at vd for a distributed
  * spare.  Returns the first one encountered (vd itself included), or
  * NULL when the tree contains none.
  */
 static vdev_t *
 vdev_draid_find_spare(vdev_t *vd)
 {
 	vdev_t *found = NULL;
 
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (vd);
 
 	/* Stop descending as soon as any child subtree yields a spare. */
 	for (int c = 0; found == NULL && c < vd->vdev_children; c++)
 		found = vdev_draid_find_spare(vd->vdev_child[c]);
 
 	return (found);
 }
 
 /*
  * Returns B_TRUE if the passed in vdev is currently "faulted", which in
  * this context means it is a replacing or sparing vdev.  For a distributed
  * spare the test is applied to the parent of the vdev the spare resolves
  * to at physical_offset; an unresolvable spare is not considered faulted.
  */
 static boolean_t
 vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
 {
 	vdev_t *tvd = vd;
 
 	if (vd->vdev_ops == &vdev_draid_spare_ops) {
 		vdev_t *cvd = vdev_draid_spare_get_child(vd, physical_offset);
 
 		if (cvd == NULL)
 			return (B_FALSE);
 
 		/* Judge the resolved vdev by the kind of parent it has. */
 		tvd = cvd->vdev_parent;
 	}
 
 	if (tvd->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	return (tvd->vdev_ops == &vdev_spare_ops);
 }
 
 /*
  * Determine if the dRAID block at the logical offset is degraded.
  * Used by sequential resilver.
  *
  * The offset must already be group aligned (asserted).  The redundancy
  * group is degraded when any member child is "faulted" or has a
  * distributed spare anywhere beneath it.
  */
 static boolean_t
 vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t groupstart, perm, iter;
 	uint8_t *base;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
 
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    offset, &perm, &groupstart);
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
 		    (groupstart + i) % vdc->vdc_ndisks);
 		vdev_t *cvd = vd->vdev_child[cid];
 
 		/*
 		 * Either the group contains a faulted vdev, or it has an
 		 * active distributed spare.  The latter are always checked
 		 * because any vdev failure in the pool will affect them.
 		 */
 		if (vdev_draid_faulted(cvd, physical_offset) ||
 		    vdev_draid_find_spare(cvd) != NULL)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Determine if the txg is missing.  Used by healing resilver.
  *
  * The offset must already be group aligned (asserted).  The txg range is
  * reported missing when any child in the redundancy group is partially
  * replicated for it, or has a distributed spare anywhere beneath it.
  */
 static boolean_t
 vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
     uint64_t size)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t groupstart, perm, iter;
 	uint8_t *base;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
 
 	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
 	    offset, &perm, &groupstart);
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
 		    (groupstart + i) % vdc->vdc_ndisks);
 		vdev_t *cvd = vd->vdev_child[cid];
 
 		/*
 		 * Either the transaction group is known to be partially
 		 * replicated, or the group has an active distributed spare.
 		 * The latter are always checked because any vdev failure in
 		 * the pool will affect them.
 		 */
 		if (vdev_draid_partial(cvd, physical_offset, txg, size) ||
 		    vdev_draid_find_spare(cvd) != NULL)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Find the smallest child asize and largest sector size to calculate the
  * available capacity.  Distributed spares are ignored since their capacity
  * is also based on the minimum child size in the top-level dRAID.
  *
  * Outputs (all written unconditionally):
  *   *asizep           - smallest vdev_asize among the non-spare children
  *   *max_asizep       - smallest vdev_max_asize among the non-spare children
  *   *logical_ashiftp  - largest vdev_ashift among the non-spare children
  *   *physical_ashiftp - result of folding vdev_best_ashift() over the
  *                       children's vdev_physical_ashift values
  */
 static void
 vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
     uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
 {
 	uint64_t logical_ashift = 0, physical_ashift = 0;
 	uint64_t asize = 0, max_asize = 0;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_ops == &vdev_draid_spare_ops)
 			continue;
 
 		/*
 		 * Running minimum where 0 means "not set yet".  Subtracting
 		 * one from an unsigned zero wraps to the largest uint64_t,
 		 * so a zero operand never wins the MIN; adding one back
 		 * restores the chosen value.
 		 */
 		asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
 		max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 		logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
 	}
 	/*
 	 * Second pass: vdev_best_ashift() takes logical_ashift as an input,
 	 * so it is only called once the maximum over all children is known.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_ops == &vdev_draid_spare_ops)
 			continue;
 		physical_ashift = vdev_best_ashift(logical_ashift,
 		    physical_ashift, cvd->vdev_physical_ashift);
 	}
 
 	*asizep = asize;
 	*max_asizep = max_asize;
 	*logical_ashiftp = logical_ashift;
 	*physical_ashiftp = physical_ashift;
 }
 
 /*
  * Child filter handed to vdev_open_children_subset() by vdev_draid_open():
  * selects distributed spare, replacing and spare vdevs.
  */
 static boolean_t
 vdev_draid_open_spares(vdev_t *vd)
 {
 	if (vd->vdev_ops == &vdev_draid_spare_ops)
 		return (B_TRUE);
 
 	if (vd->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 
 	if (vd->vdev_ops == &vdev_spare_ops)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Child filter handed to vdev_open_children_subset() by vdev_draid_open():
  * selects every child that vdev_draid_open_spares() does not.
  */
 static boolean_t
 vdev_draid_open_children(vdev_t *vd)
 {
 	if (vdev_draid_open_spares(vd))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Open a top-level dRAID vdev.
  *
  * Opens the children, verifies that no more than nparity of them failed
  * to open, and derives the allocatable sizes and ashift values.
  *
  * Outputs: *asize and *max_asize receive the usable capacity computed from
  * the smallest child; *logical_ashift and *physical_ashift are filled in by
  * vdev_draid_calculate_asize().
  *
  * Returns 0 on success, EINVAL (vs_aux = VDEV_AUX_BAD_LABEL) when the
  * parity/child counts are inconsistent, ENXIO (vs_aux =
  * VDEV_AUX_NO_REPLICAS) when more than nparity children failed to open,
  * and ENXIO when a child is smaller than VDEV_DRAID_REFLOW_RESERVE.
  */
 static int
 vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_draid_config_t *vdc =  vd->vdev_tsd;
 	uint64_t nparity = vdc->vdc_nparity;
 	int open_errors = 0;
 
 	/* At least one child beyond the parity count is required. */
 	if (nparity > VDEV_DRAID_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * First open the normal children then the distributed spares.  This
 	 * ordering is important to ensure the distributed spares calculate
 	 * the correct psize in the event that the dRAID vdevs were expanded.
 	 */
 	vdev_open_children_subset(vd, vdev_draid_open_children);
 	vdev_open_children_subset(vd, vdev_draid_open_spares);
 
 	/* Verify enough of the children are available to continue. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_open_error != 0) {
 			if ((++open_errors) > nparity) {
 				vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 				return (SET_ERROR(ENXIO));
 			}
 		}
 	}
 
 	/*
 	 * Allocatable capacity is the sum of the space on all children less
 	 * the number of distributed spares rounded down to last full row
 	 * and then to the last full group. An additional 32MB of scratch
 	 * space is reserved at the end of each child for use by the dRAID
 	 * expansion feature.
 	 */
 	uint64_t child_asize, child_max_asize;
 	vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
 	    logical_ashift, physical_ashift);
 
 	/*
 	 * Should be unreachable since the minimum child size is 64MB, but
 	 * we want to make sure an underflow absolutely cannot occur here.
 	 */
 	if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
 	    child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	/* Drop the reserve, then round down to a whole number of rows. */
 	child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
 	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
 	child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
 	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
 
 	/* Scale by the non-spare disk count and round down to whole groups. */
 	*asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
 	    vdc->vdc_groupsz);
 	*max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
 	    vdc->vdc_groupsz);
 
 	return (0);
 }
 
 /*
  * Close a top-level dRAID vdev by closing each child that is present.
  */
 static void
 vdev_draid_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		/* Empty child slots are simply skipped. */
 		if (cvd == NULL)
 			continue;
 
 		vdev_close(cvd);
 	}
 }
 
 /*
  * Return the maximum asize for a rebuild zio in the provided range
  * given the following constraints.  A dRAID chunk may not:
  *
  * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
  * - Span dRAID redundancy groups.
  *
  * start must be group aligned and asize must be a multiple of the full
  * stripe size (groupwidth << ashift); both are asserted.  max_segment is
  * scaled by the number of data columns to form the candidate psize.
  */
 static uint64_t
 vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
     uint64_t max_segment)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t ashift = vd->vdev_ashift;
 	uint64_t ndata = vdc->vdc_ndata;
 	/* Sector-align max_segment * ndata and cap it at the max block size. */
 	uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
 	    SPA_MAXBLOCKSIZE);
 
 	ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
 	ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
 
 	/* Chunks must evenly span all data columns in the group. */
 	psize = (((psize >> ashift) / ndata) * ndata) << ashift;
 	uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
 
 	/* Reduce the chunk size to the group space remaining. */
 	uint64_t group = vdev_draid_offset_to_group(vd, start);
 	uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
 	chunk_size = MIN(chunk_size, left);
 
 	/* The result is whole stripes and stays inside a single group. */
 	ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
 	ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
 	    vdev_draid_offset_to_group(vd, start + chunk_size - 1));
 
 	return (chunk_size);
 }
 
 /*
  * Adjust a metaslab so that it starts on a group width boundary and its
  * size is a multiple of the group width.  dRAID requires full stripe
  * writes, so the trimmed space is unallocable anyway.  Aligning the start
  * also matters for vdev initialize and TRIM, which operate on metaslab
  * boundaries that vdev_xlate() expects to be aligned.
  *
  * *ms_start and *ms_size are updated in place.
  */
 static void
 vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 
 	uint64_t stripe = vdc->vdc_groupwidth << vd->vdev_ashift;
 	uint64_t aligned_start = vdev_draid_get_astart(vd, *ms_start);
 
 	/* Space lost to moving the start, then round down to full stripes. */
 	uint64_t skipped = aligned_start - *ms_start;
 	uint64_t aligned_size = ((*ms_size - skipped) / stripe) * stripe;
 
 	*ms_size = aligned_size;
 	*ms_start = aligned_start;
 
 	ASSERT0(*ms_start % stripe);
 	ASSERT0(*ms_size % stripe);
 }
 
 /*
  * Add virtual dRAID spares to the list of valid spares. In order to accomplish
  * this the existing array must be freed and reallocated with the additional
  * entries.
  *
  * nvroot       - config nvlist whose ZPOOL_CONFIG_SPARES array is replaced
  * vd           - vdev whose direct children are scanned for dRAID vdevs
  * ndraidp      - out: number of dRAID children found under vd
  * next_vdev_id - added to each child's index when naming its spares
  *
  * Every path through this function returns 0.
  */
 int
 vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
     uint64_t next_vdev_id)
 {
 	uint64_t draid_nspares = 0;
 	uint64_t ndraid = 0;
 	int error;
 
 	/* Count the dRAID children and the spares they declare. */
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		vdev_t *cvd = vd->vdev_child[i];
 
 		if (cvd->vdev_ops == &vdev_draid_ops) {
 			vdev_draid_config_t *vdc = cvd->vdev_tsd;
 			draid_nspares += vdc->vdc_nspares;
 			ndraid++;
 		}
 	}
 
 	/* Nothing to add; leave nvroot untouched. */
 	if (draid_nspares == 0) {
 		*ndraidp = ndraid;
 		return (0);
 	}
 
 	nvlist_t **old_spares, **new_spares;
 	uint_t old_nspares;
 	error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &old_spares, &old_nspares);
 	/* A failed lookup is treated as "no existing spares". */
 	if (error)
 		old_nspares = 0;
 
 	/* Allocate memory and copy of the existing spares. */
 	new_spares = kmem_alloc(sizeof (nvlist_t *) *
 	    (draid_nspares + old_nspares), KM_SLEEP);
 	for (uint_t i = 0; i < old_nspares; i++)
 		new_spares[i] = fnvlist_dup(old_spares[i]);
 
 	/* Add new distributed spares to ZPOOL_CONFIG_SPARES. */
 	uint64_t n = old_nspares;
 	for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
 		vdev_t *cvd = vd->vdev_child[vdev_id];
 		char path[64];
 
 		if (cvd->vdev_ops != &vdev_draid_ops)
 			continue;
 
 		vdev_draid_config_t *vdc = cvd->vdev_tsd;
 		uint64_t nspares = vdc->vdc_nspares;
 		uint64_t nparity = vdc->vdc_nparity;
 
 		for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
 			/*
 			 * Spare name: VDEV_TYPE_DRAID followed by
 			 * <nparity>-<next_vdev_id + child index>-<spare id>.
 			 */
 			memset(path, 0, sizeof (path));
 			(void) snprintf(path, sizeof (path) - 1,
 			    "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
 			    (u_longlong_t)nparity,
 			    (u_longlong_t)next_vdev_id + vdev_id,
 			    (u_longlong_t)spare_id);
 
 			nvlist_t *spare = fnvlist_alloc();
 			fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
 			fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
 			    VDEV_TYPE_DRAID_SPARE);
 			fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
 			    cvd->vdev_guid);
 			fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
 			    spare_id);
 			fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
 			fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
 			fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
 			fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT,
 			    cvd->vdev_ashift);
 
 			new_spares[n] = spare;
 			n++;
 		}
 	}
 
 	/*
 	 * At this point n equals old_nspares + draid_nspares, i.e. the
 	 * number of slots allocated above.
 	 */
 	if (n > 0) {
 		(void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 		    (const nvlist_t **)new_spares, n);
 	}
 
 	/*
 	 * Release the local nvlists and the array holding them.
 	 * NOTE(review): this presumes fnvlist_add_nvlist_array() stored its
 	 * own copies in nvroot -- confirm against the nvlist API.
 	 */
 	for (int i = 0; i < n; i++)
 		nvlist_free(new_spares[i]);
 
 	kmem_free(new_spares, sizeof (*new_spares) * n);
 	*ndraidp = ndraid;
 
 	return (0);
 }
 
 /*
  * Determine if any portion of the provided block resides on a child vdev
  * with a dirty DTL and therefore needs to be resilvered.
  *
  * vd         - top-level dRAID vdev
  * dva        - block address; only its offset is used here
  * psize      - physical block size, converted to an allocated size
  * phys_birth - birth txg of the block, or TXG_UNKNOWN for a sequential
  *              resilver
  */
 static boolean_t
 vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	uint64_t offset = DVA_GET_OFFSET(dva);
-	uint64_t asize = vdev_draid_asize(vd, psize, 0);
+	uint64_t asize = vdev_draid_psize_to_asize(vd, psize, 0);
 
 	if (phys_birth == TXG_UNKNOWN) {
 		/*
 		 * Sequential resilver.  There is no meaningful phys_birth
 		 * for this block, we can only determine if block resides
 		 * in a degraded group in which case it must be resilvered.
 		 */
 		ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
 		    vdev_draid_offset_to_group(vd, offset + asize - 1));
 
 		return (vdev_draid_group_degraded(vd, offset));
 	} else {
 		/*
 		 * Healing resilver.  TXGs not in DTL_PARTIAL are intact,
 		 * as are blocks in non-degraded groups.
 		 */
 		if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 			return (B_FALSE);
 
 		if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
 			return (B_TRUE);
 
 		/*
 		 * The block may span groups in which case check both.
 		 *
 		 * NOTE(review): the group comparison uses the last byte
 		 * (offset + asize - 1) while the second lookup passes
 		 * offset + asize, and vdev_draid_group_missing() asserts
 		 * that its offset is group aligned -- confirm the end of a
 		 * spanning block always satisfies that.
 		 */
 		if (vdev_draid_offset_to_group(vd, offset) !=
 		    vdev_draid_offset_to_group(vd, offset + asize - 1)) {
 			if (vdev_draid_group_missing(vd,
 			    offset + asize, phys_birth, 1))
 				return (B_TRUE);
 		}
 
 		return (B_FALSE);
 	}
 }
 
 /*
  * Returns B_TRUE when vd, or any vdev beneath it, is a leaf with a
  * non-zero vdev_rebuild_txg.
  */
 static boolean_t
 vdev_draid_rebuilding(vdev_t *vd)
 {
 	boolean_t rebuilding = B_FALSE;
 
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
 		return (B_TRUE);
 
 	/* Walk the children until one of the subtrees reports a rebuild. */
 	for (int i = 0; !rebuilding && i < vd->vdev_children; i++)
 		rebuilding = vdev_draid_rebuilding(vd->vdev_child[i]);
 
 	return (rebuilding);
 }
 
 /*
  * Debug-only consistency check (compiled in under ZFS_DEBUG): translate the
  * logical range of the row through vdev_xlate() for the child backing
  * column col and assert that the result matches the column's physical
  * offset and size, with no untranslated remainder.
  */
 static void
 vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
 {
 #ifdef ZFS_DEBUG
 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 	/* Logical extent of the row: its offset plus its allocated size. */
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
-	    vdev_draid_asize(vd, rr->rr_size, 0);
+	    vdev_draid_psize_to_asize(vd, rr->rr_size, 0);
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 	ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
 #endif
 }
 
 /*
  * Start the writes for one row of a dRAID map:
  * 1. Generate the parity data for the row.
  * 2. Issue a child write zio for every column, data and parity alike.
  *    A gang ABD is allocated by vdev_draid_map_alloc() if a skip sector
  *    needs to be added to a column.
  */
 static void
 vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	vdev_t *vd = zio->io_vd;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd;
 		zio_t *cio;
 
 		/*
 		 * Empty columns are zero filled and included in the parity
 		 * calculation and therefore must be written.
 		 */
 		ASSERT3U(rc->rc_size, !=, 0);
 
 		/* Verify physical to logical translation */
 		vdev_draid_io_verify(vd, rr, c);
 
 		cvd = vd->vdev_child[rc->rc_devidx];
 		cio = zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
 		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
 		    0, vdev_raidz_child_done, rc);
 		zio_nowait(cio);
 	}
 }
 
 /*
  * For read operations:
  * 1. The vdev_draid_map_alloc() function will create a minimal raidz
  *    mapping for the read based on the zio->io_flags.  There are two
  *    possible mappings either 1) a normal read, or 2) a scrub/resilver.
  * 2. Create the zio read operations.  This will include all parity
  *    columns and skip sectors for a scrub/resilver.
  *
  * The row is processed in two passes.  The first classifies every column
  * (unreadable, stale, empty, repair policy) and counts missing data and
  * parity columns; the second issues the child reads for the columns that
  * were not marked with an error.
  */
 static void
 vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/* Sequential rebuild must do IO at redundancy group boundary. */
 	IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last.  Any errors along the way will force us to read the parity.
 	 * For scrub/resilver IOs which verify skip sectors, a gang ABD will
 	 * have been allocated to store them and rc->rc_size is increased.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/* Unreadable child: ENXIO, marked both tried and skipped. */
 		if (!vdev_draid_readable(cvd, rc->rc_offset)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		/* Child lacks this txg: ESTALE, skipped but not tried. */
 		if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		/*
 		 * Empty columns may be read during vdev_draid_io_done().
 		 * Only skip them after the readable and missing checks
 		 * verify they are available.
 		 */
 		if (rc->rc_size == 0) {
 			rc->rc_skipped = 1;
 			continue;
 		}
 
 		if (zio->io_flags & ZIO_FLAG_RESILVER) {
 			vdev_t *svd;
 
 			/*
 			 * Sequential rebuilds need to always consider the data
 			 * on the child being rebuilt to be stale.  This is
 			 * important when all columns are available to aid
 			 * known reconstruction in identifying which columns
 			 * contain incorrect data.
 			 *
 			 * Furthermore, all repairs need to be constrained to
 			 * the devices being rebuilt because without a checksum
 			 * we cannot verify the data is actually correct and
 			 * performing an incorrect repair could result in
 			 * locking in damage and making the data unrecoverable.
 			 */
 			if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
 				if (vdev_draid_rebuilding(cvd)) {
 					if (c >= rr->rr_firstdatacol)
 						rr->rr_missingdata++;
 					else
 						rr->rr_missingparity++;
 					rc->rc_error = SET_ERROR(ESTALE);
 					rc->rc_skipped = 1;
 					rc->rc_allow_repair = 1;
 					continue;
 				} else {
 					rc->rc_allow_repair = 0;
 				}
 			} else {
 				rc->rc_allow_repair = 1;
 			}
 
 			/*
 			 * If this child is a distributed spare then the
 			 * offset might reside on the vdev being replaced.
 			 * In which case this data must be written to the
 			 * new device.  Failure to do so would result in
 			 * checksum errors when the old device is detached
 			 * and the pool is scrubbed.
 			 */
 			if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
 				svd = vdev_draid_spare_get_child(svd,
 				    rc->rc_offset);
 				if (svd && (svd->vdev_ops == &vdev_spare_ops ||
 				    svd->vdev_ops == &vdev_replacing_ops)) {
 					rc->rc_force_repair = 1;
 
 					if (vdev_draid_rebuilding(svd))
 						rc->rc_allow_repair = 1;
 				}
 			}
 
 			/*
 			 * Always issue a repair IO to this child when its
 			 * a spare or replacing vdev with an active rebuild.
 			 */
 			if ((cvd->vdev_ops == &vdev_spare_ops ||
 			    cvd->vdev_ops == &vdev_replacing_ops) &&
 			    vdev_draid_rebuilding(cvd)) {
 				rc->rc_force_repair = 1;
 				rc->rc_allow_repair = 1;
 			}
 		}
 	}
 
 	/*
 	 * Either a parity or data column is missing this means a repair
 	 * may be attempted by vdev_draid_io_done().  Expand the raid map
 	 * to read in empty columns which are needed along with the parity
 	 * during reconstruction.
 	 */
 	if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
 	    rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
 		vdev_draid_map_alloc_empty(zio, rr);
 	}
 
 	/*
 	 * Second pass: issue the reads.  Data columns are always read;
 	 * parity columns only when data is missing or this is a
 	 * scrub/resilver.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		if (rc->rc_error || rc->rc_size == 0)
 			continue;
 
 		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 }
 
 /*
  * Start an IO operation to a dRAID vdev.
  *
  * Builds the raidz map for the zio, attaches it as the zio's vsd, starts
  * every row as either a write or a read, and then resumes the zio via
  * zio_execute().  The zio offset must be group aligned (asserted).
  */
 static void
 vdev_draid_io_start(zio_t *zio)
 {
 	vdev_t *vd __maybe_unused = zio->io_vd;
 	raidz_map_t *rm;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
 
 	rm = vdev_draid_map_alloc(zio);
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 
 	if (zio->io_type != ZIO_TYPE_WRITE) {
 		/* Only reads and writes are expected here. */
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 		for (int i = 0; i < rm->rm_nrows; i++)
 			vdev_draid_io_start_read(zio, rm->rm_row[i]);
 	} else {
 		for (int i = 0; i < rm->rm_nrows; i++)
 			vdev_draid_io_start_write(zio, rm->rm_row[i]);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Complete an IO operation on a dRAID vdev.  The raidz logic can be applied
  * to dRAID since the layout is fully described by the raidz_map_t.
  *
  * This is a direct pass-through to vdev_raidz_io_done().
  */
 static void
 vdev_draid_io_done(zio_t *zio)
 {
 	vdev_raidz_io_done(zio);
 }
 
 /*
  * Set the state of a dRAID vdev from its counts of faulted and degraded
  * children:
  *  - more faulted children than parity -> CANT_OPEN / NO_REPLICAS
  *  - any faulted or degraded child     -> DEGRADED
  *  - otherwise                         -> HEALTHY
  */
 static void
 vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	ASSERT(vd->vdev_ops == &vdev_draid_ops);
 
 	if (faulted > vdc->vdc_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded + faulted == 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 }
 
 /*
  * Translate a logical range on the parent dRAID vdev into the physical
  * range it occupies on the child cvd.
  *
  * cvd         - child whose parent is the dRAID vdev (asserted)
  * logical_rs  - logical range; both ends must be sector aligned (asserted)
  * physical_rs - out: physical range on cvd; zero length when cvd is not a
  *               member of the group, or when the start is not group aligned
  * remain_rs   - out: the part of logical_rs that was not translated by this
  *               call, so that the caller can continue from there
  *
  * At most one redundancy group is translated per call.
  */
 static void
 vdev_draid_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
 
 	vdev_draid_config_t *vdc = raidvd->vdev_tsd;
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 
 	/* Make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 
 	uint64_t logical_start = logical_rs->rs_start;
 	uint64_t logical_end = logical_rs->rs_end;
 
 	/*
 	 * Unaligned ranges must be skipped. All metaslabs are correctly
 	 * aligned so this should not happen, but this case is handled in
 	 * case it's needed by future callers.
 	 */
 	uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
 	if (astart != logical_start) {
 		/* Empty physical range; resume at the next aligned offset. */
 		physical_rs->rs_start = logical_start;
 		physical_rs->rs_end = logical_start;
 		remain_rs->rs_start = MIN(astart, logical_end);
 		remain_rs->rs_end = logical_end;
 		return;
 	}
 
 	/*
 	 * Unlike with mirrors and raidz a dRAID logical range can map
 	 * to multiple non-contiguous physical ranges. This is handled by
 	 * limiting the size of the logical range to a single group and
 	 * setting the remain argument such that it describes the remaining
 	 * unmapped logical range. This is stricter than absolutely
 	 * necessary but helps simplify the logic below.
 	 */
 	uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
 	uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
 	if (logical_end > nextstart)
 		logical_end = nextstart;
 
 	/* Find the starting offset for each vdev in the group */
 	uint64_t perm, groupstart;
 	uint64_t start = vdev_draid_logical_to_physical(raidvd,
 	    logical_start, &perm, &groupstart);
 	uint64_t end = start;
 
 	uint8_t *base;
 	uint64_t iter, id;
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	/*
 	 * Check if the passed child falls within the group.  If it does
 	 * update the start and end to reflect the physical range.
 	 * Otherwise, leave them unmodified which will result in an empty
 	 * (zero-length) physical range being returned.
 	 */
 	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
 		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
 
 		if (c == 0 && i != 0) {
 			/* the group wrapped, increment the start */
 			start += VDEV_DRAID_ROWHEIGHT;
 			end = start;
 		}
 
 		id = vdev_draid_permute_id(vdc, base, iter, c);
 		if (id == cvd->vdev_id) {
 			/*
 			 * b_size is the logical length in sectors.  Each
 			 * child holds ceil(b_size / groupwidth) of them.
 			 */
 			uint64_t b_size = (logical_end >> ashift) -
 			    (logical_start >> ashift);
 			ASSERT3U(b_size, >, 0);
 			end = start + ((((b_size - 1) /
 			    vdc->vdc_groupwidth) + 1) << ashift);
 			break;
 		}
 	}
 	physical_rs->rs_start = start;
 	physical_rs->rs_end = end;
 
 	/*
 	 * Only top-level vdevs are allowed to set remain_rs because
 	 * when .vdev_op_xlate() is called for their children the full
 	 * logical range is not provided by vdev_xlate().
 	 */
 	remain_rs->rs_start = logical_end;
 	remain_rs->rs_end = logical_rs->rs_end;
 
 	ASSERT3U(physical_rs->rs_start, <=, logical_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_end - logical_start);
 }
 
 /*
  * Add dRAID specific fields to the config nvlist.
  *
  * Writes the parity, data, spare and group counts held in the vdev's
  * vdev_draid_config_t into nv.  These are the same four keys that
  * vdev_draid_init() reads back.
  */
 static void
 vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups);
 }
 
 /*
  * Initialize private dRAID specific fields from the nvlist.
  *
  * spa - unused
  * nv  - vdev config; must provide ZPOOL_CONFIG_DRAID_NDATA,
  *       ZPOOL_CONFIG_NPARITY, ZPOOL_CONFIG_CHILDREN,
  *       ZPOOL_CONFIG_DRAID_NSPARES and ZPOOL_CONFIG_DRAID_NGROUPS
  * tsd - out: newly allocated vdev_draid_config_t on success
  *
  * Returns 0 on success.  Any missing or out-of-range value, or a failure
  * to look up or generate the permutation map, returns EINVAL and leaves
  * *tsd untouched.
  */
 static int
 vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	(void) spa;
 	uint64_t ndata, nparity, nspares, ngroups;
 	int error;
 
 	/* Every redundancy group needs at least one data column. */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) ||
 	    ndata == 0) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
 	    nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	uint_t children;
 	nvlist_t **child;
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0 || children == 0 ||
 	    children > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Bound ndata by the child count before it takes part in any sum.
 	 * ndata comes straight from the nvlist; without this an enormous
 	 * value makes ndata + nparity wrap around to a small number which
 	 * then slips past the checks below and yields a zero group width.
 	 * Checking ndata alone first keeps the addition from wrapping.
 	 */
 	if (ndata > children || ndata + nparity > children)
 		return (SET_ERROR(EINVAL));
 
 	/* The subtraction cannot underflow given the check above. */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
 	    nspares > 100 || nspares > (children - (ndata + nparity))) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
 	    ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Validate the minimum number of children exist per group for the
 	 * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
 	 */
 	if (children < (ndata + nparity + nspares))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Create the dRAID configuration using the pool nvlist configuration
 	 * and the fixed mapping for the correct number of children.
 	 */
 	vdev_draid_config_t *vdc;
 	const draid_map_t *map;
 
 	error = vdev_draid_lookup_map(children, &map);
 	if (error)
 		return (SET_ERROR(EINVAL));
 
 	vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
 	vdc->vdc_ndata = ndata;
 	vdc->vdc_nparity = nparity;
 	vdc->vdc_nspares = nspares;
 	vdc->vdc_children = children;
 	vdc->vdc_ngroups = ngroups;
 	vdc->vdc_nperms = map->dm_nperms;
 
 	error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
 	if (error) {
 		/* Nothing else has been allocated yet; drop the config. */
 		kmem_free(vdc, sizeof (*vdc));
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Derived constants.
 	 */
 	vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
 	vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
 	vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
 	vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
 	    vdc->vdc_ndisks;
 
 	ASSERT3U(vdc->vdc_groupwidth, >=, 2);
 	ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
 	ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
 	ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
 	ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
 	ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
 	    vdc->vdc_ndisks, ==, 0);
 
 	*tsd = vdc;
 
 	return (0);
 }
 
 /*
  * Release the dRAID private state: first the permutation table, sized as
  * one byte per child per permutation, then the config structure itself.
  */
 static void
 vdev_draid_fini(vdev_t *vd)
 {
 	vdev_draid_config_t *vdc = vd->vdev_tsd;
 	uint64_t perms_size = sizeof (uint8_t) *
 	    vdc->vdc_children * vdc->vdc_nperms;
 
 	vmem_free(vdc->vdc_perms, perms_size);
 	kmem_free(vdc, sizeof (*vdc));
 }
 
 /*
  * Return the parity level recorded in the dRAID config of vd.
  */
 static uint64_t
 vdev_draid_nparity(vdev_t *vd)
 {
 	return (((vdev_draid_config_t *)vd->vdev_tsd)->vdc_nparity);
 }
 
 /*
  * Return the disk count recorded in the dRAID config of vd (set by
  * vdev_draid_init() to the child count minus the distributed spares).
  */
 static uint64_t
 vdev_draid_ndisks(vdev_t *vd)
 {
 	return (((vdev_draid_config_t *)vd->vdev_tsd)->vdc_ndisks);
 }
 
 /*
  * Operations vector for top-level dRAID vdevs.  The hold, rele and remap
  * callbacks are intentionally left NULL, and the type is not a leaf.
  */
 vdev_ops_t vdev_draid_ops = {
 	.vdev_op_init = vdev_draid_init,
 	.vdev_op_fini = vdev_draid_fini,
 	.vdev_op_open = vdev_draid_open,
 	.vdev_op_close = vdev_draid_close,
-	.vdev_op_asize = vdev_draid_asize,
+	.vdev_op_psize_to_asize = vdev_draid_psize_to_asize,
+	.vdev_op_asize_to_psize = vdev_draid_asize_to_psize,
 	.vdev_op_min_asize = vdev_draid_min_asize,
 	.vdev_op_min_alloc = vdev_draid_min_alloc,
 	.vdev_op_io_start = vdev_draid_io_start,
 	.vdev_op_io_done = vdev_draid_io_done,
 	.vdev_op_state_change = vdev_draid_state_change,
 	.vdev_op_need_resilver = vdev_draid_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_draid_xlate,
 	.vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
 	.vdev_op_metaslab_init = vdev_draid_metaslab_init,
 	.vdev_op_config_generate = vdev_draid_config_generate,
 	.vdev_op_nparity = vdev_draid_nparity,
 	.vdev_op_ndisks = vdev_draid_ndisks,
 	.vdev_op_type = VDEV_TYPE_DRAID,
 	.vdev_op_leaf = B_FALSE,
 };
 
 
 /*
  * A dRAID distributed spare is a virtual leaf vdev which is included in the
  * parent dRAID configuration.  The last N columns of the dRAID permutation
  * table are used to determine on which dRAID children a specific offset
  * should be written.  These spare leaf vdevs can only be used to replace
  * faulted children in the same dRAID configuration.
  */
 
 /*
  * Distributed spare state.  All fields are set when the distributed spare is
  * first opened and are immutable.
  *
  * NOTE(review): vds_draid_vdev is compared against NULL to detect a closed
  * spare and is reset to NULL by vdev_draid_spare_close(), so "immutable"
  * appears to hold only while the spare is open -- confirm.
  */
 typedef struct {
 	vdev_t *vds_draid_vdev;		/* top-level parent dRAID vdev */
 	uint64_t vds_top_guid;		/* top-level parent dRAID guid */
 	uint64_t vds_spare_id;		/* spare id (0 - vdc->vdc_nspares-1) */
 } vdev_draid_spare_t;
 
 /*
  * Returns the parent dRAID vdev to which the distributed spare belongs.
  * This may be safely called even when the vdev is not open.
  */
 vdev_t *
 vdev_draid_spare_get_parent(vdev_t *vd)
 {
 	vdev_draid_spare_t *spare = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	/* Use the parent pointer cached by open when it is available. */
 	vdev_t *parent = spare->vds_draid_vdev;
 
 	if (parent == NULL) {
 		/* Not cached: resolve the parent by guid from the root. */
 		parent = vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
 		    spare->vds_top_guid);
 	}
 
 	return (parent);
 }
 
 /*
  * A dRAID spare is active when it's the child of a vdev using the
  * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.
  */
 static boolean_t
 vdev_draid_spare_is_active(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	/* Without a parent the spare cannot be in use. */
 	if (parent == NULL)
 		return (B_FALSE);
 
 	/* In use when parented by a spare, replacing or dRAID vdev. */
 	if (parent->vdev_ops == &vdev_spare_ops)
 		return (B_TRUE);
 	if (parent->vdev_ops == &vdev_replacing_ops)
 		return (B_TRUE);
 	if (parent->vdev_ops == &vdev_draid_ops)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Given a dRAID distributed spare vdev, returns the physical child vdev
  * on which the provided offset resides.  This may involve recursing through
  * multiple layers of distributed spares.  Note that offset is relative to
  * this vdev.
  */
 vdev_t *
 vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	/* The vdev is closed */
 	if (vds->vds_draid_vdev == NULL)
 		return (NULL);
 
 	vdev_t *tvd = vds->vds_draid_vdev;
 	vdev_draid_config_t *vdc = tvd->vdev_tsd;
 
 	ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
 	ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
 
 	uint8_t *base;
 	uint64_t iter;
 	/* The permutation is selected by offset / vdc_devslicesz. */
 	uint64_t perm = physical_offset / vdc->vdc_devslicesz;
 
 	vdev_draid_get_perm(vdc, perm, &base, &iter);
 
 	/*
 	 * Spares take columns counting back from the last child: spare
 	 * id 0 uses column (vdev_children - 1), id 1 the one before it.
 	 */
 	uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
 	    (tvd->vdev_children - 1) - vds->vds_spare_id);
 	vdev_t *cvd = tvd->vdev_child[cid];
 
 	/*
 	 * The selected child may itself be a distributed spare; recurse
 	 * with the same offset until a non-spare child is found.
 	 */
 	if (cvd->vdev_ops == &vdev_draid_spare_ops)
 		return (vdev_draid_spare_get_child(cvd, physical_offset));
 
 	return (cvd);
 }
 
 static void
 vdev_draid_spare_close(vdev_t *vd)
 {
 	/* Clearing the cached parent pointer marks the spare closed. */
 	((vdev_draid_spare_t *)vd->vdev_tsd)->vds_draid_vdev = NULL;
 }
 
 /*
  * Opening a dRAID spare device is done by looking up the associated dRAID
  * top-level vdev guid from the spare configuration.
  */
 static int
 vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
 	uint64_t asize, max_asize;
 
 	/* Locate the parent dRAID vdev by the guid recorded at init time. */
 	vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
 	if (tvd == NULL) {
 		/*
 		 * When spa_vdev_add() is labeling new spares the
 		 * associated dRAID is not attached to the root vdev
 		 * nor does this spare have a parent.  Simulate a valid
 		 * device in order to allow the label to be initialized
 		 * and the distributed spare added to the configuration.
 		 */
 		if (vd->vdev_parent == NULL) {
 			*psize = *max_psize = SPA_MINDEVSIZE;
 			*logical_ashift = *physical_ashift = ASHIFT_MIN;
 			return (0);
 		}
 
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* The guid must name a dRAID vdev that has its configuration. */
 	vdev_draid_config_t *vdc = tvd->vdev_tsd;
 	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* The spare id must be one the parent actually provides. */
 	if (vds->vds_spare_id >= vdc->vdc_nspares)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here
 	 * because the caller may be vdev_draid_open() in which case the
 	 * values are stale as they haven't yet been updated by vdev_open().
 	 * To avoid this always recalculate the dRAID asize and max_asize.
 	 */
 	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
 	    logical_ashift, physical_ashift);
 
 	/* Reported sizes are the computed asize plus both label regions. */
 	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 
 	/* Cache the parent; a non-NULL pointer marks the spare as open. */
 	vds->vds_draid_vdev = tvd;
 
 	return (0);
 }
 
 /*
  * Completed distributed spare IO.  Store the result in the parent zio
  * as if it had performed the operation itself.  Only the first error is
  * preserved if there are multiple errors.
  */
 static void
 vdev_draid_spare_child_done(zio_t *zio)
 {
 	zio_t *parent = zio->io_private;
 
 	/*
 	 * Children that are not writeable still receive IOs so that their
 	 * DTLs stay accurate, but their errors must not be propagated in
 	 * to the distributed spare's DTL; vdev_draid_need_resilver() will
 	 * consult the relevant DTL when resilvering.  For writeable
 	 * children record the error only if the parent has none yet.
 	 */
 	if (vdev_writeable(zio->io_vd) && parent->io_error == 0)
 		parent->io_error = zio->io_error;
 }
 
 /*
  * Returns a valid label nvlist for the distributed spare vdev.  This is
  * used to bypass the IO pipeline to avoid the complexity of constructing
  * a complete label with valid checksum to return when read.
  */
 nvlist_t *
 vdev_draid_read_config_spare(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_aux_vdev_t *aux = &spa->spa_spares;
 	uint64_t guid = vd->vdev_guid;
 	uint64_t state;
 
 	/*
 	 * Prefer the guid of the distributed spare in the pool's spare
 	 * list whose path matches this vdev; otherwise keep our own guid.
 	 */
 	for (int i = 0; i < aux->sav_count; i++) {
 		vdev_t *svd = aux->sav_vdevs[i];
 
 		if (svd->vdev_ops != &vdev_draid_spare_ops)
 			continue;
 		if (strcmp(svd->vdev_path, vd->vdev_path) != 0)
 			continue;
 
 		guid = svd->vdev_guid;
 		break;
 	}
 
 	if (vdev_draid_spare_is_active(vd))
 		state = POOL_STATE_ACTIVE;
 	else
 		state = POOL_STATE_SPARE;
 
 	/* Build the label; entries are added in a fixed order. */
 	nvlist_t *nv = fnvlist_alloc();
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
 	fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE, state);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
 
 	return (nv);
 }
 
 /*
  * Handle any flush requested of the distributed spare. All children must be
  * flushed.
  */
 static int
 vdev_draid_spare_flush(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	int c = 0;
 
 	/* Issue one child IO per child vdev, mirroring the parent zio. */
 	while (c < vd->vdev_children) {
 		zio_t *cio = zio_vdev_child_io(zio, NULL,
 		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
 		    zio->io_size, zio->io_type, zio->io_priority, 0,
 		    vdev_draid_spare_child_done, zio);
 
 		zio_nowait(cio);
 		c++;
 	}
 
 	/* Child errors are reported through the completion callback. */
 	return (0);
 }
 
 /*
  * Initiate an IO to the distributed spare.  For normal IOs this entails using
  * the zio->io_offset and permutation table to calculate which child dRAID vdev
  * is responsible for the data.  Then passing along the zio to that child to
  * perform the actual IO.  The label ranges are not stored on disk and require
  * some special handling which is described below.
  */
 static void
 vdev_draid_spare_io_start(zio_t *zio)
 {
 	vdev_t *cvd = NULL, *vd = zio->io_vd;
 	vdev_draid_spare_t *vds = vd->vdev_tsd;
 	/*
 	 * Offset with the leading label region subtracted; this is the
 	 * value used both to pick the child and as the child IO offset.
 	 */
 	uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;
 
 	/*
 	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
 	 * Nothing to be done here but return failure.
 	 */
 	if (vds == NULL) {
 		zio->io_error = ENXIO;
 		zio_interrupt(zio);
 		return;
 	}
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_FLUSH:
 		/* vdev_draid_spare_flush() currently always returns 0. */
 		zio->io_error = vdev_draid_spare_flush(zio);
 		break;
 
 	case ZIO_TYPE_WRITE:
 		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
 			/*
 			 * Accept probe IOs and config writers to simulate the
 			 * existence of an on disk label.  vdev_label_sync(),
 			 * vdev_uberblock_sync() and vdev_copy_uberblocks()
 			 * skip the distributed spares.  This only leaves
 			 * vdev_label_init() which is allowed to succeed to
 			 * avoid adding special cases to the function.
 			 */
 			if (zio->io_flags & ZIO_FLAG_PROBE ||
 			    zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
 				zio->io_error = 0;
 			} else {
 				zio->io_error = SET_ERROR(EIO);
 			}
 		} else {
 			cvd = vdev_draid_spare_get_child(vd, offset);
 
 			/* NULL means the spare is closed. */
 			if (cvd == NULL) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 				    offset, zio->io_abd, zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_draid_spare_child_done, zio));
 			}
 		}
 		break;
 
 	case ZIO_TYPE_READ:
 		if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
 			/*
 			 * Accept probe IOs to simulate the existence of a
 			 * label.  vdev_label_read_config() bypasses the
 			 * pipeline to read the label configuration and
 			 * vdev_uberblock_load() skips distributed spares
 			 * when attempting to locate the best uberblock.
 			 */
 			if (zio->io_flags & ZIO_FLAG_PROBE) {
 				zio->io_error = 0;
 			} else {
 				zio->io_error = SET_ERROR(EIO);
 			}
 		} else {
 			cvd = vdev_draid_spare_get_child(vd, offset);
 
 			/* Unlike writes, reads also need a readable child. */
 			if (cvd == NULL || !vdev_readable(cvd)) {
 				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 				    offset, zio->io_abd, zio->io_size,
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_draid_spare_child_done, zio));
 			}
 		}
 		break;
 
 	case ZIO_TYPE_TRIM:
 		/* The vdev label ranges are never trimmed */
 		ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));
 
 		cvd = vdev_draid_spare_get_child(vd, offset);
 
 		/* Only forward the TRIM when the child advertises support. */
 		if (cvd == NULL || !cvd->vdev_has_trim) {
 			zio->io_error = SET_ERROR(ENXIO);
 		} else {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    offset, zio->io_abd, zio->io_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_draid_spare_child_done, zio));
 		}
 		break;
 
 	default:
 		zio->io_error = SET_ERROR(ENOTSUP);
 		break;
 	}
 
 	/* Every path above except the vds == NULL early return ends here. */
 	zio_execute(zio);
 }
 
 /*
  * IO completion hook for the distributed spare.  Intentionally a no-op;
  * child results are recorded by vdev_draid_spare_child_done().
  */
 static void
 vdev_draid_spare_io_done(zio_t *zio)
 {
 	/* Mark the parameter as deliberately unused. */
 	(void) zio;
 }
 
 /*
  * Lookup the full spare config in spa->spa_spares.sav_config and
  * return the top_guid and spare_id for the named spare.
  */
 static int
 vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
     uint64_t *spare_idp)
 {
 	nvlist_t *sav_config = spa->spa_spares.sav_config;
 	nvlist_t **spares;
 	uint_t nspares;
 	const char *spare_name;
 
 	/* No spare configuration, or no spares array within it. */
 	if (sav_config == NULL)
 		return (SET_ERROR(ENOENT));
 
 	if (nvlist_lookup_nvlist_array(sav_config, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) != 0)
 		return (SET_ERROR(ENOENT));
 
 	/* The spare being looked up is identified by its path. */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name) != 0)
 		return (SET_ERROR(EINVAL));
 
 	for (int i = 0; i < nspares; i++) {
 		nvlist_t *candidate = spares[i];
 		const char *type, *path;
 		uint64_t top_guid, spare_id;
 
 		/* Only distributed spares are of interest. */
 		if (nvlist_lookup_string(candidate, ZPOOL_CONFIG_TYPE,
 		    &type) != 0)
 			continue;
 		if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
 			continue;
 
 		/* The path must match the requested spare. */
 		if (nvlist_lookup_string(candidate, ZPOOL_CONFIG_PATH,
 		    &path) != 0)
 			continue;
 		if (strcmp(path, spare_name) != 0)
 			continue;
 
 		/* Matching spare: both values must be present. */
 		if (nvlist_lookup_uint64(candidate, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    nvlist_lookup_uint64(candidate, ZPOOL_CONFIG_SPARE_ID,
 		    &spare_id) != 0)
 			return (SET_ERROR(EINVAL));
 
 		*top_guidp = top_guid;
 		*spare_idp = spare_id;
 		return (0);
 	}
 
 	return (SET_ERROR(ENOENT));
 }
 
 /*
  * Initialize private dRAID spare specific fields from the nvlist.
  */
 static int
 vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	vdev_draid_spare_t *spare;
 	uint64_t top_guid = 0;
 	uint64_t spare_id;
 
 	/*
 	 * Normally the top_guid and spare_id come from the list of spares
 	 * stored in the spa.  When creating a new pool or adding vdevs that
 	 * list is not yet populated, so fall back to the passed config.
 	 */
 	int found = (vdev_draid_spare_lookup(spa, nv, &top_guid,
 	    &spare_id) == 0);
 
 	if (!found) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
 		    &top_guid) != 0 ||
 		    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
 		    &spare_id) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/* The spare starts out closed: no cached parent pointer yet. */
 	spare = kmem_alloc(sizeof (*spare), KM_SLEEP);
 	spare->vds_top_guid = top_guid;
 	spare->vds_spare_id = spare_id;
 	spare->vds_draid_vdev = NULL;
 
 	*tsd = spare;
 
 	return (0);
 }
 
 static void
 vdev_draid_spare_fini(vdev_t *vd)
 {
 	/* Release the state allocated by vdev_draid_spare_init(). */
 	vdev_draid_spare_t *spare = vd->vdev_tsd;
 
 	kmem_free(spare, sizeof (*spare));
 }
 
 static void
 vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
 
 	/* Emit the two values vdev_draid_spare_init() reads back. */
 	const vdev_draid_spare_t *spare = vd->vdev_tsd;
 	uint64_t top_guid = spare->vds_top_guid;
 	uint64_t spare_id = spare->vds_spare_id;
 
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, top_guid);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, spare_id);
 }
 
 /*
  * Operations vector for dRAID distributed spares.  The spare is registered
  * as a leaf vdev (vdev_op_leaf = B_TRUE) and uses the default size
  * conversion and translation handlers; operations it does not implement
  * are left NULL.
  */
 vdev_ops_t vdev_draid_spare_ops = {
 	.vdev_op_init = vdev_draid_spare_init,
 	.vdev_op_fini = vdev_draid_spare_fini,
 	.vdev_op_open = vdev_draid_spare_open,
 	.vdev_op_close = vdev_draid_spare_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_draid_spare_io_start,
 	.vdev_op_io_done = vdev_draid_spare_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_draid_spare_config_generate,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DRAID_SPARE,
 	.vdev_op_leaf = B_TRUE,
 };
diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c
index a2cb6f9b9ef9..f457669bc809 100644
--- a/module/zfs/vdev_file.c
+++ b/module/zfs/vdev_file.c
@@ -1,371 +1,372 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2025, Klara, Inc.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
 #include <sys/stat.h>
 
 /*
  * Virtual device vector for files.
  */
 
 /*
  * Task queue to which all file vdev IO (read/write, fsync, deallocate) is
  * dispatched.  Created by vdev_file_init(), destroyed by vdev_file_fini().
  */
 static taskq_t *vdev_file_taskq;
 
 /*
  * By default, the logical/physical ashift for file vdevs is set to
  * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9)
  * blocksizes. Users may opt to change one or both of these for testing
  * or performance reasons. Care should be taken as these values will
  * impact the vdev_ashift setting which can only be set at vdev creation
  * time.
  */
 /* Reported to the caller by vdev_file_open() on every open or reopen. */
 static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
 static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
 
 void
 vdev_file_init(void)
 {
 	/* Use at least 16 threads, or one per boot CPU if there are more. */
 	int nthreads = MAX(boot_ncpus, 16);
 
 	vdev_file_taskq = taskq_create("z_vdev_file", nthreads, minclsyspri,
 	    boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
 
 	/* File vdev IO cannot be issued without the task queue. */
 	VERIFY(vdev_file_taskq);
 }
 
 /*
  * Tear down the task queue created by vdev_file_init().
  */
 void
 vdev_file_fini(void)
 {
 	taskq_destroy(vdev_file_taskq);
 }
 
 /*
  * Hold operation for file vdevs.  Nothing is actually held; the function
  * only asserts that the vdev has a path.
  */
 static void
 vdev_file_hold(vdev_t *vd)
 {
 	ASSERT3P(vd->vdev_path, !=, NULL);
 }
 
 /*
  * Release operation for file vdevs.  Counterpart of vdev_file_hold();
  * it likewise only asserts that the vdev has a path.
  */
 static void
 vdev_file_rele(vdev_t *vd)
 {
 	ASSERT3P(vd->vdev_path, !=, NULL);
 }
 
 static mode_t
 vdev_file_open_mode(spa_mode_t spa_mode)
 {
 	int want_read = (spa_mode & SPA_MODE_READ) != 0;
 	int want_write = (spa_mode & SPA_MODE_WRITE) != 0;
 	mode_t mode = 0;
 
 	/* Translate the pool's access mode into open(2) access flags. */
 	if (want_read)
 		mode = want_write ? O_RDWR : O_RDONLY;
 	else if (want_write)
 		mode = O_WRONLY;
 
 	/* O_LARGEFILE is requested regardless of the access mode. */
 	return (mode | O_LARGEFILE);
 }
 
 /*
  * Open (or reopen) a file-backed vdev.
  *
  * On success stores the file size in both *psize and *max_psize, stores
  * the configured ashift values in *logical_ashift and *physical_ashift,
  * and returns 0.  On failure returns an error code; most failure paths
  * also record a reason in vd->vdev_stat.vs_aux.
  */
 static int
 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_file_t *vf;
 	zfs_file_t *fp;
 	zfs_file_attr_t zfa;
 	int error;
 
 	/*
 	 * Rotational optimizations only make sense on block devices.
 	 */
 	vd->vdev_nonrot = B_TRUE;
 
 	/*
 	 * Allow TRIM on file based vdevs.  This may not always be supported,
 	 * since it depends on your kernel version and underlying filesystem
 	 * type but it is always safe to attempt.
 	 */
 	vd->vdev_has_trim = B_TRUE;
 
 	/*
 	 * Disable secure TRIM on file based vdevs.  There is no way to
 	 * request this behavior from the underlying filesystem.
 	 */
 	vd->vdev_has_securetrim = B_FALSE;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it's not currently open.  Otherwise,
 	 * just update the physical size of the device.
 	 */
 	if (vd->vdev_tsd != NULL) {
 		ASSERT(vd->vdev_reopening);
 		vf = vd->vdev_tsd;
 		goto skip_open;
 	}
 
 	/*
 	 * The state is attached to the vdev before the file is opened, so
 	 * the error returns below leave it in vd->vdev_tsd.
 	 * NOTE(review): presumably released by a later vdev_file_close().
 	 */
 	vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
 
 	/*
 	 * We always open the files from the root of the global zone, even if
 	 * we're in a local zone.  If the user has gotten to this point, the
 	 * administrator has already decided that the pool should be available
 	 * to local zone users, so the underlying devices should be as well.
 	 */
 	ASSERT3P(vd->vdev_path, !=, NULL);
 	ASSERT3S(vd->vdev_path[0], ==, '/');
 
 	error = zfs_file_open(vd->vdev_path,
 	    vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	vf->vf_file = fp;
 
 #ifdef _KERNEL
 	/*
 	 * Make sure it's a regular file.
 	 */
 	if (zfs_file_getattr(fp, &zfa)) {
 		/* Note: this path does not set vs_aux. */
 		return (SET_ERROR(ENODEV));
 	}
 	if (!S_ISREG(zfa.zfa_mode)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (SET_ERROR(ENODEV));
 	}
 #endif
 
 skip_open:
 
 	/* Both the open and reopen paths refresh the size from the file. */
 	error =  zfs_file_getattr(vf->vf_file, &zfa);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	*max_psize = *psize = zfa.zfa_size;
 	*logical_ashift = vdev_file_logical_ashift;
 	*physical_ashift = vdev_file_physical_ashift;
 
 	return (0);
 }
 
 static void
 vdev_file_close(vdev_t *vd)
 {
 	vdev_file_t *state = vd->vdev_tsd;
 
 	/* A reopen keeps the existing file handle. */
 	if (vd->vdev_reopening)
 		return;
 
 	/* Nothing to release if the vdev was never opened. */
 	if (state == NULL)
 		return;
 
 	if (state->vf_file != NULL)
 		(void) zfs_file_close(state->vf_file);
 
 	vd->vdev_delayed_close = B_FALSE;
 	kmem_free(state, sizeof (vdev_file_t));
 
 	/* A NULL tsd makes the next vdev_file_open() do a full open. */
 	vd->vdev_tsd = NULL;
 }
 
 /*
  * Task queue callback that performs a read or write zio against the
  * backing file and then completes the zio via zio_delay_interrupt().
  * arg is the zio_t being serviced.
  */
 static void
 vdev_file_io_strategy(void *arg)
 {
 	zio_t *zio = (zio_t *)arg;
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 	void *buf;
 	ssize_t resid;
 	loff_t off;
 	ssize_t size;
 	int err;
 
 	off = zio->io_offset;
 	size = zio->io_size;
 	resid = 0;
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 	if (zio->io_type == ZIO_TYPE_READ) {
 		/*
 		 * NOTE(review): presumably the borrowed buffer is filled by
 		 * the read and copied back into the abd on return - confirm
 		 * against the abd API.
 		 */
 		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
 		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
 		abd_return_buf_copy(zio->io_abd, buf, size);
 	} else {
 		/*
 		 * NOTE(review): presumably the borrowed buffer holds a copy
 		 * of the abd contents to write - confirm against the abd API.
 		 */
 		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
 		abd_return_buf(zio->io_abd, buf, size);
 	}
 	zio->io_error = err;
 	/* A non-zero residual with no error is reported as ENOSPC. */
 	if (resid != 0 && zio->io_error == 0)
 		zio->io_error = SET_ERROR(ENOSPC);
 
 	zio_delay_interrupt(zio);
 }
 
 static void
 vdev_file_io_fsync(void *arg)
 {
 	zio_t *zio = arg;
 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
 
 	/* Sync the backing file and report the result through the zio. */
 	int err = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
 
 	zio->io_error = err;
 	zio_interrupt(zio);
 }
 
 static void
 vdev_file_io_deallocate(void *arg)
 {
 	zio_t *zio = arg;
 	vdev_file_t *vf = zio->io_vd->vdev_tsd;
 
 	/* Deallocate the zio's byte range in the backing file. */
 	int err = zfs_file_deallocate(vf->vf_file, zio->io_offset,
 	    zio->io_size);
 
 	zio->io_error = err;
 	zio_interrupt(zio);
 }
 
 static void
 vdev_file_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_FLUSH:
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 		} else if (zfs_nocacheflush) {
 			/* Flushes are disabled: complete without syncing. */
 			zio_interrupt(zio);
 		} else {
 			VERIFY3U(taskq_dispatch(vdev_file_taskq,
 			    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
 			    TASKQID_INVALID);
 		}
 		return;
 
 	case ZIO_TYPE_TRIM:
 		ASSERT3U(zio->io_size, !=, 0);
 
 		VERIFY3U(taskq_dispatch(vdev_file_taskq,
 		    vdev_file_io_deallocate, zio, TQ_SLEEP), !=,
 		    TASKQID_INVALID);
 		return;
 
 	default:
 		/* Reads and writes are handled below. */
 		break;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
 	VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
 	    TQ_SLEEP), !=, TASKQID_INVALID);
 }
 
 /*
  * IO completion hook for file vdevs.  Intentionally a no-op.
  */
 static void
 vdev_file_io_done(zio_t *zio)
 {
 	/* Mark the parameter as deliberately unused. */
 	(void) zio;
 }
 
 /*
  * Operations vector for file-backed vdevs.  Registered as a leaf vdev
  * using the default size conversion and translation handlers; operations
  * it does not implement are left NULL.
  */
 vdev_ops_t vdev_file_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 /*
  * From userland we access disks just like files.
  */
 #ifndef _KERNEL
 
 /*
  * Userland operations vector for disk vdevs.  It reuses the file vdev
  * handlers, so both size conversion callbacks are populated exactly as in
  * vdev_file_ops; a member omitted from the initializer would be NULL.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 #endif
 
 /*
  * Tunables for the ashift values reported by vdev_file_open().
  * NOTE(review): presumably the macro joins the "vdev_file_" prefix and the
  * name to bind vdev_file_logical_ashift / vdev_file_physical_ashift -
  * confirm against the ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW,
 	"Logical ashift for file-based devices");
 ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW,
 	"Physical ashift for file-based devices");
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index 30d7340f7f4b..b58b87d1fcc7 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -1,1924 +1,1925 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * This file and its contents are supplied under the terms of the
  * Common Development and Distribution License ("CDDL"), version 1.0.
  * You may only use this file in accordance with the terms of version
  * 1.0 of the CDDL.
  *
  * A full copy of the text of the CDDL should have accompanied this
  * source.  A copy of the CDDL is also available via the Internet at
  * http://www.illumos.org/license/CDDL.
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/metaslab.h>
 #include <sys/dmu.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zap.h>
 #include <sys/abd.h>
 #include <sys/zthr.h>
 #include <sys/fm/fs/zfs.h>
 
 /*
  * An indirect vdev corresponds to a vdev that has been removed.  Since
  * we cannot rewrite block pointers of snapshots, etc., we keep a
  * mapping from old location on the removed device to the new location
  * on another device in the pool and use this mapping whenever we need
  * to access the DVA.  Unfortunately, this mapping did not respect
  * logical block boundaries when it was first created, and so a DVA on
  * this indirect vdev may be "split" into multiple sections that each
  * map to a different location.  As a consequence, not all DVAs can be
  * translated to an equivalent new DVA.  Instead we must provide a
  * "vdev_remap" operation that executes a callback on each contiguous
  * segment of the new location.  This function is used in multiple ways:
  *
  *  - I/Os to this vdev use the callback to determine where the
  *    data is now located, and issue child I/Os for each segment's new
  *    location.
  *
  *  - frees and claims to this vdev use the callback to free or claim
  *    each mapped segment.  (Note that we don't actually need to claim
  *    log blocks on indirect vdevs, because we don't allocate to
  *    removing vdevs.  However, zdb uses zio_claim() for its leak
  *    detection.)
  */
 
 /*
  * "Big theory statement" for how we mark blocks obsolete.
  *
  * When a block on an indirect vdev is freed or remapped, a section of
  * that vdev's mapping may no longer be referenced (aka "obsolete").  We
  * keep track of how much of each mapping entry is obsolete.  When
  * an entry becomes completely obsolete, we can remove it, thus reducing
  * the memory used by the mapping.  The complete picture of obsolescence
  * is given by the following data structures, described below:
  *  - the entry-specific obsolete count
  *  - the vdev-specific obsolete spacemap
  *  - the pool-specific obsolete bpobj
  *
  * == On disk data structures used ==
  *
  * We track the obsolete space for the pool using several objects.  Each
  * of these objects is created on demand and freed when no longer
  * needed, and is assumed to be empty if it does not exist.
  * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
  *
  *  - Each vic_mapping_object (associated with an indirect vdev) can
  *    have a vimp_counts_object.  This is an array of uint32_t's
  *    with the same number of entries as the vic_mapping_object.  When
  *    the mapping is condensed, entries from the vic_obsolete_sm_object
  *    (see below) are folded into the counts.  Therefore, each
  *    obsolete_counts entry tells us the number of bytes in the
  *    corresponding mapping entry that were not referenced when the
  *    mapping was last condensed.
  *
  *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
  *    This is a space map containing an alloc entry for every DVA that
  *    has been obsoleted since the last time this indirect vdev was
  *    condensed.  We use this object in order to improve performance
  *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
  *    offset of the vimp_counts_object, we only need to append an entry
  *    to the end of this object.  When a DVA becomes obsolete, it is
  *    added to the obsolete space map.  This happens when the DVA is
  *    freed, remapped and not referenced by a snapshot, or the last
  *    snapshot referencing it is destroyed.
  *
  *  - Each dataset can have a ds_remap_deadlist object.  This is a
  *    deadlist object containing all blocks that were remapped in this
  *    dataset but referenced in a previous snapshot.  Blocks can *only*
  *    appear on this list if they were remapped (dsl_dataset_block_remapped);
  *    blocks that were killed in a head dataset are put on the normal
  *    ds_deadlist and marked obsolete when they are freed.
  *
  *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
  *    in the pool that need to be marked obsolete.  When a snapshot is
  *    destroyed, we move some of the ds_remap_deadlist to the obsolete
  *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
  *    asynchronously process the obsolete bpobj, moving its entries to
  *    the specific vdevs' obsolete space maps.
  *
  * == Summary of how we mark blocks as obsolete ==
  *
  * - When freeing a block: if any DVA is on an indirect vdev, append to
  *   vic_obsolete_sm_object.
  * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
  *   references; otherwise append to vic_obsolete_sm_object).
  * - When freeing a snapshot: move parts of ds_remap_deadlist to
  *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
  * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
  *   individual vdev's vic_obsolete_sm_object.
  */
 
 /*
  * "Big theory statement" for how we condense indirect vdevs.
  *
  * Condensing an indirect vdev's mapping is the process of determining
  * the precise counts of obsolete space for each mapping entry (by
  * integrating the obsolete spacemap into the obsolete counts) and
  * writing out a new mapping that contains only referenced entries.
  *
  * We condense a vdev when we expect the mapping to shrink (see
  * vdev_indirect_should_condense()), but only perform one condense at a
  * time to limit the memory usage.  In addition, we use a separate
  * open-context thread (spa_condense_indirect_thread) to incrementally
  * create the new mapping object in a way that minimizes the impact on
  * the rest of the system.
  *
  * == Generating a new mapping ==
  *
  * To generate a new mapping, we follow these steps:
  *
  * 1. Save the old obsolete space map and create a new mapping object
  *    (see spa_condense_indirect_start_sync()).  This initializes the
  *    spa_condensing_indirect_phys with the "previous obsolete space map",
  *    which is now read only.  Newly obsolete DVAs will be added to a
  *    new (initially empty) obsolete space map, and will not be
  *    considered as part of this condense operation.
  *
  * 2. Construct in memory the precise counts of obsolete space for each
  *    mapping entry, by incorporating the obsolete space map into the
  *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
  *
  * 3. Iterate through each mapping entry, writing to the new mapping any
  *    entries that are not completely obsolete (i.e. which don't have
  *    obsolete count == mapping length).  (See
  *    spa_condense_indirect_generate_new_mapping().)
  *
  * 4. Destroy the old mapping object and switch over to the new one
  *    (spa_condense_indirect_complete_sync).
  *
  * == Restarting from failure ==
  *
  * To restart the condense when we import/open the pool, we must start
  * at the 2nd step above: reconstruct the precise counts in memory,
  * based on the space map + counts.  Then in the 3rd step, we start
  * iterating where we left off: at vimp_max_offset of the new mapping
  * object.
  */
 
 /*
  * Master switch for condensing indirect vdevs.  When this is zero,
  * vdev_indirect_should_condense() always returns B_FALSE.
  */
 static int zfs_condense_indirect_vdevs_enable = B_TRUE;
 
 /*
  * Condense if at least this percent of the bytes in the mapping is
  * obsolete.  With the default of 25%, the amount of space mapped
  * will be reduced to 1% of its original size after at most 16
  * condenses.  Higher values will condense less often (causing less
  * i/o); lower values will reduce the mapping size more quickly.
  */
 static uint_t zfs_condense_indirect_obsolete_pct = 25;
 
 /*
  * Condense if the obsolete space map takes up more than this amount of
  * space on disk (logically).  This limits the amount of disk space
  * consumed by the obsolete space map; the default of 1GB is small enough
  * that we typically don't mind "wasting" it.
  */
 static uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
 
 /*
  * Don't bother condensing if the mapping uses less than this amount of
  * memory.  The default of 128KB is considered a "trivial" amount of
  * memory and not worth reducing.
  */
 static uint64_t zfs_condense_min_mapping_bytes = 128 * 1024;
 
 /*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a condense (which might otherwise
  * complete too quickly).  If used to reduce the performance impact of
  * condensing in production, a maximum value of 1 should be sufficient.
  */
 static uint_t zfs_condense_indirect_commit_entry_delay_ms = 0;
 
 /*
  * If an indirect split block contains more than this many possible unique
  * combinations when being reconstructed, consider it too computationally
  * expensive to check them all. Instead, try at most 100 randomly-selected
  * combinations each time the block is accessed.  This allows all segment
  * copies to participate fairly in the reconstruction when all combinations
  * cannot be checked and prevents repeated use of one bad copy.
  */
 uint_t zfs_reconstruct_indirect_combinations_max = 4096;
 
 /*
  * Enable to simulate damaged segments and validate reconstruction.  This
  * is intentionally not exposed as a module parameter.
  */
 unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
 
 /*
  * The indirect_child_t represents the vdev that we will read from, when we
  * need to read all copies of the data (e.g. for scrub or reconstruction).
  * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
  * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
  * ic_vdev is a child of the mirror.
  */
 typedef struct indirect_child {
 	/* Data buffer; abd_free()d by vdev_indirect_map_free() if non-NULL. */
 	abd_t *ic_data;
 	vdev_t *ic_vdev;
 
 	/*
 	 * ic_duplicate is NULL when the ic_data contents are unique.  When
 	 * ic_data is determined to be a duplicate, ic_duplicate references
 	 * the primary child.
 	 */
 	struct indirect_child *ic_duplicate;
 	list_node_t ic_node; /* node on is_unique_child */
 	int ic_error; /* set when a child does not contain the data */
 } indirect_child_t;
 
 /*
  * The indirect_split_t represents one mapped segment of an i/o to the
  * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
  * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
  * For split blocks, there will be several of these.
  */
 typedef struct indirect_split {
 	list_node_t is_node; /* link on iv_splits */
 
 	/*
 	 * is_split_offset is the offset into the i/o.
 	 * This is the sum of the previous splits' is_size's.
 	 */
 	uint64_t is_split_offset;
 
 	vdev_t *is_vdev; /* top-level vdev */
 	uint64_t is_target_offset; /* offset on is_vdev */
 	uint64_t is_size; /* size of this mapped segment */
 	int is_children; /* number of entries in is_child[] */
 	int is_unique_children; /* number of entries in is_unique_child */
 	list_t is_unique_child; /* indirect_child_t's, linked via ic_node */
 
 	/*
 	 * is_good_child is the child that we are currently using to
 	 * attempt reconstruction.
 	 */
 	indirect_child_t *is_good_child;
 
 	/*
 	 * Flexible array of is_children entries.  The allocation size is
 	 * offsetof(indirect_split_t, is_child[is_children]); see
 	 * vdev_indirect_map_free().
 	 */
 	indirect_child_t is_child[];
 } indirect_split_t;
 
 /*
  * The indirect_vsd_t is associated with each i/o to the indirect vdev.
  * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
  */
 typedef struct indirect_vsd {
 	/*
 	 * NOTE(review): the code that sets and reads the five fields below
 	 * is outside this excerpt.  Their names suggest they track whether
 	 * the block is split and the progress of reconstruction attempts;
 	 * confirm against the reconstruction code before relying on that.
 	 */
 	boolean_t iv_split_block;
 	boolean_t iv_reconstruct;
 	uint64_t iv_unique_combinations;
 	uint64_t iv_attempts;
 	uint64_t iv_attempts_max;
 
 	/* Emptied, entry by entry, in vdev_indirect_map_free(). */
 	list_t iv_splits; /* list of indirect_split_t's */
 } indirect_vsd_t;
 
 /*
  * Free the indirect_vsd_t hanging off zio->io_vsd: every split on
  * iv_splits, each split's child data buffers, and finally the vsd itself.
  */
 static void
 vdev_indirect_map_free(zio_t *zio)
 {
 	indirect_vsd_t *vsd = zio->io_vsd;
 
 	for (indirect_split_t *split = list_remove_head(&vsd->iv_splits);
 	    split != NULL; split = list_remove_head(&vsd->iv_splits)) {
 		/* Release the data buffer of every child that has one. */
 		for (int i = 0; i < split->is_children; i++) {
 			abd_t *data = split->is_child[i].ic_data;
 
 			if (data != NULL)
 				abd_free(data);
 		}
 
 		/* Unlink every unique child, then tear the list down. */
 		while (list_remove_head(&split->is_unique_child) != NULL)
 			continue;
 		list_destroy(&split->is_unique_child);
 
 		/* The split was sized to hold is_children trailing entries. */
 		kmem_free(split,
 		    offsetof(indirect_split_t, is_child[split->is_children]));
 	}
 
 	kmem_free(vsd, sizeof (*vsd));
 }
 
 /*
  * VSD operations for i/os to an indirect vdev: vsd_free releases the
  * indirect_vsd_t through vdev_indirect_map_free().
  */
 static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
 	.vsd_free = vdev_indirect_map_free,
 };
 
 /*
  * Record that `size' bytes starting at `offset' on the given vdev are
  * obsolete.  The range is added to the vdev's in-core obsolete segments
  * and the vdev is dirtied for the currently syncing txg.  Nothing is
  * recorded when SPA_FEATURE_OBSOLETE_COUNTS is not enabled.
  *
  * The vdev must be indirect or in the middle of removal, and `offset'
  * must be covered by an entry of its indirect mapping.
  */
 void
 vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
 	ASSERT(size > 0);
 	VERIFY(vdev_indirect_mapping_entry_for_offset(
 	    vd->vdev_indirect_mapping, offset) != NULL);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return;
 
 	/* vdev_obsolete_lock protects the obsolete segment tree. */
 	mutex_enter(&vd->vdev_obsolete_lock);
 	zfs_range_tree_add(vd->vdev_obsolete_segments, offset, size);
 	mutex_exit(&vd->vdev_obsolete_lock);
 
 	vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
 }
 
 /*
  * DMU-facing wrapper around vdev_indirect_mark_obsolete().  The DMU does
  * not know about vdev_t's, so it identifies the DVA to mark obsolete as
  * vdev_id:offset:size and this function resolves the top-level vdev.
  * Must be called with a syncing tx.
  */
 void
 spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	vdev_t *top = vdev_lookup_top(spa, vdev_id);
 
 	/* Only indirect vdevs can be remapped by the DMU. */
 	ASSERT3P(top->vdev_ops, ==, &vdev_indirect_ops);
 
 	vdev_indirect_mark_obsolete(top, offset, size);
 }
 
 /*
  * Allocate the in-core condensing state for the pool and open the new
  * mapping object named by spa_condensing_indirect_phys.  The caller owns
  * the result and releases it with spa_condensing_indirect_destroy().
  */
 static spa_condensing_indirect_t *
 spa_condensing_indirect_create(spa_t *spa)
 {
 	spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
 
 	/* One pending-entry list per txg slot (indexed by txg & TXG_MASK). */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		list_t *pending = &sci->sci_new_mapping_entries[t];
 
 		list_create(pending, sizeof (vdev_indirect_mapping_entry_t),
 		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
 	}
 
 	sci->sci_new_mapping = vdev_indirect_mapping_open(spa->spa_meta_objset,
 	    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
 
 	return (sci);
 }
 
 /*
  * Release the in-core condensing state: destroy the per-txg entry lists,
  * close the new mapping if the sci still holds one, and free the sci.
  */
 static void
 spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
 {
 	vdev_indirect_mapping_t *new_mapping = sci->sci_new_mapping;
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		list_destroy(&sci->sci_new_mapping_entries[t]);
 	}
 
 	/* NULL once spa_condense_indirect_complete_sync() has taken it. */
 	if (new_mapping != NULL) {
 		vdev_indirect_mapping_close(new_mapping);
 	}
 
 	kmem_free(sci, sizeof (*sci));
 }
 
 /*
  * Decide whether the given vdev's indirect mapping should be condensed.
  * Must be called from syncing context.
  *
  * Returns B_TRUE when either
  *  - at least zfs_condense_indirect_obsolete_pct percent of the mapped
  *    bytes are obsolete and the mapping is larger than
  *    zfs_condense_min_mapping_bytes, or
  *  - the obsolete space map is at least zfs_condense_max_obsolete_bytes
  *    long.
  * Returns B_FALSE otherwise, and also whenever condensing is disabled,
  * another condense is in progress, the pool is shutting down, the vdev
  * is not an indirect vdev, or the vdev has no open obsolete space map.
  */
 boolean_t
 vdev_indirect_should_condense(vdev_t *vd)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
 
 	if (!zfs_condense_indirect_vdevs_enable)
 		return (B_FALSE);
 
 	/*
 	 * We can only condense one indirect vdev at a time.
 	 */
 	if (spa->spa_condensing_indirect != NULL)
 		return (B_FALSE);
 
 	if (spa_shutting_down(spa))
 		return (B_FALSE);
 
 	/*
 	 * The mapping object size must not change while we are
 	 * condensing, so we can only condense indirect vdevs
 	 * (not vdevs that are still in the middle of being removed).
 	 */
 	if (vd->vdev_ops != &vdev_indirect_ops)
 		return (B_FALSE);
 
 	/*
 	 * If nothing new has been marked obsolete, there is no
 	 * point in condensing.
 	 */
 	uint64_t obsolete_sm_obj __maybe_unused;
 	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
 	if (vd->vdev_obsolete_sm == NULL) {
 		ASSERT0(obsolete_sm_obj);
 		return (B_FALSE);
 	}
 
 	ASSERT(vd->vdev_obsolete_sm != NULL);
 
 	ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm));
 
 	uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
 	uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
 	uint64_t mapping_size = vdev_indirect_mapping_size(vim);
 	uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
 
 	ASSERT3U(bytes_obsolete, <=, bytes_mapped);
 
 	/*
 	 * Compute the obsolete percentage once.  An empty mapping is
 	 * treated as 0% obsolete so that we never divide by zero.
 	 */
 	uint64_t obsolete_pct = (bytes_mapped == 0) ? 0 :
 	    bytes_obsolete * 100 / bytes_mapped;
 
 	/*
 	 * If a high percentage of the bytes that are mapped have become
 	 * obsolete, condense (unless the mapping is already small enough).
 	 * This has a good chance of reducing the amount of memory used
 	 * by the mapping.
 	 */
 	if (obsolete_pct >= zfs_condense_indirect_obsolete_pct &&
 	    mapping_size > zfs_condense_min_mapping_bytes) {
 		zfs_dbgmsg("should condense vdev %llu because obsolete "
 		    "spacemap covers %d%% of %lluMB mapping",
 		    (u_longlong_t)vd->vdev_id,
 		    (int)obsolete_pct,
 		    (u_longlong_t)bytes_mapped / 1024 / 1024);
 		return (B_TRUE);
 	}
 
 	/*
 	 * If the obsolete space map takes up too much space on disk,
 	 * condense in order to free up this disk space.
 	 */
 	if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
 		zfs_dbgmsg("should condense vdev %llu because obsolete sm "
 		    "length %lluMB >= max size %lluMB",
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)obsolete_sm_size / 1024 / 1024,
 		    (u_longlong_t)zfs_condense_max_obsolete_bytes /
 		    1024 / 1024);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * This sync task completes (finishes) a condense, deleting the old
  * mapping and replacing it with the new one.
  *
  * arg is the spa_condensing_indirect_t being completed; it must be
  * spa->spa_condensing_indirect and it is destroyed before returning.
  */
 static void
 spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_condensing_indirect_t *sci = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
 	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
 	objset_t *mos = spa->spa_meta_objset;
 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
 	/*
 	 * Capture both entry counts now: the old mapping is closed and
 	 * sci_new_mapping is cleared below, before the counts are logged.
 	 */
 	uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
 	uint64_t new_count =
 	    vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
 	/* Every queued entry must already have been written out. */
 	for (int i = 0; i < TXG_SIZE; i++) {
 		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
 	}
 	ASSERT(vic->vic_mapping_object != 0);
 	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
 	ASSERT(scip->scip_next_mapping_object != 0);
 	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 
 	/*
 	 * Reset vdev_indirect_mapping to refer to the new object.
 	 */
 	rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
 	vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 	vd->vdev_indirect_mapping = sci->sci_new_mapping;
 	rw_exit(&vd->vdev_indirect_rwlock);
 
 	/*
 	 * The vdev now owns the new mapping; clear the sci's pointer so
 	 * that spa_condensing_indirect_destroy() does not close it.
 	 */
 	sci->sci_new_mapping = NULL;
 	/* Free the old mapping object and record the new one in its place. */
 	vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
 	vic->vic_mapping_object = scip->scip_next_mapping_object;
 	scip->scip_next_mapping_object = 0;
 
 	/* Free the previous obsolete space map and drop its feature count. */
 	space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
 	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 	scip->scip_prev_obsolete_sm_object = 0;
 
 	scip->scip_vdev = 0;
 
 	/* Remove the on-disk condensing record and the in-core state. */
 	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CONDENSING_INDIRECT, tx));
 	spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
 	spa->spa_condensing_indirect = NULL;
 
 	zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
 	    "new mapping object %llu has %llu entries "
 	    "(was %llu entries)",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
 	    (u_longlong_t)vic->vic_mapping_object,
 	    (u_longlong_t)new_count, (u_longlong_t)old_count);
 
 	vdev_config_dirty(spa->spa_root_vdev);
 }
 
 /*
  * Sync task that appends the entries queued for this tx's txg to the new
  * mapping object.  arg is the pool's spa_condensing_indirect_t.  The
  * per-txg list is expected to be empty once the entries have been added.
  */
 static void
 spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_condensing_indirect_t *sci = arg;
 	spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa;
 	list_t *pending =
 	    &sci->sci_new_mapping_entries[dmu_tx_get_txg(tx) & TXG_MASK];
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT3P(sci, ==, spa->spa_condensing_indirect);
 
 	vdev_indirect_mapping_add_entries(sci->sci_new_mapping, pending, tx);
 	ASSERT(list_is_empty(pending));
 }
 
 /*
  * Open-context function to add one entry to the new mapping.  The new
  * entry will be remembered and written from syncing context.
  *
  * vimep is the mapping entry to carry over (it is copied) and count is
  * its obsolete count, which must be less than the entry's asize.
  */
 static void
 spa_condense_indirect_commit_entry(spa_t *spa,
     vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
 {
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 
 	ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
 	/* Slot of the per-txg list for the txg this tx was assigned to. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 
 	/*
 	 * If we are the first entry committed this txg, kick off the sync
 	 * task to write to the MOS on our behalf.
 	 */
 	/* This check relies on the list still being empty; insert after. */
 	if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx),
 		    spa_condense_indirect_commit_sync, sci, tx);
 	}
 
 	/* Queue an in-core copy of the entry for the sync task to write. */
 	vdev_indirect_mapping_entry_t *vime =
 	    kmem_alloc(sizeof (*vime), KM_SLEEP);
 	vime->vime_mapping = *vimep;
 	vime->vime_obsolete_count = count;
 	list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Walk the vdev's current mapping from start_index and commit every
  * entry that is not completely obsolete to the new mapping.
  * obsolete_counts[] holds the obsolete byte count of each mapping entry.
  * The walk stops early when the zthr has been cancelled.
  */
 static void
 spa_condense_indirect_generate_new_mapping(vdev_t *vd,
     uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	uint64_t nentries = vdev_indirect_mapping_num_entries(vim);
 
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
 
 	zfs_dbgmsg("starting condense of vdev %llu from index %llu",
 	    (u_longlong_t)vd->vdev_id,
 	    (u_longlong_t)start_index);
 
 	for (uint64_t idx = start_index; idx < nentries; idx++) {
 		if (zthr_iscancelled(zthr)) {
 			zfs_dbgmsg("pausing condense of vdev %llu "
 			    "at index %llu", (u_longlong_t)vd->vdev_id,
 			    (u_longlong_t)idx);
 			break;
 		}
 
 		vdev_indirect_mapping_entry_phys_t *vimep =
 		    &vim->vim_entries[idx];
 		uint64_t asize = DVA_GET_ASIZE(&vimep->vimep_dst);
 
 		ASSERT3U(obsolete_counts[idx], <=, asize);
 
 		/* Completely obsolete entries are left out. */
 		if (obsolete_counts[idx] >= asize)
 			continue;
 
 		spa_condense_indirect_commit_entry(spa, vimep,
 		    obsolete_counts[idx]);
 
 		/*
 		 * Optional pause after each committed entry, which may be
 		 * requested for testing, debugging, or performance reasons.
 		 */
 		hrtime_t wake_at = gethrtime() + MSEC2NSEC(
 		    zfs_condense_indirect_commit_entry_delay_ms);
 		zfs_sleep_until(wake_at);
 	}
 }
 
 /*
  * zthr check function: there is work to do whenever the pool has
  * in-core condensing state.
  */
 static boolean_t
 spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 
 	(void) zthr;
 
 	if (spa->spa_condensing_indirect != NULL)
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * zthr body that performs a condense.  It rebuilds the in-memory obsolete
  * counts from the old mapping and the previous obsolete space map,
  * commits the remaining entries to the new mapping (resuming after any
  * entries already written), and then runs the completing sync task unless
  * the zthr was cancelled.
  */
 static void
 spa_condense_indirect_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_t *vd;
 
 	ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
 	ASSERT3P(vd, !=, NULL);
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 	uint32_t *counts;
 	uint64_t start_index;
 	vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
 	space_map_t *prev_obsolete_sm = NULL;
 
 	ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
 	ASSERT(scip->scip_next_mapping_object != 0);
 	ASSERT(scip->scip_prev_obsolete_sm_object != 0);
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
 		/*
 		 * The list must start out empty in order for the
 		 * _commit_sync() sync task to be properly registered
 		 * on the first call to _commit_entry(); so it's wise
 		 * to double check and ensure we actually are starting
 		 * with empty lists.
 		 */
 		ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
 	}
 
 	/*
 	 * Build the per-entry obsolete counts: start from the counts loaded
 	 * for the old mapping, then fold in the previous obsolete space map.
 	 */
 	VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
 	    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
 	counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
 	if (prev_obsolete_sm != NULL) {
 		vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
 		    counts, prev_obsolete_sm);
 	}
 	space_map_close(prev_obsolete_sm);
 
 	/*
 	 * Generate new mapping.  Determine what index to continue from
 	 * based on the max offset that we've already written in the
 	 * new mapping.
 	 */
 	uint64_t max_offset =
 	    vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
 	if (max_offset == 0) {
 		/* We haven't written anything to the new mapping yet. */
 		start_index = 0;
 	} else {
 		/*
 		 * Pick up from where we left off.
 		 * _entry_for_offset_or_next() returns a pointer into the
 		 * vim_entries array.  If max_offset is greater than any of
 		 * the mappings contained in the table, NULL will be
 		 * returned, and that indicates we've exhausted our
 		 * iteration of the old_mapping.
 		 */
 
 		vdev_indirect_mapping_entry_phys_t *entry =
 		    vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
 		    max_offset);
 
 		if (entry == NULL) {
 			/*
 			 * We've already written the whole new mapping.
 			 * This special value will cause us to skip the
 			 * generate_new_mapping step and just do the sync
 			 * task to complete the condense.
 			 */
 			start_index = UINT64_MAX;
 		} else {
 			/* Convert the entry pointer back into an index. */
 			start_index = entry - old_mapping->vim_entries;
 			ASSERT3U(start_index, <,
 			    vdev_indirect_mapping_num_entries(old_mapping));
 		}
 	}
 
 	spa_condense_indirect_generate_new_mapping(vd, counts,
 	    start_index, zthr);
 
 	vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
 
 	/*
 	 * If the zthr has received a cancellation signal while running
 	 * in generate_new_mapping() or at any point after that, then bail
 	 * early. We don't want to complete the condense if the spa is
 	 * shutting down.
 	 */
 	if (zthr_iscancelled(zthr))
 		return;
 
 	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 	    spa_condense_indirect_complete_sync, sci, 0,
 	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
 /*
  * Sync task to begin the condensing process.
  *
  * Records the vdev, a newly allocated mapping object, and the vdev's
  * current obsolete space map in spa_condensing_indirect_phys, detaches
  * that space map from the vdev, persists the record in the pool
  * directory, creates the in-core condensing state, and wakes the
  * condense zthr.
  */
 void
 spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 
 	/* No other condense may be recorded as in progress. */
 	ASSERT0(scip->scip_next_mapping_object);
 	ASSERT0(scip->scip_prev_obsolete_sm_object);
 	ASSERT0(scip->scip_vdev);
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
 	ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
 
 	uint64_t obsolete_sm_obj;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
 	ASSERT3U(obsolete_sm_obj, !=, 0);
 
 	scip->scip_vdev = vd->vdev_id;
 	scip->scip_next_mapping_object =
 	    vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
 
 	/* The current obsolete space map becomes the "previous" one. */
 	scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
 
 	/*
 	 * We don't need to allocate a new space map object, since
 	 * vdev_indirect_sync_obsolete will allocate one when needed.
 	 */
 	space_map_close(vd->vdev_obsolete_sm);
 	vd->vdev_obsolete_sm = NULL;
 	VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
 
 	/* Persist scip as an array of uint64_t's in the pool directory. */
 	VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
 	    sizeof (*scip) / sizeof (uint64_t), scip, tx));
 
 	ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
 	spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
 
 	zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
 	    "posm=%llu nm=%llu",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx),
 	    (u_longlong_t)scip->scip_prev_obsolete_sm_object,
 	    (u_longlong_t)scip->scip_next_mapping_object);
 
 	zthr_wakeup(spa->spa_condense_zthr);
 }
 
 /*
  * Sync to the given vdev's obsolete space map any segments that are no longer
  * referenced as of the given txg.
  *
  * If the obsolete space map doesn't exist yet, create and open it.
  *
  * The caller must only call this when vdev_obsolete_segments is non-empty;
  * the segments are vacated once they have been written.
  */
 void
 vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
 
 	ASSERT3U(vic->vic_mapping_object, !=, 0);
 	ASSERT(zfs_range_tree_space(vd->vdev_obsolete_segments) > 0);
 	ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
 	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
 
 	uint64_t obsolete_sm_object;
 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object == 0) {
 		/* No space map yet: allocate one and record it in the ZAP. */
 		obsolete_sm_object = space_map_alloc(spa->spa_meta_objset,
 		    zfs_vdev_standard_sm_blksz, tx);
 
 		ASSERT(vd->vdev_top_zap != 0);
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
 		    sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
 		/* Debug-only read-back of the entry that was just added. */
 		ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 		ASSERT3U(obsolete_sm_object, !=, 0);
 
 		/* Each obsolete space map object counts toward the feature. */
 		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
 		VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
 		    spa->spa_meta_objset, obsolete_sm_object,
 		    0, vd->vdev_asize, 0));
 	}
 
 	ASSERT(vd->vdev_obsolete_sm != NULL);
 	ASSERT3U(obsolete_sm_object, ==,
 	    space_map_object(vd->vdev_obsolete_sm));
 
 	/* Write the obsolete ranges as SM_ALLOC entries, then empty the tree. */
 	space_map_write(vd->vdev_obsolete_sm,
 	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
 	zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
 }
 
 /*
  * Load the on-disk condensing record, if any, into
  * spa_condensing_indirect_phys.  When a record exists and the pool is
  * writeable, the in-core condensing state is created as well.
  *
  * Returns 0 on success or when no record exists (ENOENT); any other
  * lookup error is returned to the caller.
  */
 int
 spa_condense_init(spa_t *spa)
 {
 	spa_condensing_indirect_phys_t *scip =
 	    &spa->spa_condensing_indirect_phys;
 
 	int error = zap_lookup(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
 	    sizeof (*scip) / sizeof (uint64_t), scip);
 
 	/* No record on disk: nothing to set up. */
 	if (error == ENOENT)
 		return (0);
 
 	if (error != 0)
 		return (error);
 
 	if (spa_writeable(spa)) {
 		spa->spa_condensing_indirect =
 		    spa_condensing_indirect_create(spa);
 	}
 
 	return (0);
 }
 
 /*
  * Tear down the pool's in-core condensing state, if there is any.
  */
 void
 spa_condense_fini(spa_t *spa)
 {
 	spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
 
 	if (sci == NULL)
 		return;
 
 	spa_condensing_indirect_destroy(sci);
 	spa->spa_condensing_indirect = NULL;
 }
 
 /*
  * Create the pool's condense zthr ("z_indirect_condense"), which uses
  * spa_condense_indirect_thread_check() as its check function and
  * spa_condense_indirect_thread() as its work function.  The zthr must
  * not already exist.
  */
 void
 spa_start_indirect_condensing_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_condense_zthr, ==, NULL);
 	spa->spa_condense_zthr = zthr_create("z_indirect_condense",
 	    spa_condense_indirect_thread_check,
 	    spa_condense_indirect_thread, spa, minclsyspri);
 }
 
 /*
  * Look up the obsolete space map object in the vdev's top-level ZAP.
  * On success *sm_obj holds the object number, or zero when the vdev has
  * no top-level ZAP or the ZAP has no such entry.  Any other lookup error
  * is returned to the caller.
  */
 int
 vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj)
 {
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	/* A vdev without a top-level ZAP is treated like a missing entry. */
 	int error = ENOENT;
 
 	if (vd->vdev_top_zap != 0) {
 		error = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
 		    sizeof (uint64_t), 1, sm_obj);
 	}
 
 	if (error != ENOENT)
 		return (error);
 
 	*sm_obj = 0;
 	return (0);
 }
 
 /*
  * Reads the "obsolete counts are precise" flag from the vdev's top-level
  * ZAP.  On success *are_precise reflects whether the stored value is
  * nonzero; it is B_FALSE when the vdev has no top-level ZAP or the ZAP has
  * no such entry.  Any other lookup error is returned to the caller and
  * *are_precise is left untouched.
  */
 int
 vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise)
 {
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		uint64_t stored = 0;
 		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE,
 		    sizeof (stored), 1, &stored);
 
 		if (err == 0) {
 			*are_precise = (stored != 0);
 			return (0);
 		}
 		if (err != ENOENT)
 			return (err);
 	}
 
 	/* No top-level ZAP, or the flag was never written. */
 	*are_precise = B_FALSE;
 	return (0);
 }
 
 /*
  * Close hook for indirect vdevs.  There is nothing to do; the cast only
  * marks the parameter as intentionally unused.
  */
 static void
 vdev_indirect_close(vdev_t *vd)
 {
 	(void) vd;
 }
 
 /*
  * Open hook for indirect vdevs.  Reports the size as vdev_asize plus the
  * label areas, and the ashift values recorded on the vdev.  Always succeeds.
  */
 static int
 vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	uint64_t total = vd->vdev_asize +
 	    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 
 	*max_psize = total;
 	*psize = total;
 	*logical_ashift = vd->vdev_ashift;
 	*physical_ashift = vd->vdev_physical_ashift;
 
 	return (0);
 }
 
 /*
  * One pending unit of work for vdev_indirect_remap(): an extent on a vdev
  * that still has to be translated through that vdev's indirect mapping.
  */
 typedef struct remap_segment {
 	/* vdev the extent lives on */
 	vdev_t *rs_vd;
 	/* current offset of the extent on rs_vd; advanced as it is remapped */
 	uint64_t rs_offset;
 	/* bytes of the extent not yet remapped */
 	uint64_t rs_asize;
 	/* value handed to the remap callback as its first argument */
 	uint64_t rs_split_offset;
 	/* linkage on the stack kept by vdev_indirect_remap() */
 	list_node_t rs_node;
 } remap_segment_t;
 
 /*
  * Allocate a remap_segment_t and fill in everything except its list
  * linkage.  The caller owns the returned segment.
  */
 static remap_segment_t *
 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
 {
 	remap_segment_t *seg;
 
 	seg = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
 
 	seg->rs_split_offset = split_offset;
 	seg->rs_asize = asize;
 	seg->rs_offset = offset;
 	seg->rs_vd = vd;
 
 	return (seg);
 }
 
 /*
  * Copies the run of indirect mapping entries that covers the extent
  * [offset, offset + asize) on the given indirect vdev into a newly
  * allocated array and returns it.  The number of entries copied is stored
  * in *copied_entries.
  *
  * The caller must hold vdev_indirect_rwlock (asserted below) so that
  * condensing cannot change the mapping while it is being walked and copied.
  *
  * The returned array is allocated with kmem_alloc(); the caller must free
  * it with a size of *copied_entries entries.
  */
 static vdev_indirect_mapping_entry_phys_t *
 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
     uint64_t asize, uint64_t *copied_entries)
 {
 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
 	vdev_indirect_mapping_entry_phys_t *first, *cur, *copy;
 	uint64_t count = 0;
 	size_t nbytes;
 
 	ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
 
 	first = vdev_indirect_mapping_entry_for_offset(vim, offset);
 	ASSERT3P(first, !=, NULL);
 
 	/* Count how many consecutive entries the extent spans. */
 	for (cur = first; asize > 0; cur++, count++) {
 		uint64_t src = DVA_MAPPING_GET_SRC_OFFSET(cur);
 		uint64_t len = DVA_GET_ASIZE(&cur->vimep_dst);
 
 		ASSERT3U(offset, >=, src);
 		ASSERT3U(offset, <, src + len);
 
 		/* Portion of the extent that falls inside this entry. */
 		uint64_t consumed = MIN(asize, len - (offset - src));
 
 		offset += consumed;
 		asize -= consumed;
 	}
 
 	nbytes = count * sizeof (*first);
 	copy = kmem_alloc(nbytes, KM_SLEEP);
 	memcpy(copy, first, nbytes);
 	*copied_entries = count;
 
 	return (copy);
 }
 
 /*
  * Goes through the relevant indirect mappings until it hits a concrete vdev
  * and issues the callback. On the way to the concrete vdev, if any other
  * indirect vdevs are encountered, then the callback will also be called on
  * each of those indirect vdevs. For example, if the segment is mapped to
  * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
  * mapped to segment B on concrete vdev 2, then the callback will be called on
  * both vdev 1 and vdev 2.
  *
  * While the callback passed to vdev_indirect_remap() is called on every vdev
  * the function encounters, certain callbacks only care about concrete vdevs.
  * These types of callbacks should return immediately and explicitly when they
  * are called on an indirect vdev.
  *
  * Because there is a possibility that a DVA section in the indirect device
  * has been split into multiple sections in our mapping, we keep track
  * of the relevant contiguous segments of the new location (remap_segment_t)
  * in a stack. This way we can call the callback for each of the new sections
  * created by a single section of the indirect device. Note though, that in
  * this scenario the callbacks in each split block won't occur in-order in
  * terms of offset, so callers should not make any assumptions about that.
  *
  * For callbacks that don't handle split blocks and immediately return when
  * they encounter them (as is the case for remap_blkptr_cb), the caller can
  * assume that its callback will be applied from the first indirect vdev
  * encountered to the last one and then the concrete vdev, in that order.
  */
 static void
 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
     void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
 {
 	list_t stack;
 	spa_t *spa = vd->vdev_spa;
 
 	list_create(&stack, sizeof (remap_segment_t),
 	    offsetof(remap_segment_t, rs_node));
 
 	/*
 	 * Start with one segment covering the whole requested extent.
 	 * Pieces that land on another indirect vdev are pushed on the stack
 	 * below and handled by later iterations, until the stack is empty.
 	 */
 	for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
 	    rs != NULL; rs = list_remove_head(&stack)) {
 		vdev_t *v = rs->rs_vd;
 		uint64_t num_entries = 0;
 
 		ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 		ASSERT(rs->rs_asize > 0);
 
 		/*
 		 * Note: As this function can be called from open context
 		 * (e.g. zio_read()), we need the following rwlock to
 		 * prevent the mapping from being changed by condensing.
 		 *
 		 * So we grab the lock and we make a copy of the entries
 		 * that are relevant to the extent that we are working on.
 		 * Once that is done, we drop the lock and iterate over
 		 * our copy of the mapping. Once we are done with the
 		 * remap segment and we free it, we also free our copy
 		 * of the indirect mapping entries that are relevant to it.
 		 *
 		 * This way we don't need to wait until the function is
 		 * finished with a segment, to condense it. In addition, we
 		 * don't need a recursive rwlock for the case that a call to
 		 * vdev_indirect_remap() needs to call itself (through the
 		 * codepath of its callback) for the same vdev in the middle
 		 * of its execution.
 		 */
 		rw_enter(&v->vdev_indirect_rwlock, RW_READER);
 		ASSERT3P(v->vdev_indirect_mapping, !=, NULL);
 
 		vdev_indirect_mapping_entry_phys_t *mapping =
 		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
 		    rs->rs_offset, rs->rs_asize, &num_entries);
 		ASSERT3P(mapping, !=, NULL);
 		ASSERT3U(num_entries, >, 0);
 		rw_exit(&v->vdev_indirect_rwlock);
 
 		for (uint64_t i = 0; i < num_entries; i++) {
 			/*
 			 * Note: the vdev_indirect_mapping can not change
 			 * while we are running.  It only changes while the
 			 * removal is in progress, and then only from syncing
 			 * context. While a removal is in progress, this
 			 * function is only called for frees, which also only
 			 * happen from syncing context.
 			 */
 			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
 
 			ASSERT3P(m, !=, NULL);
 			ASSERT3U(rs->rs_asize, >, 0);
 
 			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
 			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
 			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
 
 			ASSERT3U(rs->rs_offset, >=,
 			    DVA_MAPPING_GET_SRC_OFFSET(m));
 			ASSERT3U(rs->rs_offset, <,
 			    DVA_MAPPING_GET_SRC_OFFSET(m) + size);
 			ASSERT3U(dst_vdev, !=, v->vdev_id);
 
 			/*
 			 * inner_offset is how far into this entry the
 			 * segment starts; inner_size is how much of the
 			 * segment this entry covers.
 			 */
 			uint64_t inner_offset = rs->rs_offset -
 			    DVA_MAPPING_GET_SRC_OFFSET(m);
 			uint64_t inner_size =
 			    MIN(rs->rs_asize, size - inner_offset);
 
 			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
 			ASSERT3P(dst_v, !=, NULL);
 
 			/*
 			 * The destination is itself indirect: queue the
 			 * piece for another round of remapping.  The
 			 * callback below is still invoked for dst_v.
 			 */
 			if (dst_v->vdev_ops == &vdev_indirect_ops) {
 				list_insert_head(&stack,
 				    rs_alloc(dst_v, dst_offset + inner_offset,
 				    inner_size, rs->rs_split_offset));
 
 			}
 
 			if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
 			    IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
 				/*
 				 * Note: This clause exists solely for
 				 * testing purposes. We use it to ensure that
 				 * split blocks work and that the callbacks
 				 * using them yield the same result if issued
 				 * in reverse order.
 				 */
 				uint64_t inner_half = inner_size / 2;
 
 				func(rs->rs_split_offset + inner_half, dst_v,
 				    dst_offset + inner_offset + inner_half,
 				    inner_half, arg);
 
 				func(rs->rs_split_offset, dst_v,
 				    dst_offset + inner_offset,
 				    inner_half, arg);
 			} else {
 				func(rs->rs_split_offset, dst_v,
 				    dst_offset + inner_offset,
 				    inner_size, arg);
 			}
 
 			/* Advance past the part of the segment just handled. */
 			rs->rs_offset += inner_size;
 			rs->rs_asize -= inner_size;
 			rs->rs_split_offset += inner_size;
 		}
 		/* The copied entries must cover the segment exactly. */
 		VERIFY0(rs->rs_asize);
 
 		kmem_free(mapping, num_entries * sizeof (*mapping));
 		kmem_free(rs, sizeof (remap_segment_t));
 	}
 	list_destroy(&stack);
 }
 
 /*
  * Completion callback for a child zio whose io_private is the parent zio:
  * merge the child's error into the parent under the parent's io_lock, then
  * free the child's abd.
  */
 static void
 vdev_indirect_child_io_done(zio_t *zio)
 {
 	zio_t *parent = zio->io_private;
 
 	mutex_enter(&parent->io_lock);
 	parent->io_error = zio_worst_error(parent->io_error, zio->io_error);
 	mutex_exit(&parent->io_lock);
 
 	abd_free(zio->io_abd);
 }
 
 /*
  * Callback for vdev_indirect_remap().  For every segment that lands on a
  * non-indirect vdev it allocates an indirect_split_t describing the segment
  * and appends it to the zio's iv_splits list.  Segments reported for
  * indirect vdevs are ignored.
  */
 static void
 vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	zio_t *zio = arg;
 	indirect_vsd_t *iv = zio->io_vsd;
 	indirect_split_t *split;
 	int nchildren;
 
 	ASSERT3P(vd, !=, NULL);
 
 	if (vd->vdev_ops == &vdev_indirect_ops)
 		return;
 
 	/* One child slot per mirror child, otherwise a single slot. */
 	nchildren = (vd->vdev_ops == &vdev_mirror_ops) ?
 	    vd->vdev_children : 1;
 
 	split = kmem_zalloc(offsetof(indirect_split_t, is_child[nchildren]),
 	    KM_SLEEP);
 
 	split->is_vdev = vd;
 	split->is_target_offset = offset;
 	split->is_split_offset = split_offset;
 	split->is_size = size;
 	split->is_children = nchildren;
 	list_create(&split->is_unique_child, sizeof (indirect_child_t),
 	    offsetof(indirect_child_t, ic_node));
 
 	/*
 	 * Multiple copies of the data are only considered for *mirror*
 	 * vdevs.  "replacing" and "spare" vdevs use the same ops as mirror
 	 * but are treated as a single child, because there's only one
 	 * "good" copy under the replacing/spare.
 	 */
 	if (vd->vdev_ops != &vdev_mirror_ops) {
 		split->is_child[0].ic_vdev = vd;
 	} else {
 		for (int c = 0; c < nchildren; c++) {
 			split->is_child[c].ic_vdev = vd->vdev_child[c];
 			list_link_init(&split->is_child[c].ic_node);
 		}
 	}
 
 	list_insert_tail(&iv->iv_splits, split);
 }
 
 /*
  * Completion callback for the per-child reads issued by
  * vdev_indirect_read_all().  A failed read drops the child's buffer so
  * that a NULL ic_data marks "no data for this child".
  */
 static void
 vdev_indirect_read_split_done(zio_t *zio)
 {
 	indirect_child_t *child = zio->io_private;
 
 	if (zio->io_error == 0)
 		return;
 
 	abd_free(child->ic_data);
 	child->ic_data = NULL;
 }
 
 /*
  * Issue a read to every readable copy (mirror child) of every split, and
  * flag the zio for reconstruction.
  */
 static void
 vdev_indirect_read_all(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 	indirect_split_t *split;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	split = list_head(&iv->iv_splits);
 	while (split != NULL) {
 		for (int c = 0; c < split->is_children; c++) {
 			indirect_child_t *child = &split->is_child[c];
 
 			if (!vdev_readable(child->ic_vdev))
 				continue;
 
 			/*
 			 * A child whose DTL says the data is missing gets
 			 * ic_error set (consumed by vdev_indirect_repair()).
 			 * The read is issued anyway, since it may still
 			 * help reconstruct the split block.
 			 */
 			if (vdev_dtl_contains(child->ic_vdev, DTL_MISSING,
 			    zio->io_txg, 1))
 				child->ic_error = SET_ERROR(ESTALE);
 
 			child->ic_duplicate = NULL;
 			child->ic_data = abd_alloc_sametype(zio->io_abd,
 			    split->is_size);
 
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    child->ic_vdev, split->is_target_offset,
 			    child->ic_data, split->is_size, zio->io_type,
 			    zio->io_priority, 0,
 			    vdev_indirect_read_split_done, child));
 		}
 		split = list_next(&iv->iv_splits, split);
 	}
 
 	iv->iv_reconstruct = B_TRUE;
 }
 
 /*
  * I/O start hook for indirect vdevs.  Attaches a fresh indirect_vsd_t to
  * the zio, uses vdev_indirect_remap() to collect the splits the zio maps
  * to, and issues child zios: a single child carrying the BP when the first
  * split covers the whole zio, otherwise either reads of all copies
  * (scrub/resilver reads) or one child per split.  Finally the zio is handed
  * back to zio_execute().
  */
 static void
 vdev_indirect_io_start(zio_t *zio)
 {
 	spa_t *spa __maybe_unused = zio->io_spa;
 	indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
 	list_create(&iv->iv_splits,
 	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
 
 	zio->io_vsd = iv;
 	zio->io_vsd_ops = &vdev_indirect_vsd_ops;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 	if (zio->io_type != ZIO_TYPE_READ) {
 		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 		/*
 		 * Note: this code can handle other kinds of writes,
 		 * but we don't expect them.
 		 */
 		ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
 		    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
 	}
 
 	/* Populate iv->iv_splits with one entry per concrete segment. */
 	vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
 	    vdev_indirect_gather_splits, zio);
 
 	indirect_split_t *first = list_head(&iv->iv_splits);
 	ASSERT3P(first, !=, NULL);
 	if (first->is_size == zio->io_size) {
 		/*
 		 * This is not a split block; we are pointing to the entire
 		 * data, which will checksum the same as the original data.
 		 * Pass the BP down so that the child i/o can verify the
 		 * checksum, and try a different location if available
 		 * (e.g. on a mirror).
 		 *
 		 * While this special case could be handled the same as the
 		 * general (split block) case, doing it this way ensures
 		 * that the vast majority of blocks on indirect vdevs
 		 * (which are not split) are handled identically to blocks
 		 * on non-indirect vdevs.  This allows us to be less strict
 		 * about performance in the general (but rare) case.
 		 */
 		ASSERT0(first->is_split_offset);
 		ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    first->is_vdev, first->is_target_offset,
 		    abd_get_offset(zio->io_abd, 0),
 		    zio->io_size, zio->io_type, zio->io_priority, 0,
 		    vdev_indirect_child_io_done, zio));
 	} else {
 		iv->iv_split_block = B_TRUE;
 		if (zio->io_type == ZIO_TYPE_READ &&
 		    zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
 			/*
 			 * Read all copies.  Note that for simplicity,
 			 * we don't bother consulting the DTL in the
 			 * resilver case.
 			 */
 			vdev_indirect_read_all(zio);
 		} else {
 			/*
 			 * If this is a read zio, we read one copy of each
 			 * split segment, from the top-level vdev.  Since
 			 * we don't know the checksum of each split
 			 * individually, the child zio can't ensure that
 			 * we get the right data. E.g. if it's a mirror,
 			 * it will just read from a random (healthy) leaf
 			 * vdev. We have to verify the checksum in
 			 * vdev_indirect_io_done().
 			 *
 			 * For write zios, the vdev code will ensure we write
 			 * to all children.
 			 */
 			for (indirect_split_t *is = list_head(&iv->iv_splits);
 			    is != NULL; is = list_next(&iv->iv_splits, is)) {
 				zio_nowait(zio_vdev_child_io(zio, NULL,
 				    is->is_vdev, is->is_target_offset,
 				    abd_get_offset_size(zio->io_abd,
 				    is->is_split_offset, is->is_size),
 				    is->is_size, zio->io_type,
 				    zio->io_priority, 0,
 				    vdev_indirect_child_io_done, zio));
 			}
 
 		}
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Account for and report a checksum error on one child of a split.  The
  * child's data is posted as the bad copy and the split's is_good_child
  * data as the good copy.  Nothing is done for speculative zios.
  */
 static void
 vdev_indirect_checksum_error(zio_t *zio,
     indirect_split_t *is, indirect_child_t *ic)
 {
 	vdev_t *child_vd = ic->ic_vdev;
 	zio_bad_cksum_t zbc = { 0 };
 
 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	mutex_enter(&child_vd->vdev_stat_lock);
 	child_vd->vdev_stat.vs_checksum_errors++;
 	mutex_exit(&child_vd->vdev_stat_lock);
 
 	(void) zfs_ereport_post_checksum(zio->io_spa, child_vd, NULL, zio,
 	    is->is_target_offset, is->is_size,
 	    is->is_good_child->ic_data, ic->ic_data, &zbc);
 }
 
 /*
  * Issue repair i/os for any incorrect copies.  We do this by comparing
  * each split segment's correct data (is_good_child's ic_data) with each
  * other copy of the data.  If they differ, then we overwrite the bad data
  * with the good copy.  The DTL is checked in vdev_indirect_read_all() and
  * if a vdev is missing a copy of the data we set ic_error and the read is
  * performed. This provides the opportunity to reconstruct the split block
  * if at all possible. ic_error is checked here and if set it suppresses
  * incrementing the checksum counter. Aside from this DTLs are not checked,
  * which simplifies this code and also issues the optimal number of writes
  * (based on which copies actually read bad data, as opposed to which we
  * think might be wrong).  For the same reason, we always use
  * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
  */
 static void
 vdev_indirect_repair(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 
 	/* Repair writes are pointless on a pool we cannot write to. */
 	if (!spa_writeable(zio->io_spa))
 		return;
 
 	for (indirect_split_t *split = list_head(&iv->iv_splits);
 	    split != NULL; split = list_next(&iv->iv_splits, split)) {
 		indirect_child_t *good = split->is_good_child;
 
 		for (int c = 0; c < split->is_children; c++) {
 			indirect_child_t *child = &split->is_child[c];
 
 			/*
 			 * Skip the good copy itself, children we have no
 			 * data for, and children whose data was found to
 			 * be a duplicate of the good copy.
 			 */
 			if (child == good || child->ic_data == NULL ||
 			    child->ic_duplicate == good)
 				continue;
 
 			zio_nowait(zio_vdev_child_io(zio, NULL,
 			    child->ic_vdev, split->is_target_offset,
 			    good->ic_data, split->is_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL));
 
 			/*
 			 * An ic_error of ESTALE means this child never had
 			 * a copy of the data, so it is not charged with a
 			 * checksum error.
 			 */
 			if (child->ic_error != ESTALE)
 				vdev_indirect_checksum_error(zio, split, child);
 		}
 	}
 }
 
 /*
  * Charge a checksum error to, and post an ereport for, every child that
  * currently holds read data (ic_data != NULL).  Speculative zios are
  * ignored.
  */
 static void
 vdev_indirect_all_checksum_errors(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 	indirect_split_t *split;
 
 	if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	split = list_head(&iv->iv_splits);
 	while (split != NULL) {
 		for (int c = 0; c < split->is_children; c++) {
 			indirect_child_t *child = &split->is_child[c];
 
 			/* No data was read from this child. */
 			if (child->ic_data == NULL)
 				continue;
 
 			vdev_t *child_vd = child->ic_vdev;
 
 			mutex_enter(&child_vd->vdev_stat_lock);
 			child_vd->vdev_stat.vs_checksum_errors++;
 			mutex_exit(&child_vd->vdev_stat_lock);
 
 			(void) zfs_ereport_post_checksum(zio->io_spa,
 			    child_vd, NULL, zio, split->is_target_offset,
 			    split->is_size, NULL, NULL, NULL);
 		}
 		split = list_next(&iv->iv_splits, split);
 	}
 }
 
 /*
  * Assemble the main zio's buffer from the currently selected good child
  * of every split, then verify the zio's checksum.  Returns the result of
  * zio_checksum_error(): zero if the checksum validates.
  */
 static int
 vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
 {
 	zio_bad_cksum_t zbc;
 	indirect_split_t *split = list_head(&iv->iv_splits);
 
 	while (split != NULL) {
 		ASSERT3P(split->is_good_child->ic_data, !=, NULL);
 		ASSERT3P(split->is_good_child->ic_duplicate, ==, NULL);
 
 		/* Place this split's data at its offset in the main buffer. */
 		abd_copy_off(zio->io_abd, split->is_good_child->ic_data,
 		    split->is_split_offset, 0, split->is_size);
 
 		split = list_next(&iv->iv_splits, split);
 	}
 
 	return (zio_checksum_error(zio, &zbc));
 }
 
 /*
  * Exhaustively try every combination of unique split versions.  Each split
  * starts at the head of its is_unique_child list.  After a failed attempt
  * the first split is advanced to its next unique child; when a split runs
  * off the end of its list it wraps back to the head and the "carry" moves
  * on to the following split (like counting, where each digit may have a
  * different base).  Returns 0 as soon as a combination checksums
  * correctly, or ECKSUM once every combination has been tried.
  */
 static int
 vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
 {
 	boolean_t advanced;
 
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is))
 		is->is_good_child = list_head(&is->is_unique_child);
 
 	iv->iv_attempts = 0;
 
 	do {
 		iv->iv_attempts++;
 
 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
 			return (0);
 
 		/* Step to the next combination, carrying as needed. */
 		advanced = B_FALSE;
 		for (indirect_split_t *is = list_head(&iv->iv_splits);
 		    is != NULL; is = list_next(&iv->iv_splits, is)) {
 			is->is_good_child = list_next(&is->is_unique_child,
 			    is->is_good_child);
 			if (is->is_good_child != NULL) {
 				advanced = B_TRUE;
 				break;
 			}
 
 			/* Wrapped: reset this digit and carry onward. */
 			is->is_good_child = list_head(&is->is_unique_child);
 		}
 	} while (advanced == B_TRUE);
 
 	ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
 
 	return (SET_ERROR(ECKSUM));
 }
 
 /*
  * Used when there are too many combinations to try them all.  Makes up to
  * iv_attempts_max attempts, each time picking a random unique child for
  * every split, and returns 0 on the first combination that checksums
  * correctly.  Returns ECKSUM if none of the attempts succeed.
  */
 static int
 vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
 {
 	iv->iv_attempts = 0;
 
 	while (iv->iv_attempts < iv->iv_attempts_max) {
 		indirect_split_t *split;
 
 		iv->iv_attempts++;
 
 		for (split = list_head(&iv->iv_splits); split != NULL;
 		    split = list_next(&iv->iv_splits, split)) {
 			indirect_child_t *pick =
 			    list_head(&split->is_unique_child);
 			int children = split->is_unique_children;
 			int skip = random_in_range(children);
 
 			/* Walk forward a random number of unique children. */
 			while (skip > 0) {
 				pick = list_next(&split->is_unique_child, pick);
 				skip--;
 			}
 
 			ASSERT3P(pick, !=, NULL);
 			split->is_good_child = pick;
 		}
 
 		if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
 			return (0);
 	}
 
 	return (SET_ERROR(ECKSUM));
 }
 
 /*
  * This is a validation function for reconstruction.  It randomly selects
  * a good combination, if one can be found, and then it intentionally
  * damages all other segment copies by zeroing them.  This forces the
  * reconstruction algorithm to locate the one remaining known good copy.
  *
  * Returns 0 if a good combination was found and the other copies were
  * damaged, EIO if some split has no data at all, or the error from
  * vdev_indirect_splits_enumerate_randomly().  In every case the
  * is_unique_child lists are emptied again before returning.
  */
 static int
 vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
 {
 	int error;
 
 	/* Presume all the copies are unique for initial selection. */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		is->is_unique_children = 0;
 
 		for (int i = 0; i < is->is_children; i++) {
 			indirect_child_t *ic = &is->is_child[i];
 			if (ic->ic_data != NULL) {
 				is->is_unique_children++;
 				list_insert_tail(&is->is_unique_child, ic);
 			}
 		}
 
 		/* No child of this split has any data. */
 		if (list_is_empty(&is->is_unique_child)) {
 			error = SET_ERROR(EIO);
 			goto out;
 		}
 	}
 
 	/*
 	 * Set each is_good_child to a randomly-selected child which
 	 * is known to contain validated data.
 	 */
 	error = vdev_indirect_splits_enumerate_randomly(iv, zio);
 	if (error)
 		goto out;
 
 	/*
 	 * Damage all but the known good copy by zeroing it.  This will
 	 * result in two or less unique copies per indirect_child_t.
 	 * Both may need to be checked in order to reconstruct the block.
 	 * Set iv->iv_attempts_max such that all unique combinations will
 	 * be enumerated, but limit the damage to at most 12 indirect splits.
 	 */
 	iv->iv_attempts_max = 1;
 
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		for (int c = 0; c < is->is_children; c++) {
 			indirect_child_t *ic = &is->is_child[c];
 
 			if (ic == is->is_good_child)
 				continue;
 			if (ic->ic_data == NULL)
 				continue;
 
 			abd_zero(ic->ic_data, abd_get_size(ic->ic_data));
 		}
 
 		/* Two possible versions per damaged split. */
 		iv->iv_attempts_max *= 2;
 		if (iv->iv_attempts_max >= (1ULL << 12)) {
 			iv->iv_attempts_max = UINT64_MAX;
 			break;
 		}
 	}
 
 out:
 	/* Empty the unique children lists so they can be reconstructed. */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		indirect_child_t *ic;
 		while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
 			;
 
 		is->is_unique_children = 0;
 	}
 
 	return (error);
 }
 
 /*
  * This function is called when we have read all copies of the data and need
  * to try to find a combination of copies that gives us the right checksum.
  *
  * If we pointed to any mirror vdevs, this effectively does the job of the
  * mirror.  The mirror vdev code can't do its own job because we don't know
  * the checksum of each split segment individually.
  *
  * We have to try every unique combination of copies of split segments, until
  * we find one that checksums correctly.  Duplicate segment copies are first
  * identified and later skipped during reconstruction.  This optimization
  * reduces the search space and ensures that of the remaining combinations
  * at most one is correct.
  *
  * When the total number of combinations is small they can all be checked.
  * For example, if we have 3 segments in the split, and each points to a
  * 2-way mirror with unique copies, we will have the following pieces of data:
  *
  *       |     mirror child
  * split |     [0]        [1]
  * ======|=====================
  *   A   |  data_A_0   data_A_1
  *   B   |  data_B_0   data_B_1
  *   C   |  data_C_0   data_C_1
  *
  * We will try the following (mirror children)^(number of splits) (2^3=8)
  * combinations, which is similar to bitwise-little-endian counting in
  * binary.  In general each "digit" corresponds to a split segment, and the
  * base of each digit is is_children, which can be different for each
  * digit.
  *
  * "low bit"        "high bit"
  *        v                 v
  * data_A_0 data_B_0 data_C_0
  * data_A_1 data_B_0 data_C_0
  * data_A_0 data_B_1 data_C_0
  * data_A_1 data_B_1 data_C_0
  * data_A_0 data_B_0 data_C_1
  * data_A_1 data_B_0 data_C_1
  * data_A_0 data_B_1 data_C_1
  * data_A_1 data_B_1 data_C_1
  *
  * Note that the split segments may be on the same or different top-level
  * vdevs. In either case, we may need to try lots of combinations (see
  * zfs_reconstruct_indirect_combinations_max).  This ensures that if a mirror
  * has small silent errors on all of its children, we can still reconstruct
  * the correct data, as long as those errors are at sufficiently-separated
  * offsets (specifically, separated by the largest block size - default of
  * 128KB, but up to 16MB).
  */
 static void
 vdev_indirect_reconstruct_io_done(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 	boolean_t known_good = B_FALSE;
 	int error;
 
 	iv->iv_unique_combinations = 1;
 	iv->iv_attempts_max = UINT64_MAX;
 
 	/* A nonzero tunable caps the number of combinations attempted. */
 	if (zfs_reconstruct_indirect_combinations_max > 0)
 		iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
 
 	/*
 	 * If nonzero, every 1/x blocks will be damaged, in order to validate
 	 * reconstruction when there are split segments with damaged copies.
 	 * Known_good will be TRUE when reconstruction is known to be possible.
 	 */
 	if (zfs_reconstruct_indirect_damage_fraction != 0 &&
 	    random_in_range(zfs_reconstruct_indirect_damage_fraction) == 0)
 		known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
 
 	/*
 	 * Determine the unique children for a split segment and add them
 	 * to the is_unique_child list.  By restricting reconstruction
 	 * to these children, only unique combinations will be considered.
 	 * This can vastly reduce the search space when there are a large
 	 * number of indirect splits.
 	 */
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		is->is_unique_children = 0;
 
 		for (int i = 0; i < is->is_children; i++) {
 			indirect_child_t *ic_i = &is->is_child[i];
 
 			/* Skip children with no data or already a duplicate. */
 			if (ic_i->ic_data == NULL ||
 			    ic_i->ic_duplicate != NULL)
 				continue;
 
 			/* Mark later children holding identical data. */
 			for (int j = i + 1; j < is->is_children; j++) {
 				indirect_child_t *ic_j = &is->is_child[j];
 
 				if (ic_j->ic_data == NULL ||
 				    ic_j->ic_duplicate != NULL)
 					continue;
 
 				if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0)
 					ic_j->ic_duplicate = ic_i;
 			}
 
 			is->is_unique_children++;
 			list_insert_tail(&is->is_unique_child, ic_i);
 		}
 
 		/* Reconstruction is impossible, no valid children */
 		EQUIV(list_is_empty(&is->is_unique_child),
 		    is->is_unique_children == 0);
 		if (list_is_empty(&is->is_unique_child)) {
 			zio->io_error = EIO;
 			vdev_indirect_all_checksum_errors(zio);
 			zio_checksum_verified(zio);
 			return;
 		}
 
 		iv->iv_unique_combinations *= is->is_unique_children;
 	}
 
 	/*
 	 * Try every combination when that fits within the attempt limit,
 	 * otherwise sample combinations at random.
 	 */
 	if (iv->iv_unique_combinations <= iv->iv_attempts_max)
 		error = vdev_indirect_splits_enumerate_all(iv, zio);
 	else
 		error = vdev_indirect_splits_enumerate_randomly(iv, zio);
 
 	if (error != 0) {
 		/* All attempted combinations failed. */
 		ASSERT3B(known_good, ==, B_FALSE);
 		zio->io_error = error;
 		vdev_indirect_all_checksum_errors(zio);
 	} else {
 		/*
 		 * The checksum has been successfully validated.  Issue
 		 * repair I/Os to any copies of splits which don't match
 		 * the validated version.
 		 */
 		ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
 		vdev_indirect_repair(zio);
 		zio_checksum_verified(zio);
 	}
 }
 
 /*
  * I/O done hook for indirect vdevs.  If all copies have been read
  * (iv_reconstruct), reconstruction takes over.  For a block that was not
  * split there is nothing to do.  Otherwise the zio's checksum is verified
  * here; on a mismatch every copy of every split is read and the zio is
  * sent back through this stage via zio_vdev_io_redone().
  */
 static void
 vdev_indirect_io_done(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 
 	if (iv->iv_reconstruct) {
 		/*
 		 * We have read all copies of the data (e.g. from mirrors),
 		 * either because this was a scrub/resilver, or because the
 		 * one-copy read didn't checksum correctly.
 		 */
 		vdev_indirect_reconstruct_io_done(zio);
 		return;
 	}
 
 	if (!iv->iv_split_block) {
 		/*
 		 * This was not a split block, so we passed the BP down,
 		 * and the checksum was handled by the (one) child zio.
 		 */
 		return;
 	}
 
 	zio_bad_cksum_t zbc;
 	int ret = zio_checksum_error(zio, &zbc);
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place. The checksum verify error
 	 * will be reported to the top-level VDEV.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
 		zio->io_error = ret;
 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 		zio_dio_chksum_verify_error_report(zio);
 		/* Clearing ret skips the read-all/reconstruct path below. */
 		ret = 0;
 	}
 
 	if (ret == 0) {
 		zio_checksum_verified(zio);
 		return;
 	}
 
 	/*
 	 * The checksum didn't match.  Read all copies of all splits, and
 	 * then we will try to reconstruct.  The next time
 	 * vdev_indirect_io_done() is called, iv_reconstruct will be set.
 	 */
 	vdev_indirect_read_all(zio);
 
 	zio_vdev_io_redone(zio);
 }
 
 /*
  * Operations vector for the indirect vdev type (VDEV_TYPE_INDIRECT).  Only
  * open/close, the size conversion and min_asize hooks, I/O start/done and
  * remap are provided; every other hook is NULL.
  */
 vdev_ops_t vdev_indirect_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_indirect_open,
 	.vdev_op_close = vdev_indirect_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_indirect_io_start,
 	.vdev_op_io_done = vdev_indirect_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = vdev_indirect_remap,
 	.vdev_op_xlate = NULL,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_INDIRECT,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* leaf vdev */
 };
 
 /* Symbols exported from this file. */
 EXPORT_SYMBOL(spa_condense_fini);
 EXPORT_SYMBOL(spa_start_indirect_condensing_thread);
 EXPORT_SYMBOL(spa_condense_indirect_start_sync);
 EXPORT_SYMBOL(spa_condense_init);
 EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete);
 EXPORT_SYMBOL(vdev_indirect_mark_obsolete);
 EXPORT_SYMBOL(vdev_indirect_should_condense);
 EXPORT_SYMBOL(vdev_indirect_sync_obsolete);
 EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
 EXPORT_SYMBOL(vdev_obsolete_sm_object);
 
 /*
  * Module parameters (all ZMOD_RW).  The zfs_condense_* parameters control
  * condensing of indirect vdev mappings; the zfs_reconstruct_* parameter
  * bounds the combinations tried when reconstructing split segments.
  */
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT,
 	ZMOD_RW, "Whether to attempt condensing indirect vdev mappings");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_obsolete_pct, UINT,
 	ZMOD_RW,
 	"Minimum obsolete percent of bytes in the mapping "
 	"to attempt condensing");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, U64, ZMOD_RW,
 	"Don't bother condensing if the mapping uses less than this amount of "
 	"memory");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, U64,
 	ZMOD_RW,
 	"Minimum size obsolete spacemap to attempt condensing");
 
 ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms,
 	UINT, ZMOD_RW,
 	"Used by tests to ensure certain actions happen in the middle of a "
 	"condense. A maximum value of 1 should be sufficient.");
 
 ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max,
 	UINT, ZMOD_RW,
 	"Maximum number of combinations when reconstructing split segments");
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index 9f3dfce01799..a6aee9437066 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -1,1060 +1,1063 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_scan.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/abd.h>
 #include <sys/fs/zfs.h>
 
 /*
  * Vdev mirror kstats
  */
 static kstat_t *mirror_ksp = NULL;
 
 /*
  * Counters published through the "vdev_mirror_stats" kstat.  The static
  * mirror_stats table is initialized positionally, so the field order here
  * must stay in step with that initializer.
  */
 typedef struct mirror_stats {
 	/* Load-calculation outcomes, bumped in vdev_mirror_load(). */
 	kstat_named_t vdev_mirror_stat_rotating_linear;
 	kstat_named_t vdev_mirror_stat_rotating_offset;
 	kstat_named_t vdev_mirror_stat_rotating_seek;
 	kstat_named_t vdev_mirror_stat_non_rotating_linear;
 	kstat_named_t vdev_mirror_stat_non_rotating_seek;
 
 	/* Child-selection outcomes, bumped in vdev_mirror_child_select(). */
 	kstat_named_t vdev_mirror_stat_preferred_found;
 	kstat_named_t vdev_mirror_stat_preferred_not_found;
 } mirror_stats_t;
 
 static mirror_stats_t mirror_stats = {
 	/* New I/O follows directly the last I/O */
 	{ "rotating_linear",			KSTAT_DATA_UINT64 },
 	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
 	{ "rotating_offset",			KSTAT_DATA_UINT64 },
 	/* New I/O requires random seek */
 	{ "rotating_seek",			KSTAT_DATA_UINT64 },
 	/* New I/O follows directly the last I/O  (nonrot) */
 	{ "non_rotating_linear",		KSTAT_DATA_UINT64 },
 	/* New I/O requires random seek (nonrot) */
 	{ "non_rotating_seek",			KSTAT_DATA_UINT64 },
 	/* Preferred child vdev found */
 	{ "preferred_found",			KSTAT_DATA_UINT64 },
 	/* Preferred child vdev not found or equal load  */
 	{ "preferred_not_found",		KSTAT_DATA_UINT64 },
 
 };
 
 /*
  * Accessors for the counters in mirror_stats: MIRROR_STAT names the 64-bit
  * value of a counter, MIRROR_INCR adds to it with atomic_add_64(), and
  * MIRROR_BUMP adds one.
  */
 #define	MIRROR_STAT(stat)		(mirror_stats.stat.value.ui64)
 #define	MIRROR_INCR(stat, val) 		atomic_add_64(&MIRROR_STAT(stat), val)
 #define	MIRROR_BUMP(stat)		MIRROR_INCR(stat, 1)
 
 /*
  * Create the "vdev_mirror_stats" named kstat and, if creation succeeded,
  * back it with the static mirror_stats table and install it.
  */
 void
 vdev_mirror_stat_init(void)
 {
 	const size_t nstats = sizeof (mirror_stats) / sizeof (kstat_named_t);
 
 	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats", "misc",
 	    KSTAT_TYPE_NAMED, nstats, KSTAT_FLAG_VIRTUAL);
 	if (mirror_ksp == NULL)
 		return;
 
 	mirror_ksp->ks_data = &mirror_stats;
 	kstat_install(mirror_ksp);
 }
 
 /*
  * Delete the mirror kstat if one was created and forget the handle so a
  * repeated call does nothing.
  */
 void
 vdev_mirror_stat_fini(void)
 {
 	if (mirror_ksp == NULL)
 		return;
 
 	kstat_delete(mirror_ksp);
 	mirror_ksp = NULL;
 }
 
 /*
  * Virtual device vector for mirroring.
  */
 /* Per-child state kept in a mirror map for the lifetime of one zio. */
 typedef struct mirror_child {
 	/* vdev the child I/O is issued to */
 	vdev_t		*mc_vd;
 	/* per-child buffer, assigned only on the scrub read path */
 	abd_t		*mc_abd;
 	/* offset of the I/O on mc_vd */
 	uint64_t	mc_offset;
 	/* child zio's io_error, or ENXIO/ESTALE when the child is ruled out */
 	int		mc_error;
 	/* value returned by vdev_mirror_load() for this child */
 	int		mc_load;
 	/* child I/O completed, or child is unreadable and will not be tried */
 	uint8_t		mc_tried;
 	/* child was passed over without issuing a read */
 	uint8_t		mc_skipped;
 	/* passed over because the DTL reports the txg as missing */
 	uint8_t		mc_speculative;
 	/* vdev_mirror_rebuilding() returned B_TRUE for mc_vd */
 	uint8_t		mc_rebuilding;
 } mirror_child_t;
 
 /*
  * Mirror map attached to a zio as io_vsd.  It is allocated as one chunk:
  * this header, mm_children child slots, then an array of mm_children ints
  * that mm_preferred points at (see vdev_mirror_map_alloc()).
  */
 typedef struct mirror_map {
 	/* indices into mm_child[] of the lowest-load candidates */
 	int		*mm_preferred;
 	/* number of valid entries in mm_preferred[] */
 	int		mm_preferred_cnt;
 	/* number of entries in mm_child[] */
 	int		mm_children;
 	/* replacing/spare vdev while the pool is resilvering */
 	boolean_t	mm_resilvering;
 	/* at least one child is being rebuilt */
 	boolean_t	mm_rebuilding;
 	/* map was built from a block pointer's DVAs (zio->io_vd == NULL) */
 	boolean_t	mm_root;
 	mirror_child_t	mm_child[];
 } mirror_map_t;
 
 /*
  * Right shift applied to the I/O offset in
  * vdev_mirror_preferred_child_randomize() before taking it modulo the
  * number of equally loaded children.
  */
 static const int vdev_mirror_shift = 21;
 
 /*
  * The load configuration settings below are tuned by default for
  * the case where all devices are of the same rotational type.
  *
  * If there is a mixture of rotating and non-rotating media, setting
  * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
  * as it will direct more reads to the non-rotating vdevs which are more likely
  * to have a higher performance.
  */
 
 /* Rotating media load calculation configuration. */
 static int zfs_vdev_mirror_rotating_inc = 0;
 static int zfs_vdev_mirror_rotating_seek_inc = 5;
 static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
 
 /* Non-rotating media load calculation configuration. */
 static int zfs_vdev_mirror_non_rotating_inc = 0;
 static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
 
 /*
  * Number of bytes needed for a mirror map with the given number of
  * children: the header plus child array, followed by one int per child
  * for the preferred-index array.
  */
 static inline size_t
 vdev_mirror_map_size(int children)
 {
 	size_t child_bytes = offsetof(mirror_map_t, mm_child[children]);
 	size_t preferred_bytes = sizeof (int) * children;
 
 	return (child_bytes + preferred_bytes);
 }
 
 /*
  * Allocate a zeroed mirror map for the given number of children and record
  * the resilvering and root flags.  The allocation sleeps (KM_SLEEP).
  */
 static inline mirror_map_t *
 vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
 {
 	size_t map_bytes = vdev_mirror_map_size(children);
 	mirror_map_t *map = kmem_zalloc(map_bytes, KM_SLEEP);
 
 	/* The preferred-index array sits directly after the child array. */
 	map->mm_preferred = (int *)((uintptr_t)map +
 	    offsetof(mirror_map_t, mm_child[children]));
 	map->mm_root = root;
 	map->mm_resilvering = resilvering;
 	map->mm_children = children;
 
 	return (map);
 }
 
 /*
  * vsd_free callback: release the mirror map stored in zio->io_vsd, using
  * the child count recorded in the map to recompute its size.
  */
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *map = zio->io_vsd;
 	size_t map_bytes = vdev_mirror_map_size(map->mm_children);
 
 	kmem_free(map, map_bytes);
 }
 
 /*
  * VSD callbacks that vdev_mirror_io_start() attaches to each mirror zio;
  * the only one provided frees the mirror map.
  */
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
 	.vsd_free = vdev_mirror_map_free,
 };
 
 /*
  * Compute the load value used to rank vd as a read target for an I/O at
  * zio_offset; lower values are preferred.  The result is the vdev's queue
  * length plus a tunable increment chosen from whether the media rotates
  * and how the offset relates to the last offset queued on the vdev.
  * Root maps always yield INT_MAX.
  */
 static int
 vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
 {
 	/* At the root every DVA carries the same weight. */
 	if (mm->mm_root)
 		return (INT_MAX);
 
 	/*
 	 * Resilvering devices (vdev_resilver_txg != 0) are deliberately not
 	 * given INT_MAX: when tested, overall performance was slightly worse
 	 * with that exclusion than without it.
 	 */
 
 	/* Leaf vdevs need zio_offset adjusted by VDEV_LABEL_START_SIZE. */
 	if (vd->vdev_ops->vdev_op_leaf)
 		zio_offset += VDEV_LABEL_START_SIZE;
 
 	/* The baseline load is the pending queue length. */
 	const int queued = vdev_queue_length(vd);
 	const uint64_t prev_offset = vdev_queue_last_offset(vd);
 	const boolean_t sequential = (prev_offset == zio_offset);
 
 	if (!vd->vdev_nonrot) {
 		/* Rotating media: I/O that directly follows the last one. */
 		if (sequential) {
 			MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
 			return (queued + zfs_vdev_mirror_rotating_inc);
 		}
 
 		/*
 		 * I/O that lands within the seek offset of the last one
 		 * issued to this vdev is charged only half of the seek
 		 * increment, since the seek should be cheaper.
 		 */
 		const int64_t distance = (int64_t)(prev_offset - zio_offset);
 		if (ABS(distance) < zfs_vdev_mirror_rotating_seek_offset) {
 			MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
 			return (queued +
 			    (zfs_vdev_mirror_rotating_seek_inc / 2));
 		}
 
 		/* Everything else pays the full seek increment. */
 		MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
 		return (queued + zfs_vdev_mirror_rotating_seek_inc);
 	}
 
 	/* Non-rotating media. */
 	if (sequential) {
 		MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
 		return (queued + zfs_vdev_mirror_non_rotating_inc);
 	}
 
 	/*
 	 * A seek penalty still applies to non-rotating devices because
 	 * sequential I/Os can be aggregated into fewer device operations,
 	 * which avoids per-command overhead and improves performance.
 	 */
 	MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
 	return (queued + zfs_vdev_mirror_non_rotating_seek_inc);
 }
 
 /*
  * Return B_TRUE when vd is a leaf with a non-zero vdev_rebuild_txg, or when
  * that holds for any vdev beneath it; the subtree is walked recursively.
  */
 static boolean_t
 vdev_mirror_rebuilding(vdev_t *vd)
 {
 	boolean_t found = B_FALSE;
 
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg != 0)
 		found = B_TRUE;
 
 	/* Stop descending as soon as one rebuilding leaf is found. */
 	for (int child = 0; !found && child < vd->vdev_children; child++)
 		found = vdev_mirror_rebuilding(vd->vdev_child[child]);
 
 	return (found);
 }
 
 /*
  * Build the mirror map for a zio.  When zio->io_vd is NULL the children are
  * the top-level vdevs named by the block pointer's DVAs; otherwise they are
  * the children of zio->io_vd.  Returns NULL with zio->io_error set to ENXIO
  * when no valid DVA remains or a DVA's top-level vdev cannot be looked up.
  *
  * Avoid inlining the function to keep vdev_mirror_io_start(), which
  * is this function's only caller, as small as possible on the stack.
  */
 noinline static mirror_map_t *
 vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
 	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 		dva_t dva_copy[SPA_DVAS_PER_BP];
 
 		/*
 		 * The sequential scrub code sorts and issues all DVAs
 		 * of a bp separately. Each of these IOs includes all
 		 * original DVA copies so that repairs can be performed
 		 * in the event of an error, but we only actually want
 		 * to check the first DVA since the others will be
 		 * checked by their respective sorted IOs. Only if we
 		 * hit an error will we try all DVAs upon retrying.
 		 *
 		 * Note: This check is safe even if the user switches
 		 * from a legacy scrub to a sequential one in the middle
 		 * of processing, since scn_is_sorted isn't updated until
 		 * all outstanding IOs from the previous scrub pass
 		 * complete.
 		 */
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
 		    !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
 		    dsl_scan_scrubbing(spa->spa_dsl_pool) &&
 		    scn->scn_is_sorted) {
 			c = 1;
 		} else {
 			c = BP_GET_NDVAS(zio->io_bp);
 		}
 
 		/*
 		 * If the pool cannot be written to, then infer that some
 		 * DVAs might be invalid or point to vdevs that do not exist.
 		 * We skip them.
 		 */
 		if (!spa_writeable(spa)) {
 			ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 			/* Compact the valid DVAs into dva_copy[0..j). */
 			int j = 0;
 			for (int i = 0; i < c; i++) {
 				if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
 					dva_copy[j++] = dva[i];
 			}
 			if (j == 0) {
 				zio->io_vsd = NULL;
 				zio->io_error = ENXIO;
 				return (NULL);
 			}
 			/* Only switch to the copy if something was dropped. */
 			if (j < c) {
 				dva = dva_copy;
 				c = j;
 			}
 		}
 
 		mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 			if (mc->mc_vd == NULL) {
 				/* Map is not yet attached to the zio. */
 				kmem_free(mm, vdev_mirror_map_size(
 				    mm->mm_children));
 				zio->io_vsd = NULL;
 				zio->io_error = ENXIO;
 				return (NULL);
 			}
 		}
 	} else {
 		/*
 		 * If we are resilvering, then we should handle scrub reads
 		 * differently; we shouldn't issue them to the resilvering
 		 * device because it might not have those blocks.
 		 *
 		 * We are resilvering iff:
 		 * 1) We are a replacing vdev (ie our name is "replacing-1" or
 		 *    "spare-1" or something like that), and
 		 * 2) The pool is currently being resilvered.
 		 *
 		 * We cannot simply check vd->vdev_resilver_txg, because it's
 		 * not set in this path.
 		 *
 		 * Nor can we just check our vdev_ops; there are cases (such as
 		 * when a user types "zpool replace pool odev spare_dev" and
 		 * spare_dev is in the spare list, or when a spare device is
 		 * automatically used to replace a DEGRADED device) when
 		 * resilvering is complete but both the original vdev and the
 		 * spare vdev remain in the pool.  That behavior is intentional.
 		 * It helps implement the policy that a spare should be
 		 * automatically removed from the pool after the user replaces
 		 * the device that originally failed.
 		 *
 		 * If a spa load is in progress, then spa_dsl_pool may be
 		 * uninitialized.  But we shouldn't be resilvering during a spa
 		 * load anyway.
 		 */
 		boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
 		    vd->vdev_ops == &vdev_spare_ops) &&
 		    spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
 		    dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
 		mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
 		    B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
 			mc->mc_offset = zio->io_offset;
 
 			/* One rebuilding child marks the whole map. */
 			if (vdev_mirror_rebuilding(mc->mc_vd))
 				mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
 		}
 	}
 
 	return (mm);
 }
 
 /*
  * Open every child and fold their sizes and ashifts into the caller's
  * values: the sizes become the minimum over the children that opened, the
  * logical ashift the maximum, and the physical ashift is combined through
  * vdev_best_ashift().  Returns 0 if at least one child opened, the last
  * child error if none did, or EINVAL when the vdev has no children.
  */
 static int
 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int failed = 0;
 	int last_errno = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	/* First pass: count failures, fold sizes and the logical ashift. */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (child->vdev_open_error == 0) {
 			*asize = MIN(*asize - 1, child->vdev_asize - 1) + 1;
 			*max_asize = MIN(*max_asize - 1,
 			    child->vdev_max_asize - 1) + 1;
 			*logical_ashift = MAX(*logical_ashift,
 			    child->vdev_ashift);
 		} else {
 			last_errno = child->vdev_open_error;
 			failed++;
 		}
 	}
 
 	/* Second pass: needs the final logical ashift from the first. */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (child->vdev_open_error == 0) {
 			*physical_ashift = vdev_best_ashift(*logical_ashift,
 			    *physical_ashift, child->vdev_physical_ashift);
 		}
 	}
 
 	if (failed != vd->vdev_children)
 		return (0);
 
 	/* No child could be opened. */
 	vd->vdev_stat.vs_aux = vdev_children_are_offline(vd) ?
 	    VDEV_AUX_CHILDREN_OFFLINE : VDEV_AUX_NO_REPLICAS;
 	return (last_errno);
 }
 
 /* Close each child of the mirror in index order. */
 static void
 vdev_mirror_close(vdev_t *vd)
 {
 	int idx = 0;
 
 	while (idx < vd->vdev_children) {
 		vdev_close(vd->vdev_child[idx]);
 		idx++;
 	}
 }
 
 /*
  * Completion callback for a child zio: store its error in the mirror child
  * passed as io_private and mark that child as tried and not skipped.
  */
 static void
 vdev_mirror_child_done(zio_t *zio)
 {
 	mirror_child_t *child = zio->io_private;
 
 	child->mc_skipped = 0;
 	child->mc_tried = 1;
 	child->mc_error = zio->io_error;
 }
 
 /*
  * Check the other, lower-index DVAs to see if they're on the same
  * vdev as the child we picked.  If they are, use them since they
  * are likely to have been allocated from the primary metaslab in
  * use at the time, and hence are more likely to have locality with
  * single-copy data.
  */
 static int
 vdev_mirror_dva_select(zio_t *zio, int p)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	dva_t *dva = zio->io_bp->blk_dva;
 	int chosen = mm->mm_preferred[p];
 
 	/*
 	 * Walk the preferred entries below p, from high to low, and move to
 	 * any whose DVA is on the same vdev as the current choice.
 	 */
 	while (--p >= 0) {
 		int candidate = mm->mm_preferred[p];
 
 		if (DVA_GET_VDEV(&dva[candidate]) ==
 		    DVA_GET_VDEV(&dva[chosen]))
 			chosen = candidate;
 	}
 
 	return (chosen);
 }
 
 /*
  * Pick one child when several are tied for the lowest load.  Root maps
  * draw a random preferred slot and refine it with vdev_mirror_dva_select();
  * all other maps derive the slot from the I/O offset.
  */
 static int
 vdev_mirror_preferred_child_randomize(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
 	if (!mm->mm_root) {
 		/*
 		 * Always taking the first matching vdev could cause wear
 		 * leveling problems on SSDs, so the I/O offset serves as a
 		 * pseudo random seed into the lowest-load vdevs.
 		 */
 		int slot = (zio->io_offset >> vdev_mirror_shift) %
 		    mm->mm_preferred_cnt;
 
 		return (mm->mm_preferred[slot]);
 	}
 
 	int slot = random_in_range(mm->mm_preferred_cnt);
 
 	return (vdev_mirror_dva_select(zio, slot));
 }
 
 /*
  * Report whether the child's vdev can be read.  Children whose top-level
  * vdev is a dRAID use the offset-aware vdev_draid_readable() check; every
  * other child uses vdev_readable().
  */
 static boolean_t
 vdev_mirror_child_readable(mirror_child_t *mc)
 {
 	vdev_t *vd = mc->mc_vd;
 	vdev_t *top = vd->vdev_top;
 
 	if (top == NULL || top->vdev_ops != &vdev_draid_ops)
 		return (vdev_readable(vd));
 
 	return (vdev_draid_readable(vd, mc->mc_offset));
 }
 
 /*
  * Report whether the range [txg, txg + size) is missing on the child.
  * Children whose top-level vdev is a dRAID use vdev_draid_missing(); every
  * other child consults its DTL_MISSING list.
  */
 static boolean_t
 vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
 {
 	vdev_t *vd = mc->mc_vd;
 	vdev_t *top = vd->vdev_top;
 
 	if (top == NULL || top->vdev_ops != &vdev_draid_ops)
 		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
 
 	return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
 }
 
 /*
  * Try to find a vdev whose DTL doesn't contain the block we want to read
  * preferring vdevs based on determined load. If we can't, try the read on
  * any vdev we haven't already tried.
  *
  * Distributed spares are an exception to the above load rule. They are
  * always preferred in order to detect gaps in the distributed spare which
  * are created when another disk in the dRAID fails. In order to restore
  * redundancy those gaps must be read to trigger the required repair IO.
  *
  * Returns the index into mm_child[] of the child to read from, or -1 when
  * every child has already been tried.
  */
 static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	uint64_t txg = zio->io_txg;
 	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
 
 	/* Rebuild the preferred list from scratch on every call. */
 	lowest_load = INT_MAX;
 	mm->mm_preferred_cnt = 0;
 	for (c = 0; c < mm->mm_children; c++) {
 		mirror_child_t *mc;
 
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
 
 		if (mc->mc_vd == NULL ||
 		    !vdev_mirror_child_readable(mc)) {
 			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
 
 		/*
 		 * Skipped but not marked tried, so the fallback loop below
 		 * can still pick this child.
 		 */
 		if (vdev_mirror_child_missing(mc, txg, 1)) {
 			mc->mc_error = SET_ERROR(ESTALE);
 			mc->mc_skipped = 1;
 			mc->mc_speculative = 1;
 			continue;
 		}
 
 		/* A distributed spare wins outright; stop scanning. */
 		if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
 			mm->mm_preferred[0] = c;
 			mm->mm_preferred_cnt = 1;
 			break;
 		}
 
 		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
 		if (mc->mc_load > lowest_load)
 			continue;
 
 		/* A strictly lower load discards the candidates so far. */
 		if (mc->mc_load < lowest_load) {
 			lowest_load = mc->mc_load;
 			mm->mm_preferred_cnt = 0;
 		}
 		mm->mm_preferred[mm->mm_preferred_cnt] = c;
 		mm->mm_preferred_cnt++;
 	}
 
 	if (mm->mm_preferred_cnt == 1) {
 		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
 		return (mm->mm_preferred[0]);
 	}
 
 	if (mm->mm_preferred_cnt > 1) {
 		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
 		return (vdev_mirror_preferred_child_randomize(zio));
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
 	for (c = 0; c < mm->mm_children; c++) {
 		if (!mm->mm_child[c].mc_tried)
 			return (c);
 	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
 	 */
 	return (-1);
 }
 
 /*
  * Start a mirror I/O.  Scrub reads (when the map is not resilvering) are
  * issued to every readable child, other reads go to the single child chosen
  * by vdev_mirror_child_select(), and writes go to all children except that
  * sequential-rebuild repair writes skip children that are not rebuilding.
  */
 static void
 vdev_mirror_io_start(zio_t *zio)
 {
 	mirror_map_t *mm;
 	mirror_child_t *mc;
 	int c, children;
 
 	mm = vdev_mirror_map_init(zio);
 	zio->io_vsd = mm;
 	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
 
 	/* vdev_mirror_map_init() has already set zio->io_error. */
 	if (mm == NULL) {
 		ASSERT(!spa_trust_config(zio->io_spa));
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		zio_execute(zio);
 		return;
 	}
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
 			/*
 			 * For scrubbing reads we need to issue reads to all
 			 * children.  One child can reuse parent buffer, but
 			 * for others we have to allocate separate ones to
 			 * verify checksums if io_bp is non-NULL, or compare
 			 * them in vdev_mirror_io_done() otherwise.
 			 */
 			boolean_t first = B_TRUE;
 			for (c = 0; c < mm->mm_children; c++) {
 				mc = &mm->mm_child[c];
 
 				/* Don't issue ZIOs to offline children */
 				if (!vdev_mirror_child_readable(mc)) {
 					mc->mc_error = SET_ERROR(ENXIO);
 					mc->mc_tried = 1;
 					mc->mc_skipped = 1;
 					continue;
 				}
 
 				/* First readable child gets the parent buffer. */
 				mc->mc_abd = first ? zio->io_abd :
 				    abd_alloc_sametype(zio->io_abd,
 				    zio->io_size);
 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 				    mc->mc_vd, mc->mc_offset, mc->mc_abd,
 				    zio->io_size, zio->io_type,
 				    zio->io_priority, 0,
 				    vdev_mirror_child_done, mc));
 				first = B_FALSE;
 			}
 			zio_execute(zio);
 			return;
 		}
 		/*
 		 * For normal reads just pick one child.
 		 */
 		c = vdev_mirror_child_select(zio);
 		/* One child I/O, or none when no child could be selected. */
 		children = (c >= 0);
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 		/*
 		 * Writes go to all children.
 		 */
 		c = 0;
 		children = mm->mm_children;
 	}
 
 	/* Issue to `children` consecutive map entries starting at index c. */
 	while (children--) {
 		mc = &mm->mm_child[c];
 		c++;
 
 		/*
 		 * When sequentially resilvering only issue write repair
 		 * IOs to the vdev which is being rebuilt since performance
 		 * is limited by the slowest child.  This is an issue for
 		 * faster replacement devices such as distributed spares.
 		 */
 		if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
 		    (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 		    !(zio->io_flags & ZIO_FLAG_SCRUB) &&
 		    mm->mm_rebuilding && !mc->mc_rebuilding) {
 			continue;
 		}
 
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Combine the children's errors with zio_worst_error(), keeping errors from
  * speculative children separate from the rest.  The non-speculative result
  * is returned when it is non-zero; otherwise the speculative one is.
  */
 static int
 vdev_mirror_worst_error(mirror_map_t *mm)
 {
 	int worst_real = 0;
 	int worst_speculative = 0;
 
 	for (int idx = 0; idx < mm->mm_children; idx++) {
 		mirror_child_t *child = &mm->mm_child[idx];
 
 		if (child->mc_speculative) {
 			worst_speculative = zio_worst_error(worst_speculative,
 			    child->mc_error);
 		} else {
 			worst_real = zio_worst_error(worst_real,
 			    child->mc_error);
 		}
 	}
 
 	return (worst_real != 0 ? worst_real : worst_speculative);
 }
 
 /*
  * Completion handler for a mirror zio.  Tallies per-child results, retries
  * a read on another child while no good copy exists, picks the buffer to
  * return for scrub reads, and issues repair writes to children that failed
  * or that may hold stale data.
  */
 static void
 vdev_mirror_io_done(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 	mirror_child_t *mc;
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
 	int last_good_copy = -1;
 
 	/* No map was built in vdev_mirror_io_start(); nothing to do. */
 	if (mm == NULL)
 		return;
 
 	/*
 	 * Errors on children that were skipped are expected; only errors
 	 * from children that were actually issued count as unexpected.
 	 */
 	for (c = 0; c < mm->mm_children; c++) {
 		mc = &mm->mm_child[c];
 
 		if (mc->mc_error) {
 			if (!mc->mc_skipped)
 				unexpected_errors++;
 		} else if (mc->mc_tried) {
 			last_good_copy = c;
 			good_copies++;
 		}
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as success.
 		 *
 		 * Now that we support write reallocation, it would be better
 		 * to treat partial failure as real failure unless there are
 		 * no non-degraded top-level vdevs left, and not update DTLs
 		 * if we intend to reallocate.
 		 */
 		if (good_copies != mm->mm_children) {
 			/*
 			 * Always require at least one good copy.
 			 *
 			 * For ditto blocks (io_vd == NULL), require
 			 * all copies to be good.
 			 *
 			 * XXX -- for replacing vdevs, there's no great answer.
 			 * If the old device is really dead, we may not even
 			 * be able to access it -- so we only want to
 			 * require good writes to the new device.  But if
 			 * the new device turns out to be flaky, we want
 			 * to be able to detach it -- which requires all
 			 * writes to the old device to have succeeded.
 			 */
 			if (good_copies == 0 || zio->io_vd == NULL)
 				zio->io_error = vdev_mirror_worst_error(mm);
 		}
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place. The checksum verify error
 	 * will be reported to the top-level Mirror VDEV.
 	 *
 	 * There will be no attempt at reading any additional data copies. If
 	 * the buffer is still being manipulated while attempting to read from
 	 * another child, there exists a possibility that the checksum could be
 	 * verified as valid. However, the buffer contents could again get
 	 * manipulated after verifying the checksum. This would lead to bad data
 	 * being written out during self healing.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_DIO_READ) &&
 	    (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
 		zio_dio_chksum_verify_error_report(zio);
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT3U(zio->io_error, ==, ECKSUM);
 		return;
 	}
 
 	/*
 	 * If we don't have a good copy yet, keep trying other children.
 	 */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
 		ASSERT(c >= 0 && c < mm->mm_children);
 		mc = &mm->mm_child[c];
 		zio_vdev_io_redone(zio);
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, 0,
 		    vdev_mirror_child_done, mc));
 		return;
 	}
 
 	if (zio->io_flags & ZIO_FLAG_SCRUB && !mm->mm_resilvering) {
 		/* Start from the highest-indexed good copy, if any. */
 		abd_t *best_abd = NULL;
 		if (last_good_copy >= 0)
 			best_abd = mm->mm_child[last_good_copy].mc_abd;
 
 		/*
 		 * If we're scrubbing but don't have a BP available (because
 		 * this vdev is under a raidz or draid vdev) then the best we
 		 * can do is compare all of the copies read.  If they're not
 		 * identical then return a checksum error and the most likely
 		 * correct data.  The raidz code will issue a repair I/O if
 		 * possible.
 		 */
 		if (zio->io_bp == NULL) {
 			ASSERT(zio->io_vd->vdev_ops == &vdev_replacing_ops ||
 			    zio->io_vd->vdev_ops == &vdev_spare_ops);
 
 			abd_t *pref_abd = NULL;
 			for (c = 0; c < last_good_copy; c++) {
 				mc = &mm->mm_child[c];
 				if (mc->mc_error || !mc->mc_tried)
 					continue;
 
 				if (abd_cmp(mc->mc_abd, best_abd) != 0)
 					zio->io_error = SET_ERROR(ECKSUM);
 
 				/*
 				 * The distributed spare is always preferred
 				 * by vdev_mirror_child_select() so it's
 				 * considered to be the best candidate.
 				 */
 				if (pref_abd == NULL &&
 				    mc->mc_vd->vdev_ops ==
 				    &vdev_draid_spare_ops)
 					pref_abd = mc->mc_abd;
 
 				/*
 				 * In the absence of a preferred copy, use
 				 * the parent pointer to avoid a memory copy.
 				 */
 				if (mc->mc_abd == zio->io_abd)
 					best_abd = mc->mc_abd;
 			}
 			if (pref_abd)
 				best_abd = pref_abd;
 		} else {
 
 			/*
 			 * If we have a BP available, then checksums are
 			 * already verified and we just need a buffer
 			 * with valid data, preferring parent one to
 			 * avoid a memory copy.
 			 */
 			for (c = 0; c < last_good_copy; c++) {
 				mc = &mm->mm_child[c];
 				if (mc->mc_error || !mc->mc_tried)
 					continue;
 				if (mc->mc_abd == zio->io_abd) {
 					best_abd = mc->mc_abd;
 					break;
 				}
 			}
 		}
 
 		/* Copy the chosen data into the parent, then drop extras. */
 		if (best_abd && best_abd != zio->io_abd)
 			abd_copy(zio->io_abd, best_abd, zio->io_size);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			if (mc->mc_abd != zio->io_abd)
 				abd_free(mc->mc_abd);
 			mc->mc_abd = NULL;
 		}
 	}
 
 	if (good_copies == 0) {
 		zio->io_error = vdev_mirror_worst_error(mm);
 		ASSERT(zio->io_error != 0);
 	}
 
 	if (good_copies && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
 	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (c = 0; c < mm->mm_children; c++) {
 			/*
 			 * Don't rewrite known good children.
 			 * Not only is it unnecessary, it could
 			 * actually be harmful: if the system lost
 			 * power while rewriting the only good copy,
 			 * there would be no good copies left!
 			 */
 			mc = &mm->mm_child[c];
 
 			if (mc->mc_error == 0) {
 				vdev_ops_t *ops = mc->mc_vd->vdev_ops;
 
 				if (mc->mc_tried)
 					continue;
 				/*
 				 * We didn't try this child.  We need to
 				 * repair it if:
 				 * 1. it's a scrub (in which case we have
 				 * tried everything that was healthy)
 				 *  - or -
 				 * 2. it's an indirect or distributed spare
 				 * vdev (in which case it could point to any
 				 * other vdev, which might have a bad DTL)
 				 *  - or -
 				 * 3. the DTL indicates that this data is
 				 * missing from this vdev
 				 */
 				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
 				    ops != &vdev_indirect_ops &&
 				    ops != &vdev_draid_spare_ops &&
 				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
 				mc->mc_error = SET_ERROR(ESTALE);
 			}
 
 			/*
 			 * Repair writes keep rebuild priority when the
 			 * parent has it and are async writes otherwise;
 			 * SELF_HEAL is added only after unexpected errors.
 			 */
 			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 }
 
 /*
  * Set the mirror's state from the number of faulted and degraded children:
  * healthy when there are none, degraded when some but not all children are
  * faulted or any is degraded, and offline or can't-open when every child
  * is faulted.
  */
 static void
 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	if (faulted != vd->vdev_children) {
 		/* At least one child is still usable. */
 		vdev_set_state(vd, B_FALSE,
 		    (degraded + faulted != 0) ?
 		    VDEV_STATE_DEGRADED : VDEV_STATE_HEALTHY,
 		    VDEV_AUX_NONE);
 		return;
 	}
 
 	/* Every child is faulted. */
 	if (vdev_children_are_offline(vd)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
 		    VDEV_AUX_CHILDREN_OFFLINE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 	    VDEV_AUX_NO_REPLICAS);
 }
 
 /*
  * Return the maximum asize for a rebuild zio in the provided range.  The
  * segment limit is rounded up to the vdev's ashift, capped at
  * SPA_MAXBLOCKSIZE and converted to an allocated size; the result never
  * exceeds asize.  The start offset is not used.
  */
 static uint64_t
 vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
     uint64_t max_segment)
 {
 	(void) start;
 
 	uint64_t aligned = P2ROUNDUP(max_segment, 1 << vd->vdev_ashift);
 	uint64_t psize = MIN(aligned, SPA_MAXBLOCKSIZE);
 
 	return (MIN(asize, vdev_psize_to_asize(vd, psize)));
 }
 
 /*
  * Operations vector for VDEV_TYPE_MIRROR vdevs, wired to the vdev_mirror_*
  * callbacks in this file and to vdev_default_* helpers.
  */
 vdev_ops_t vdev_mirror_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Operations vector for VDEV_TYPE_REPLACING vdevs.  It uses the same
  * callbacks as vdev_mirror_ops and differs only in vdev_op_type.
  */
 vdev_ops_t vdev_replacing_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Operations vector for VDEV_TYPE_SPARE vdevs.  It uses the same callbacks
  * as vdev_mirror_ops and differs only in vdev_op_type.
  */
 vdev_ops_t vdev_spare_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_mirror_open,
 	.vdev_op_close = vdev_mirror_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_mirror_io_start,
 	.vdev_op_io_done = vdev_mirror_io_done,
 	.vdev_op_state_change = vdev_mirror_state_change,
 	.vdev_op_need_resilver = vdev_default_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_SPARE,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /*
  * Register the load-calculation tunables read by vdev_mirror_load() as
  * read-write (ZMOD_RW) parameters under the zfs_vdev_mirror scope.
  */
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
 	"Rotating media load increment for non-seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT,
 	ZMOD_RW, "Rotating media load increment for seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT,
 	ZMOD_RW,
 	"Offset in bytes from the last I/O which triggers "
 	"a reduced rotating media seek increment");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT,
 	ZMOD_RW, "Non-rotating media load increment for non-seeking I/Os");
 
 ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT,
 	ZMOD_RW, "Non-rotating media load increment for seeking I/Os");
diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c
index 89786a1dfd96..c62faef2d05c 100644
--- a/module/zfs/vdev_missing.c
+++ b/module/zfs/vdev_missing.c
@@ -1,131 +1,133 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 /*
  * The 'missing' vdev is a special vdev type used only during import.  It
  * signifies a placeholder in the root vdev for some vdev that we know is
  * missing.  We pass it down to the kernel to allow the rest of the
  * configuration to be parsed and an attempt made to open all available
  * devices.  Because its GUID is always 0, we know that the guid sum will
  * mismatch and we won't be able to open the pool anyway.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio.h>
 
 /*
  * Open hook used by the 'missing' and 'hole' vdev types.  Reports a
  * zero-sized device by storing 0 through all four out-parameters and
  * returns 0.
  */
 static int
 vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift, uint64_t *pshift)
 {
 	(void) vd;
 
 	/*
 	 * Failing here would be the honest answer, but it would leave the
 	 * root vdev faulted with VDEV_AUX_NO_REPLICAS rather than the
 	 * VDEV_AUX_BAD_GUID_SUM we actually want reported.  Claim success
 	 * instead: the GUID sum check rejects the pool before anything
 	 * tries to open it.
 	 */
 	*psize = *max_psize = *ashift = *pshift = 0;
 
 	return (0);
 }
 
 /*
  * Close hook used by the 'missing' and 'hole' vdev types.  Intentionally
  * empty; the vdev argument is unused.
  */
 static void
 vdev_missing_close(vdev_t *vd)
 {
 	(void) vd;
 }
 
 /*
  * I/O start hook: no I/O can be serviced here, so record ENOTSUP on the zio
  * and pass it on to zio_execute().
  */
 static void
 vdev_missing_io_start(zio_t *zio)
 {
 	int err = SET_ERROR(ENOTSUP);
 
 	zio->io_error = err;
 	zio_execute(zio);
 }
 
 /*
  * I/O done hook.  Intentionally empty; the zio argument is unused.
  */
 static void
 vdev_missing_io_done(zio_t *zio)
 {
 	(void) zio;
 }
 
 /*
  * Operations vector for VDEV_TYPE_MISSING, a leaf type.  Open, close and the
  * I/O hooks are the vdev_missing_* stubs; size conversion and min-asize use
  * the vdev_default_* helpers.  Entries left NULL are not supplied.
  */
 vdev_ops_t vdev_missing_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_missing_open,
 	.vdev_op_close = vdev_missing_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_missing_io_start,
 	.vdev_op_io_done = vdev_missing_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = NULL,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_MISSING,	/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 /*
  * Operations vector for VDEV_TYPE_HOLE, a leaf type.  Apart from the type
  * name it is populated exactly like vdev_missing_ops: the vdev_missing_*
  * stubs for open, close and I/O, and the vdev_default_* size helpers.
  */
 vdev_ops_t vdev_hole_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_missing_open,
 	.vdev_op_close = vdev_missing_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_missing_io_start,
 	.vdev_op_io_done = vdev_missing_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = NULL,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_HOLE,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index b62dc6b0b91c..62d9c9909bd1 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1,5124 +1,5152 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zap.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_tx.h>
 #include <sys/abd.h>
 #include <sys/zfs_rlock.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_raidz_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/uberblock_impl.h>
 #include <sys/dsl_scan.h>
 
 #ifdef ZFS_DEBUG
 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
 #endif
 
 /*
  * Virtual device vector for RAID-Z.
  *
  * This vdev supports single, double, and triple parity. For single parity,
  * we use a simple XOR of all the data columns. For double or triple parity,
  * we use a special case of Reed-Solomon coding. This extends the
  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  * former is also based. The latter is designed to provide higher performance
  * for writes.
  *
  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  * amended six years later identifying a critical flaw that invalidates its
  * claims. Nevertheless, the technique can be adapted to work for up to
  * triple parity. For additional parity, the amendment "Note: Correction to
  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  * is viable, but the additional complexity means that write performance will
  * suffer.
  *
  * All of the methods above operate on a Galois field, defined over the
  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  * can be expressed with a single byte. Briefly, the operations on the
  * field are defined as follows:
  *
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
  *
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
  *	(A * 2)_4 = A_3 + A_7
  *	(A * 2)_3 = A_2 + A_7
  *	(A * 2)_2 = A_1 + A_7
  *	(A * 2)_1 = A_0
  *	(A * 2)_0 = A_7
  *
  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  * As an aside, this multiplication is derived from the error correcting
  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  *
  * Observe that any number in the field (except for 0) can be expressed as a
  * power of 2 -- a generator for the field. We store a table of the powers of
  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  * than field addition). The inverse of a field element A (A^-1) is therefore
  * A ^ (255 - 1) = A^254.
  *
  * The up-to-three parity columns, P, Q, R over several data columns,
  * D_0, ... D_n-1, can be expressed by field operations:
  *
  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  *
  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  * independent coefficients. (There are no additional coefficients that have
  * this property which is why the uncorrected Plank method breaks down.)
  *
  * See the reconstruction code below for how P, Q and R can be used
  * individually or in concert to recover missing data columns.
  */
 
 /*
  * Names for the values 0, 1 and 2, matching the P, Q and R parity columns
  * described above.  Presumably used to index a row's parity columns --
  * confirm against the users of these constants.
  */
 #define	VDEV_RAIDZ_P		0
 #define	VDEV_RAIDZ_Q		1
 #define	VDEV_RAIDZ_R		2
 
 /*
  * Field multiplication of a single byte value by 2, and by 4 (two successive
  * multiplications by 2).  Note that the argument is evaluated more than once
  * and that the shifted result is not masked back to 8 bits by the macro.
  */
 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 
 /*
  * We provide a mechanism to perform the field multiplication operation on a
  * 64-bit value all at once rather than a byte at a time. This works by
  * creating a mask from the top bit in each byte and using that to
  * conditionally apply the XOR of 0x1d.
  *
  * Both (x) and (mask) are assigned to, so they must be modifiable lvalues.
  * The expansion is a bare { } block, not an expression.
  */
 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
 { \
 	(mask) = (x) & 0x8080808080808080ULL; \
 	(mask) = ((mask) << 1) - ((mask) >> 7); \
 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 }
 
 /* Multiply by 4 by applying VDEV_RAIDZ_64MUL_2 twice in a row. */
 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
 { \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
 
 /*
  * Big Theory Statement for how a RAIDZ VDEV is expanded
  *
  * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
  * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
  * that have been previously expanded can be expanded again.
  *
  * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
  * the VDEV) when an expansion starts.  And the expansion will pause if any
  * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
  * operations on the pool can continue while an expansion is in progress (e.g.
  * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim,
  * and zpool initialize which can't be run during an expansion.  Following a
  * reboot or export/import, the expansion resumes where it left off.
  *
  * == Reflowing the Data ==
  *
  * The expansion involves reflowing (copying) the data from the current set
  * of disks to spread it across the new set which now has one more disk. This
  * reflow operation is similar to reflowing text when the column width of a
  * text editor window is expanded. The text doesn’t change but the location of
  * the text changes to accommodate the new width. An example reflow result for
  * a 4-wide RAIDZ1 to a 5-wide is shown below.
  *
  *                            Reflow End State
  *            Each letter indicates a parity group (logical stripe)
  *
  *         Before expansion                         After Expansion
  *     D1     D2     D3     D4               D1     D2     D3     D4     D5
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
  *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
  *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
  *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
  *  |    13|    14|    15|    16|         |    16|    17|    18|p   19|    20|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
  *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
  *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
  *  +------+------+------+------+         +------+------+------+------+------+
  *  |      |      |      |      |         |      |      |      |      |      |
  *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
  *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
  *  +------+------+------+------+         +------+------+------+------+------+
  *
  * This reflow approach has several advantages. There is no need to read or
  * modify the block pointers or recompute any block checksums.  The reflow
  * doesn’t need to know where the parity sectors reside. We can read and write
  * data sequentially and the copy can occur in a background thread in open
  * context. The design also allows for fast discovery of what data to copy.
  *
  * The VDEV metaslabs are processed, one at a time, to copy the block data to
  * have it flow across all the disks. The metaslab is disabled for allocations
  * during the copy. As an optimization, we only copy the allocated data which
  * can be determined by looking at the metaslab range tree. During the copy we
  * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
  * need to be able to survive losing parity count disks).  This means we
  * cannot overwrite data during the reflow that would be needed if a disk is
  * lost.
  *
  * After the reflow completes, all newly-written blocks will have the new
  * layout, i.e., they will have the parity to data ratio implied by the new
  * number of disks in the RAIDZ group.  Even though the reflow copies all of
  * the allocated space (data and parity), it is only rearranged, not changed.
  *
  * This act of reflowing the data has a few implications about blocks
  * that were written before the reflow completes:
  *
  *  - Old blocks will still use the same amount of space (i.e., they will have
  *    the parity to data ratio implied by the old number of disks in the RAIDZ
  *    group).
  *  - Reading old blocks will be slightly slower than before the reflow, for
  *    two reasons. First, we will have to read from all disks in the RAIDZ
  *    VDEV, rather than being able to skip the children that contain only
  *    parity of this block (because the data of a single block is now spread
  *    out across all the disks).  Second, in most cases there will be an extra
  *    bcopy, needed to rearrange the data back to its original layout in memory.
  *
  * == Scratch Area ==
  *
  * As we copy the block data, we can only progress to the point that writes
  * will not overlap with blocks whose progress has not yet been recorded on
  * disk.  Since partially-copied rows are always read from the old location,
  * we need to stop one row before the sector-wise overlap, to prevent any
  * row-wise overlap. For example, in the diagram above, when we reflow sector
  * B6 it will overwrite the original location for B5.
  *
  * To get around this, a scratch space is used so that we can start copying
  * without risking data loss by overlapping the row. As an added benefit, it
  * improves performance at the beginning of the reflow, but that small perf
  * boost wouldn't be worth the complexity on its own.
  *
  * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
  * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
  * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
  * the widths will likely be single digits so we can get a substantial chunk
  * size using only a few MB of scratch per disk.
  *
  * The scratch area is persisted to disk which holds a large amount of reflowed
  * state. We can always read the partially written stripes when a disk fails or
  * the copy is interrupted (crash) during the initial copying phase and also
  * get past a small chunk size restriction.  At a minimum, the scratch space
  * must be large enough to get us to the point that one row does not overlap
  * itself when moved (i.e., new_width^2).  But going larger is even better. We
  * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
  * as our scratch space to handle overwriting the initial part of the VDEV.
  *
  *	0     256K   512K                    4M
  *	+------+------+-----------------------+-----------------------------
  *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
  *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
  *	+------+------+-----------------------+-------------------------------
  *                        Scratch Area
  *
  * == Reflow Progress Updates ==
  * After the initial scratch-based reflow, the expansion process works
  * similarly to device removal. We create a new open context thread which
  * reflows the data, and periodically kicks off sync tasks to update logical
  * state. In this case, state is the committed progress (offset of next data
  * to copy). We need to persist the completed offset on disk, so that if we
  * crash we know which format each VDEV offset is in.
  *
  * == Time Dependent Geometry ==
  *
  * In non-expanded RAIDZ, blocks are read from disk in a column by column
  * fashion. For a multi-row block, the second sector is in the first column
  * not in the second column. This allows us to issue full reads for each
  * column directly into the request buffer. The block data is thus laid out
  * sequentially in a column-by-column fashion.
  *
  * For example, in the before expansion diagram above, one logical block might
  * be sectors G19-H26. The parity is in G19,H23; and the data is in
  * G20,H24,G21,H25,G22,H26.
  *
  * After a block is reflowed, the sectors that were all in the original column
  * data can now reside in different columns. When reading from an expanded
  * VDEV, we need to know the logical stripe width for each block so we can
  * reconstitute the block’s data after the reads are completed. Likewise,
  * when we perform the combinatorial reconstruction we need to know the
  * original width so we can retry combinations from the past layouts.
  *
  * Time dependent geometry is what we call having blocks with different layouts
  * (stripe widths) in the same VDEV. This time-dependent geometry uses the
  * block’s birth time (+ the time expansion ended) to establish the correct
  * width for a given block. After an expansion completes, we record the time
  * for blocks written with a particular width (geometry).
  *
  * == On Disk Format Changes ==
  *
  * New pool feature flag, 'raidz_expansion' whose reference count is the number
  * of RAIDZ VDEVs that have been expanded.
  *
  * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
  *
  * The uberblock can point to arbitrary blocks, which might be on the
  * expanding RAIDZ, and might or might not have been expanded, so we need to
  * know which way a block is laid out before reading it. This info is the next
  * offset that needs to be reflowed and we persist that in the uberblock, in
  * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
  * After the expansion is complete, we then use the raidz_expand_txgs array
  * (see below) to determine how to read a block and the ub_raidz_reflow_info
  * field is no longer required.
  *
  * The uberblock's ub_raidz_reflow_info field also holds the scratch space
  * state (i.e., active or not) which is also required before reading a block
  * during the initial phase of reflowing the data.
  *
  * The top-level RAIDZ VDEV has two new entries in the nvlist:
  *
  * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
  *                            and used after the expansion is complete to
  *                            determine how to read a raidz block
  * 'raidz_expanding' boolean: present during reflow and removed after completion
  *                            used during a spa import to resume an unfinished
  *                            expansion
  *
  * And finally the VDEVs top zap adds the following informational entries:
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
  *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
  */
 
 /*
  * For testing only: pause the raidz expansion after reflowing this amount.
  * (accessed by ZTS and ztest)
  *
  * The variable is static only when _KERNEL is defined; in other builds it
  * has external linkage.  Defaults to 0.
  */
 #ifdef	_KERNEL
 static
 #endif	/* _KERNEL */
 unsigned long raidz_expand_max_reflow_bytes = 0;
 
 /*
  * For testing only: pause the raidz expansion at a certain point.
  * Defaults to 0.
  */
 uint_t raidz_expand_pause_point = 0;
 
 /*
  * Maximum amount of copy io's outstanding at once.  The default is one
  * SPA_MAXBLOCKSIZE when _ILP32 is defined and ten times that otherwise.
  */
 #ifdef _ILP32
 static unsigned long raidz_expand_max_copy_bytes = SPA_MAXBLOCKSIZE;
 #else
 static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
 #endif
 
 /*
  * Apply raidz map abds aggregation if the number of rows in the map is equal
  * or greater than the value below.
  */
 static unsigned long raidz_io_aggregate_rows = 4;
 
 /*
  * Automatically start a pool scrub when a RAIDZ expansion completes in
  * order to verify the checksums of all blocks which have been copied
  * during the expansion.  Automatic scrubbing is enabled by default (the
  * initial value is 1) and is strongly recommended.
  */
 static int zfs_scrub_after_expand = 1;
 
 /*
  * Release one raidz row: the per-column ABDs, the row's optional
  * rr_abd_empty buffer, and finally the row structure itself.
  */
 static void
 vdev_raidz_row_free(raidz_row_t *rr)
 {
 	/* The size handed to kmem_free() is derived from rr_scols. */
 	size_t rowsz = offsetof(raidz_row_t, rr_col[rr->rr_scols]);
 
 	for (int idx = 0; idx < rr->rr_cols; idx++) {
 		raidz_col_t *col = rr->rr_col + idx;
 
 		/* rc_abd is only released for columns with a non-zero size. */
 		if (col->rc_size != 0) {
 			abd_free(col->rc_abd);
 		}
 		if (col->rc_orig_data != NULL) {
 			abd_free(col->rc_orig_data);
 		}
 	}
 
 	if (rr->rr_abd_empty != NULL) {
 		abd_free(rr->rr_abd_empty);
 	}
 
 	kmem_free(rr, rowsz);
 }
 
 /*
  * Free a raidz map: every row it holds, the physical column array (and any
  * ABDs attached to its entries) if one was set up, and the map itself.
  * rm_lr is asserted to be NULL at this point.
  */
 void
 vdev_raidz_map_free(raidz_map_t *rm)
 {
 	int row = 0;
 
 	while (row < rm->rm_nrows) {
 		vdev_raidz_row_free(rm->rm_row[row]);
 		row++;
 	}
 
 	if (rm->rm_nphys_cols != 0) {
 		raidz_col_t *phys = rm->rm_phys_col;
 
 		for (int pc = 0; pc < rm->rm_nphys_cols; pc++) {
 			abd_t *abd = phys[pc].rc_abd;
 
 			if (abd != NULL)
 				abd_free(abd);
 		}
 
 		kmem_free(phys, sizeof (raidz_col_t) * rm->rm_nphys_cols);
 	}
 
 	ASSERT3P(rm->rm_lr, ==, NULL);
 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
 }
 
 /*
  * vsd_free callback: the zio's io_vsd is the raidz map, so hand it to
  * vdev_raidz_map_free().
  */
 static void
 vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	vdev_raidz_map_free(zio->io_vsd);
 }
 
 /*
  * Comparison callback for reflow_node_t entries: orders two nodes by their
  * re_txg field using TREE_CMP.
  */
 static int
 vdev_raidz_reflow_compare(const void *x1, const void *x2)
 {
 	const reflow_node_t *lhs = x1;
 	const reflow_node_t *rhs = x2;
 	int order = TREE_CMP(lhs->re_txg, rhs->re_txg);
 
 	return (order);
 }
 
 /*
  * zio vsd operations for raidz: the only hook provided is vsd_free, which
  * releases the raidz map through vdev_raidz_map_free_vsd().
  */
 const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 	.vsd_free = vdev_raidz_map_free_vsd,
 };
 
 /*
  * Allocate a zeroed raidz row with room for 'cols' columns.  Both rr_cols
  * and rr_scols start out equal to 'cols', and every column gets its shadow
  * device index / offset set to INT_MAX / UINT64_MAX.  Returns the new row.
  */
 raidz_row_t *
 vdev_raidz_row_alloc(int cols, zio_t *zio)
 {
 	size_t rowsz = offsetof(raidz_row_t, rr_col[cols]);
 	raidz_row_t *row = kmem_zalloc(rowsz, KM_SLEEP);
 
 	/*
 	 * Self healing must not happen for Direct I/O reads.  Nothing keeps
 	 * the caller's buffer stable while the I/O is in flight, so the
 	 * checksum could verify and the buffer then be modified, which would
 	 * let bad data be written out by a repair.  The zio's flags do not
 	 * change below, so the decision is made once for all columns.
 	 */
 	const int repair_ok = (zio->io_flags & ZIO_FLAG_DIO_READ) == 0;
 
 	row->rr_cols = cols;
 	row->rr_scols = cols;
 
 	for (int idx = 0; idx < cols; idx++) {
 		raidz_col_t *col = row->rr_col + idx;
 
 		col->rc_shadow_devidx = INT_MAX;
 		col->rc_shadow_offset = UINT64_MAX;
 		if (repair_ok)
 			col->rc_allow_repair = 1;
 	}
 
 	return (row);
 }
 
 /*
  * Attach ABDs to every column of the single row of a write map.
  *
  * Parity columns (indices below rr_firstdatacol) get newly allocated linear
  * ABDs.  Data columns reference successive ranges of zio->io_abd.  Up to
  * rm_nskip columns are additionally extended by one zeroed sector of
  * (1 << ashift) bytes: parity columns by allocating a larger linear ABD,
  * data columns by wrapping the data and a zero ABD in a gang ABD.  rc_size
  * itself is never changed here.
  *
  *   zio    - the write zio whose io_abd supplies the data
  *   rm     - the map to populate; must contain exactly one row
  *   ashift - log2 of the sector size used for the skip sectors
  */
 static void
 vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
 {
 	int c;
 	int nwrapped = 0;
 	uint64_t off = 0;
 	raidz_row_t *rr = rm->rm_row[0];
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/*
 	 * Pad any parity columns with additional space to account for skip
 	 * sectors.
 	 */
 	if (rm->rm_skipstart < rr->rr_firstdatacol) {
 		ASSERT0(rm->rm_skipstart);
 		nwrapped = rm->rm_nskip;
 	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
 		nwrapped =
 		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
 	}
 
 	/*
 	 * Optional single skip sectors (rc_size == 0) will be handled in
 	 * vdev_raidz_io_start_write().
 	 */
 	int skipped = rr->rr_scols - rr->rr_cols;
 
 	/* Allocate buffers for the parity columns */
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		/*
 		 * Parity columns will pad out a linear ABD to account for
 		 * the skip sector. A linear ABD is used here because
 		 * parity calculations use the ABD buffer directly to calculate
 		 * parity. This avoids doing a memcpy back to the ABD after the
 		 * parity has been calculated. By issuing the parity column
 		 * with the skip sector we can reduce contention on the child
 		 * VDEV queue locks (vq_lock).
 		 */
 		if (c < nwrapped) {
 			rc->rc_abd = abd_alloc_linear(
 			    rc->rc_size + (1ULL << ashift), B_FALSE);
 			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
 			skipped++;
 		} else {
 			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 		}
 	}
 
 	/* c continues from the first data column; off tracks the io_abd offset */
 	for (off = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
 		    zio->io_abd, off, rc->rc_size);
 
 		/*
 		 * Generate I/O for skip sectors to improve aggregation
 		 * continuity. We will use gang ABD's to reduce contention
 		 * on the child VDEV queue locks (vq_lock) by issuing
 		 * a single I/O that contains the data and skip sector.
 		 *
 		 * It is important to make sure that rc_size is not updated
 		 * even though we are adding a skip sector to the ABD. When
 		 * calculating the parity in vdev_raidz_generate_parity_row()
 		 * the rc_size is used to iterate through the ABD's. We can
 		 * not have zero'd out skip sectors used for calculating
 		 * parity for raidz, because those same sectors are not used
 		 * during reconstruction.
 		 */
 		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
 			rc->rc_abd = abd_alloc_gang();
 			abd_gang_add(rc->rc_abd, abd, B_TRUE);
 			abd_gang_add(rc->rc_abd,
 			    abd_get_zeros(1ULL << ashift), B_TRUE);
 			skipped++;
 		} else {
 			rc->rc_abd = abd;
 		}
 		off += rc->rc_size;
 	}
 
 	/* All of the zio's data and every skip sector must be accounted for. */
 	ASSERT3U(off, ==, zio->io_size);
 	ASSERT3S(skipped, ==, rm->rm_nskip);
 }
 
 /*
  * Attach ABDs to every column of the single row of a read map.  Parity
  * columns receive their own linear buffers; data columns are views into
  * zio->io_abd at consecutive offsets, each rc_size bytes long.
  */
 static void
 vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
 {
 	raidz_row_t *row = rm->rm_row[0];
 	int col = 0;
 
 	ASSERT3U(rm->rm_nrows, ==, 1);
 
 	/* Parity columns: private linear buffers. */
 	while (col < row->rr_firstdatacol) {
 		raidz_col_t *pc = &row->rr_col[col];
 
 		pc->rc_abd = abd_alloc_linear(pc->rc_size, B_FALSE);
 		col++;
 	}
 
 	/* Data columns: carve the caller's buffer up in column order. */
 	uint64_t data_off = 0;
 	for (; col < row->rr_cols; col++) {
 		raidz_col_t *dc = &row->rr_col[col];
 
 		dc->rc_abd = abd_get_offset_struct(&dc->rc_abdstruct,
 		    zio->io_abd, data_off, dc->rc_size);
 		data_off += dc->rc_size;
 	}
 }
 
 /*
  * Divides the IO evenly across all child vdevs; usually, dcols is
  * the number of children in the target vdev.
  *
  * Avoid inlining the function to keep vdev_raidz_io_start(), which
  * is this function's only caller, as small as possible on the stack.
  *
  *   zio     - the I/O being mapped; io_offset, io_size and io_type are read
  *   ashift  - log2 of the sector size used for all sector arithmetic
  *   dcols   - number of columns the I/O is spread over
  *   nparity - number of parity columns per row
  *
  * Returns a newly allocated map containing exactly one row, with column
  * buffers set up by vdev_raidz_map_alloc_write() or
  * vdev_raidz_map_alloc_read() depending on the zio type, and rm_ops taken
  * from vdev_raidz_math_get_ops().
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
     uint64_t nparity)
 {
 	raidz_row_t *rr;
 	/* The starting RAIDZ (parent) vdev sector of the block. */
 	uint64_t b = zio->io_offset >> ashift;
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = zio->io_size >> ashift;
 	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
 	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << ashift;
 	uint64_t acols, scols;
 
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
 	rm->rm_nrows = 1;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 */
 	uint64_t q = s / (dcols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (dcols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/*
 	 * acols: The columns that will be accessed.
 	 * scols: The columns that will be accessed or skipped.
 	 */
 	if (q == 0) {
 		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
 		acols = dcols;
 		scols = dcols;
 	}
 
 	ASSERT3U(acols, <=, scols);
 	/* The row is sized for scols; rr_cols is then narrowed to acols. */
 	rr = vdev_raidz_row_alloc(scols, zio);
 	rm->rm_row[0] = rr;
 	rr->rr_cols = acols;
 	rr->rr_bigcols = bc;
 	rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 	rr->rr_offset = zio->io_offset;
 	rr->rr_size = zio->io_size;
 #endif
 
 	uint64_t asize = 0;
 
 	for (uint64_t c = 0; c < scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		uint64_t col = f + c;
 		uint64_t coff = o;
 		/* Wrapping past the last child moves one sector further in. */
 		if (col >= dcols) {
 			col -= dcols;
 			coff += 1ULL << ashift;
 		}
 		rc->rc_devidx = col;
 		rc->rc_offset = coff;
 
 		if (c >= acols)
 			rc->rc_size = 0;
 		else if (c < bc)
 			rc->rc_size = (q + 1) << ashift;
 		else
 			rc->rc_size = q << ashift;
 
 		asize += rc->rc_size;
 	}
 
 	ASSERT3U(asize, ==, tot << ashift);
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
 	 * will always be on the same device and, since parity isn't read
 	 * during normal operation, that device's I/O bandwidth won't be
 	 * used effectively. We therefore switch the parity every 1MB.
 	 *
 	 * ... at least that was, ostensibly, the theory. As a practical
 	 * matter unless we juggle the parity between all devices evenly, we
 	 * won't see any benefit. Further, occasional writes that aren't a
 	 * multiple of the LCM of the number of children and the minimum
 	 * stripe width are sufficient to avoid pessimal behavior.
 	 * Unfortunately, this decision created an implicit on-disk format
 	 * requirement that we need to support for all eternity, but only
 	 * for single-parity RAID-Z.
 	 *
 	 * If we intend to skip a sector in the zeroth column for padding
 	 * we must make sure to note this swap. We will never intend to
 	 * skip the first column since at least one data and one parity
 	 * column must appear in each row.
 	 */
 	ASSERT(rr->rr_cols >= 2);
 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 	/*
 	 * Bit 20 of the offset selects alternating 1MB regions; when it is
 	 * set, columns 0 and 1 trade device index and offset.  'o' is reused
 	 * as scratch for the swap.
 	 */
 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
 		uint64_t devidx = rr->rr_col[0].rc_devidx;
 		o = rr->rr_col[0].rc_offset;
 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 		rr->rr_col[1].rc_devidx = devidx;
 		rr->rr_col[1].rc_offset = o;
 		if (rm->rm_skipstart == 0)
 			rm->rm_skipstart = 1;
 	}
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_map_alloc_write(zio, rm, ashift);
 	} else {
 		vdev_raidz_map_alloc_read(zio, rm);
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Build the raidz map for a zio whose block has a logical width that may
  * differ from the physical width of the vdev (due to RAIDZ expansion).  Each
  * logical row of the block becomes one raidz_row_t in the returned map.
  *
  * Everything before reflow_offset_synced should have been moved to the new
  * location (read and write completed).  However, this may not yet be reflected
  * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
  * uberblock has not yet been written). If reflow is not in progress,
  * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
  * entirely before reflow_offset_synced, it will come from the new location.
  * Otherwise this row will come from the old location.  Therefore, rows that
  * straddle the reflow_offset_synced will come from the old location.
  *
  * For writes, reflow_offset_next is the next offset to copy.  If a sector has
  * been copied, but not yet reflected in the on-disk progress
  * (reflow_offset_synced), it will also be written to the new (already copied)
  * offset.
  *
  * Arguments:
  *   zio           - the i/o; io_abd, io_offset and io_size are read here.
  *   ashift        - sizes and offsets are converted to/from sectors by
  *                   shifting by this amount.
  *   physical_cols - number of children used to place rows at the new
  *                   location; rows at the old location are placed across
  *                   physical_cols - 1 children.
  *   logical_cols  - row width (parity + data) used to split the block into
  *                   rows.
  *   nparity       - number of parity columns at the start of each row.
  *   reflow_offset_synced, reflow_offset_next - reflow progress, see above.
  *   use_scratch   - when set, rows entirely before reflow_offset_synced have
  *                   their offsets (and shadow offsets) reduced by
  *                   VDEV_BOOT_SIZE.
  *
  * Returns a map allocated with kmem_zalloc(KM_SLEEP); rm_ops is initialized
  * before returning.
  */
 noinline raidz_map_t *
 vdev_raidz_map_alloc_expanded(zio_t *zio,
     uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
     uint64_t nparity, uint64_t reflow_offset_synced,
     uint64_t reflow_offset_next, boolean_t use_scratch)
 {
 	abd_t *abd = zio->io_abd;
 	uint64_t offset = zio->io_offset;
 	uint64_t size = zio->io_size;
 
 	/* The zio's size in units of the vdev's minimum sector size. */
 	uint64_t s = size >> ashift;
 
 	/*
 	 * "Quotient": The number of data sectors for this stripe on all but
 	 * the "big column" child vdevs that also contain "remainder" data.
 	 * AKA "full rows"
 	 */
 	uint64_t q = s / (logical_cols - nparity);
 
 	/*
 	 * "Remainder": The number of partial stripe data sectors in this I/O.
 	 * This will add a sector to some, but not all, child vdevs.
 	 */
 	uint64_t r = s - q * (logical_cols - nparity);
 
 	/* The number of "big columns" - those which contain remainder data. */
 	uint64_t bc = (r == 0 ? 0 : r + nparity);
 
 	/*
 	 * The total number of data and parity sectors associated with
 	 * this I/O.
 	 */
 	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
 	/* How many rows contain data (not skip) */
 	uint64_t rows = howmany(tot, logical_cols);
 	/*
 	 * Width given to every row of the map.  A block smaller than one
 	 * full row gets exactly `tot` columns.
 	 */
 	int cols = MIN(tot, logical_cols);
 
 	/* The map is sized to hold one row pointer per row. */
 	raidz_map_t *rm =
 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
 	    KM_SLEEP);
 	rm->rm_nrows = rows;
 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 	rm->rm_skipstart = bc;
 	/* Running total of column sizes, checked against `tot` below. */
 	uint64_t asize = 0;
 
 	for (uint64_t row = 0; row < rows; row++) {
 		boolean_t row_use_scratch = B_FALSE;
 		raidz_row_t *rr = vdev_raidz_row_alloc(cols, zio);
 		rm->rm_row[row] = rr;
 
 		/* The starting RAIDZ (parent) vdev sector of the row. */
 		uint64_t b = (offset >> ashift) + row * logical_cols;
 
 		/*
 		 * If we are in the middle of a reflow, and the copying has
 		 * not yet completed for any part of this row, then use the
 		 * old location of this row.  Note that reflow_offset_synced
 		 * reflects the i/o that's been completed, because it's
 		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
 		 * This is sufficient for our check, even if that progress
 		 * has not yet been recorded to disk (reflected in
 		 * spa_ubsync).  Also note that we consider the last row to
 		 * be "full width" (`cols`-wide rather than `bc`-wide) for
 		 * this calculation. This causes a tiny bit of unnecessary
 		 * double-writes but is safe and simpler to calculate.
 		 */
 		int row_phys_cols = physical_cols;
 		if (b + cols > reflow_offset_synced >> ashift)
 			row_phys_cols--;
 		else if (use_scratch)
 			row_use_scratch = B_TRUE;
 
 		/* starting child of this row */
 		uint64_t child_id = b % row_phys_cols;
 		/* The starting byte offset on each child vdev. */
 		uint64_t child_offset = (b / row_phys_cols) << ashift;
 
 		/*
 		 * Note, rr_cols is the entire width of the block, even
 		 * if this row is shorter.  This is needed because parity
 		 * generation (for Q and R) needs to know the entire width,
 		 * because it treats the short row as though it was
 		 * full-width (and the "phantom" sectors were zero-filled).
 		 *
 		 * Another approach to this would be to set cols shorter
 		 * (to just the number of columns that we might do i/o to)
 		 * and have another mechanism to tell the parity generation
 		 * about the "entire width".  Reconstruction (at least
 		 * vdev_raidz_reconstruct_general()) would also need to
 		 * know about the "entire width".
 		 */
 		rr->rr_firstdatacol = nparity;
 #ifdef ZFS_DEBUG
 		/*
 		 * note: rr_size is PSIZE, not ASIZE
 		 */
 		rr->rr_offset = b << ashift;
 		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
 #endif
 
 		/*
 		 * Walk the columns of this row, advancing to the next child
 		 * for each column and wrapping to the next sector on the
 		 * first child once the last child has been used.
 		 */
 		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
 			if (child_id >= row_phys_cols) {
 				child_id -= row_phys_cols;
 				child_offset += 1ULL << ashift;
 			}
 			raidz_col_t *rc = &rr->rr_col[c];
 			rc->rc_devidx = child_id;
 			rc->rc_offset = child_offset;
 
 			/*
 			 * Get this from the scratch space if appropriate.
 			 * This only happens if we crashed in the middle of
 			 * raidz_reflow_scratch_sync() (while it's running,
 			 * the rangelock prevents us from doing concurrent
 			 * io), and even then only during zpool import or
 			 * when the pool is imported readonly.
 			 */
 			if (row_use_scratch)
 				rc->rc_offset -= VDEV_BOOT_SIZE;
 
 			/*
 			 * Index of this column among the data columns.  It
 			 * is only used in the data-column branch below, i.e.
 			 * when c >= rr_firstdatacol.
 			 */
 			uint64_t dc = c - rr->rr_firstdatacol;
 			if (c < rr->rr_firstdatacol) {
 				rc->rc_size = 1ULL << ashift;
 
 				/*
 				 * Parity sectors' rc_abd's are set below
 				 * after determining if this is an aggregation.
 				 */
 			} else if (row == rows - 1 && bc != 0 && c >= bc) {
 				/*
 				 * Past the end of the block (even including
 				 * skip sectors).  This sector is part of the
 				 * map so that we have full rows for p/q parity
 				 * generation.
 				 */
 				rc->rc_size = 0;
 				rc->rc_abd = NULL;
 			} else {
 				/* "data column" (col excluding parity) */
 				uint64_t off;
 
 				/*
 				 * Sector index of this column's data within
 				 * the zio's abd.  The first `r` data columns
 				 * each account for `rows` sectors and the
 				 * remaining ones for `rows - 1` sectors.
 				 */
 				if (c < bc || r == 0) {
 					off = dc * rows + row;
 				} else {
 					off = r * rows +
 					    (dc - r) * (rows - 1) + row;
 				}
 				rc->rc_size = 1ULL << ashift;
 				rc->rc_abd = abd_get_offset_struct(
 				    &rc->rc_abdstruct, abd, off << ashift,
 				    rc->rc_size);
 			}
 
 			if (rc->rc_size == 0)
 				continue;
 
 			/*
 			 * If any part of this row is in both old and new
 			 * locations, the primary location is the old
 			 * location. If this sector was already copied to the
 			 * new location, we need to also write to the new,
 			 * "shadow" location.
 			 *
 			 * Note, `row_phys_cols != physical_cols` indicates
 			 * that the primary location is the old location.
 			 * `b+c < reflow_offset_next` indicates that the copy
 			 * to the new location has been initiated. We know
 			 * that the copy has completed because we have the
 			 * rangelock, which is held exclusively while the
 			 * copy is in progress.
 			 */
 			if (row_use_scratch ||
 			    (row_phys_cols != physical_cols &&
 			    b + c < reflow_offset_next >> ashift)) {
 				rc->rc_shadow_devidx = (b + c) % physical_cols;
 				rc->rc_shadow_offset =
 				    ((b + c) / physical_cols) << ashift;
 				if (row_use_scratch)
 					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
 			}
 
 			asize += rc->rc_size;
 		}
 
 		/*
 		 * See comment in vdev_raidz_map_alloc()
 		 */
 		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
 		    (offset & (1ULL << 20))) {
 			ASSERT(rr->rr_cols >= 2);
 			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
 
 			/* Swap both the primary and shadow locations. */
 			int devidx0 = rr->rr_col[0].rc_devidx;
 			uint64_t offset0 = rr->rr_col[0].rc_offset;
 			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
 			uint64_t shadow_offset0 =
 			    rr->rr_col[0].rc_shadow_offset;
 
 			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
 			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
 			rr->rr_col[0].rc_shadow_devidx =
 			    rr->rr_col[1].rc_shadow_devidx;
 			rr->rr_col[0].rc_shadow_offset =
 			    rr->rr_col[1].rc_shadow_offset;
 
 			rr->rr_col[1].rc_devidx = devidx0;
 			rr->rr_col[1].rc_offset = offset0;
 			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
 			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
 		}
 	}
 	ASSERT3U(asize, ==, tot << ashift);
 
 	/*
 	 * Determine if the block is contiguous, in which case we can use
 	 * an aggregation.
 	 */
 	if (rows >= raidz_io_aggregate_rows) {
 		rm->rm_nphys_cols = physical_cols;
 		rm->rm_phys_col =
 		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
 		    KM_SLEEP);
 
 		/*
 		 * Determine the aggregate io's offset and size, and check
 		 * that the io is contiguous.
 		 *
 		 * The `break` below only leaves the column loop; the
 		 * rm_phys_col != NULL test in the row loop's condition is
 		 * what stops the scan once aggregation has been abandoned.
 		 */
 		for (int i = 0;
 		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 
 				if (rc->rc_size == 0)
 					continue;
 
 				if (prc->rc_size == 0) {
 					ASSERT0(prc->rc_offset);
 					prc->rc_offset = rc->rc_offset;
 				} else if (prc->rc_offset + prc->rc_size !=
 				    rc->rc_offset) {
 					/*
 					 * This block is not contiguous and
 					 * therefore can't be aggregated.
 					 * This is expected to be rare, so
 					 * the cost of allocating and then
 					 * freeing rm_phys_col is not
 					 * significant.
 					 */
 					kmem_free(rm->rm_phys_col,
 					    sizeof (raidz_col_t) *
 					    rm->rm_nphys_cols);
 					rm->rm_phys_col = NULL;
 					rm->rm_nphys_cols = 0;
 					break;
 				}
 				prc->rc_size += rc->rc_size;
 			}
 		}
 	}
 	if (rm->rm_phys_col != NULL) {
 		/*
 		 * Allocate aggregate ABD's.
 		 */
 		for (int i = 0; i < rm->rm_nphys_cols; i++) {
 			raidz_col_t *prc = &rm->rm_phys_col[i];
 
 			prc->rc_devidx = i;
 
 			if (prc->rc_size == 0)
 				continue;
 
 			prc->rc_abd =
 			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
 			    B_FALSE);
 		}
 
 		/*
 		 * Point the parity abd's into the aggregate abd's.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				raidz_col_t *prc =
 				    &rm->rm_phys_col[rc->rc_devidx];
 				rc->rc_abd =
 				    abd_get_offset_struct(&rc->rc_abdstruct,
 				    prc->rc_abd,
 				    rc->rc_offset - prc->rc_offset,
 				    rc->rc_size);
 			}
 		}
 	} else {
 		/*
 		 * Allocate new abd's for the parity sectors.
 		 */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			for (int c = 0; c < rr->rr_firstdatacol; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				rc->rc_abd =
 				    abd_alloc_linear(rc->rc_size,
 				    B_TRUE);
 			}
 		}
 	}
 	/* init RAIDZ parity ops */
 	rm->rm_ops = vdev_raidz_math_get_ops();
 
 	return (rm);
 }
 
 /*
  * Cursor state handed (as the `private` argument) to vdev_raidz_p_func(),
  * vdev_raidz_pq_func() and vdev_raidz_pqr_func().  Each pointer is a write
  * position in the corresponding parity buffer and is advanced by the
  * callback; parities that are not being generated are left NULL.
  */
 struct pqr_struct {
 	uint64_t *p;
 	uint64_t *q;
 	uint64_t *r;
 };
 
 /*
  * abd iteration callback: XOR one chunk of a data column into P.
  *
  * `private` is a struct pqr_struct with only `p` set.  The `p` cursor is
  * left pointing just past the words consumed.  Always returns 0.
  */
 static int
 vdev_raidz_p_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *src = buf;
 	int nwords = size / sizeof (src[0]);
 
 	ASSERT(pqr->p && !pqr->q && !pqr->r);
 
 	for (int w = 0; w < nwords; w++) {
 		*pqr->p ^= src[w];
 		pqr->p++;
 	}
 
 	return (0);
 }
 
 /*
  * abd iteration callback: fold one chunk of a data column into P and Q.
  *
  * For each 64-bit word, P is XORed with the data, and Q is first passed
  * through VDEV_RAIDZ_64MUL_2() and then XORed with the data.  `private` is a
  * struct pqr_struct with `p` and `q` set; both cursors are advanced past the
  * words consumed.  Always returns 0.
  */
 static int
 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *src = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (src[0]);
 
 	ASSERT(pqr->p && pqr->q && !pqr->r);
 
 	for (int w = 0; w < nwords; w++) {
 		*pqr->p ^= src[w];
 
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= src[w];
 
 		pqr->p++;
 		pqr->q++;
 	}
 
 	return (0);
 }
 
 /*
  * abd iteration callback: fold one chunk of a data column into P, Q and R.
  *
  * For each 64-bit word, P is XORed with the data; Q and R are first passed
  * through VDEV_RAIDZ_64MUL_2() and VDEV_RAIDZ_64MUL_4() respectively and then
  * XORed with the data.  `private` is a struct pqr_struct with all three
  * cursors set; each is advanced past the words consumed.  Always returns 0.
  */
 static int
 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 {
 	struct pqr_struct *pqr = private;
 	const uint64_t *src = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (src[0]);
 
 	ASSERT(pqr->p && pqr->q && pqr->r);
 
 	for (int w = 0; w < nwords; w++) {
 		*pqr->p ^= src[w];
 
 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 		*pqr->q ^= src[w];
 
 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 		*pqr->r ^= src[w];
 
 		pqr->p++;
 		pqr->q++;
 		pqr->r++;
 	}
 
 	return (0);
 }
 
 /*
  * Generate single (P) parity for a row: P starts as a copy of the first data
  * column and every later data column is XORed into it.
  */
 static void
 vdev_raidz_generate_parity_p(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	int c = rr->rr_firstdatacol;
 
 	/* The first data column seeds P. */
 	if (c < rr->rr_cols) {
 		abd_copy_to_buf(p, rr->rr_col[c].rc_abd,
 		    rr->rr_col[c].rc_size);
 		c++;
 	}
 
 	/* Every remaining data column is accumulated on top of it. */
 	for (; c < rr->rr_cols; c++) {
 		struct pqr_struct pqr = { p, NULL, NULL };
 
 		(void) abd_iterate_func(rr->rr_col[c].rc_abd, 0,
 		    rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr);
 	}
 }
 
 /*
  * Generate double (P and Q) parity for a row.
  *
  * The first data column is copied into both P and Q (zero-padding them if
  * that column is empty).  Each later data column is folded in through
  * vdev_raidz_pq_func(); for the words a short column does not cover, Q is
  * still advanced as if the column held zeros.
  */
 static void
 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t parity_words =
 	    rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 		uint64_t col_words = rr->rr_col[c].rc_size / sizeof (p[0]);
 
 		if (c == rr->rr_firstdatacol) {
 			ASSERT(col_words == parity_words || col_words == 0);
 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
 
 			/* Zero whatever the first column did not fill. */
 			for (uint64_t w = col_words; w < parity_words; w++) {
 				p[w] = 0;
 				q[w] = 0;
 			}
 			continue;
 		}
 
 		struct pqr_struct pqr = { p, q, NULL };
 		uint64_t mask;
 
 		ASSERT(col_words <= parity_words);
 		(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 		    vdev_raidz_pq_func, &pqr);
 
 		/*
 		 * A short column is treated as zero-filled: XORing zero
 		 * leaves P untouched, so only Q needs its multiply step.
 		 */
 		for (uint64_t w = col_words; w < parity_words; w++) {
 			VDEV_RAIDZ_64MUL_2(q[w], mask);
 		}
 	}
 }
 
 /*
  * Generate triple (P, Q and R) parity for a row.
  *
  * The first data column is copied into P, Q and R (zero-padding them if that
  * column is empty).  Each later data column is folded in through
  * vdev_raidz_pqr_func(); for the words a short column does not cover, Q and
  * R are still advanced as if the column held zeros.
  */
 static void
 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
 {
 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
 	uint64_t parity_words =
 	    rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 		uint64_t col_words = rr->rr_col[c].rc_size / sizeof (p[0]);
 
 		if (c == rr->rr_firstdatacol) {
 			ASSERT(col_words == parity_words || col_words == 0);
 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
 
 			/* Zero whatever the first column did not fill. */
 			for (uint64_t w = col_words; w < parity_words; w++) {
 				p[w] = 0;
 				q[w] = 0;
 				r[w] = 0;
 			}
 			continue;
 		}
 
 		struct pqr_struct pqr = { p, q, r };
 		uint64_t mask;
 
 		ASSERT(col_words <= parity_words);
 		(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
 		    vdev_raidz_pqr_func, &pqr);
 
 		/*
 		 * A short column is treated as zero-filled: XORing zero
 		 * leaves P untouched, so only Q and R need their multiply
 		 * steps.
 		 */
 		for (uint64_t w = col_words; w < parity_words; w++) {
 			VDEV_RAIDZ_64MUL_2(q[w], mask);
 			VDEV_RAIDZ_64MUL_4(r[w], mask);
 		}
 	}
 }
 
 /*
  * Generate RAID parity in the first virtual columns according to the number of
  * parity columns available.
  */
 void
 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
 {
 	/*
 	 * We are handling this block one row at a time (because
 	 * this block has a different logical vs physical width,
 	 * due to RAIDZ expansion), and this is a pad-only row,
 	 * which has no parity.
 	 */
 	if (rr->rr_cols == 0)
 		return;
 
 	/*
 	 * Try the new math implementation first; anything other than
 	 * RAIDZ_ORIGINAL_IMPL means it has already generated the parity.
 	 */
 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	/* Fall back to the original routines, chosen by parity count. */
 	if (rr->rr_firstdatacol == 1) {
 		vdev_raidz_generate_parity_p(rr);
 	} else if (rr->rr_firstdatacol == 2) {
 		vdev_raidz_generate_parity_pq(rr);
 	} else if (rr->rr_firstdatacol == 3) {
 		vdev_raidz_generate_parity_pqr(rr);
 	} else {
 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 	}
 }
 
 /*
  * Generate parity for every row of the map, one row at a time.
  */
 void
 vdev_raidz_generate_parity(raidz_map_t *rm)
 {
 	for (int row = 0; row < rm->rm_nrows; row++)
 		vdev_raidz_generate_parity_row(rm, rm->rm_row[row]);
 }
 
 /*
  * Two-buffer abd iteration callback: XOR `size` bytes of sbuf into dbuf, one
  * 64-bit word at a time.  Any trailing bytes that do not make up a whole
  * word are left untouched.  `private` is unused.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *dst = dbuf;
 	uint64_t *src = sbuf;
 	int remaining = size / sizeof (src[0]);
 
 	while (remaining-- > 0)
 		*dst++ ^= *src++;
 
 	return (0);
 }
 
 /*
  * Two-buffer abd iteration callback used while rebuilding a column from Q:
  * each 64-bit word of dbuf is passed through VDEV_RAIDZ_64MUL_2() and then
  * XORed with the matching word of sbuf.  `private` is unused.  Always
  * returns 0.
  */
 static int
 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
     void *private)
 {
 	(void) private;
 	uint64_t *dst = dbuf;
 	uint64_t *src = sbuf;
 	uint64_t mask;
 	int nwords = size / sizeof (dst[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		VDEV_RAIDZ_64MUL_2(dst[w], mask);
 		dst[w] ^= src[w];
 	}
 
 	return (0);
 }
 
 /*
  * Single-buffer counterpart of vdev_raidz_reconst_q_pre_func() for the part
  * of the destination that has no matching source data: each 64-bit word is
  * only passed through VDEV_RAIDZ_64MUL_2().  `private` is unused.  Always
  * returns 0.
  */
 static int
 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 {
 	(void) private;
 	uint64_t *dst = buf;
 	uint64_t mask;
 	int nwords = size / sizeof (dst[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 		VDEV_RAIDZ_64MUL_2(dst[w], mask);
 	}
 
 	return (0);
 }
 
 /*
  * State handed (as the `private` argument) to
  * vdev_raidz_reconst_q_post_func().
  */
 struct reconst_q_struct {
 	uint64_t *q;	/* read cursor into Q; advanced by the callback */
 	int exp;	/* exponent passed to vdev_raidz_exp2() for each byte */
 };
 
 /*
  * abd iteration callback for the last step of rebuilding a column from Q:
  * each 64-bit word of the buffer is XORed with the next word of Q, and then
  * each of its eight bytes is replaced by vdev_raidz_exp2(byte, exp).
  * `private` is a struct reconst_q_struct whose `q` cursor is advanced past
  * the words consumed.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 {
 	struct reconst_q_struct *rq = private;
 	uint64_t *dst = buf;
 	int nwords = size / sizeof (dst[0]);
 
 	for (int w = 0; w < nwords; w++) {
 		uint8_t *bytes = (uint8_t *)&dst[w];
 
 		dst[w] ^= *rq->q;
 		rq->q++;
 
 		for (int j = 0; j < 8; j++)
 			bytes[j] = vdev_raidz_exp2(bytes[j], rq->exp);
 	}
 
 	return (0);
 }
 
 /*
  * State handed (as the `private` argument) to vdev_raidz_reconst_pq_func()
  * and vdev_raidz_reconst_pq_tail_func().  The four byte cursors are advanced
  * by the callbacks as they consume data.
  */
 struct reconst_pq_struct {
 	uint8_t *p;	/* saved (original) P parity */
 	uint8_t *q;	/* saved (original) Q parity */
 	uint8_t *pxy;	/* P computed with the target columns as zeros */
 	uint8_t *qxy;	/* Q computed with the target columns as zeros */
 	int aexp;	/* exponent applied to (P ^ Pxy) */
 	int bexp;	/* exponent applied to (Q ^ Qxy) */
 };
 
 /*
  * Two-buffer abd iteration callback that rebuilds two data columns byte by
  * byte from P, Q, Pxy and Qxy:
  *
  *	x = exp2(P ^ Pxy, aexp) ^ exp2(Q ^ Qxy, bexp)
  *	y = P ^ Pxy ^ x
  *
  * `private` is a struct reconst_pq_struct; its four cursors are advanced by
  * `size` bytes.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 	uint8_t *yd = ybuf;
 
 	/*
 	 * The index is a size_t so that it is compared against `size` in
 	 * the same (unsigned) type and cannot overflow for large sizes.
 	 */
 	for (size_t i = 0; i < size;
 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
 	}
 
 	return (0);
 }
 
 /*
  * Single-buffer counterpart of vdev_raidz_reconst_pq_func() for the part of
  * column x that extends past the end of column y: only x is computed.
  * `private` is a struct reconst_pq_struct; its four cursors are advanced by
  * `size` bytes.  Always returns 0.
  */
 static int
 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 {
 	struct reconst_pq_struct *rpq = private;
 	uint8_t *xd = xbuf;
 
 	/*
 	 * The index is a size_t so that it is compared against `size` in
 	 * the same (unsigned) type and cannot overflow for large sizes.
 	 */
 	for (size_t i = 0; i < size;
 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 	}
 
 	return (0);
 }
 
 /*
  * Rebuild a single data column (tgts[0]) from P: start from a copy of P and
  * XOR every other data column into it.
  */
 static void
 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
 
 	ASSERT3U(ntgts, ==, 1);
 	ASSERT3U(x, >=, rr->rr_firstdatacol);
 	ASSERT3U(x, <, rr->rr_cols);
 
 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
 
 	abd_t *dst = rr->rr_col[x].rc_abd;
 
 	/* Seed the target column with P ... */
 	abd_copy_from_buf(dst, abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd),
 	    rr->rr_col[x].rc_size);
 
 	/* ... then cancel out each surviving data column. */
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (c == x)
 			continue;
 
 		uint64_t size = MIN(rr->rr_col[x].rc_size,
 		    rr->rr_col[c].rc_size);
 
 		(void) abd_iterate_func2(dst, rr->rr_col[c].rc_abd, 0, 0,
 		    size, vdev_raidz_reconst_p_func, NULL);
 	}
 }
 
 /*
  * Rebuild a single data column (tgts[0]) from Q.
  *
  * The target column is first used as scratch space to accumulate the data
  * columns the same way Q is generated, with the target's own contribution
  * taken as zero.  That result is then combined with the stored Q by
  * vdev_raidz_reconst_q_post_func().
  */
 static void
 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int x = tgts[0];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
 
 	ASSERT(ntgts == 1);
 
 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
 
 	abd_t *dst = rr->rr_col[x].rc_abd;
 	uint64_t xsize = rr->rr_col[x].rc_size;
 
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		abd_t *src = rr->rr_col[c].rc_abd;
 		uint64_t size = 0;
 
 		/* The target column itself contributes nothing. */
 		if (c != x)
 			size = MIN(xsize, rr->rr_col[c].rc_size);
 
 		if (c == rr->rr_firstdatacol) {
 			abd_copy(dst, src, size);
 			if (xsize > size) {
 				abd_zero_off(dst, size, xsize - size);
 			}
 			continue;
 		}
 
 		ASSERT3U(size, <=, xsize);
 		(void) abd_iterate_func2(dst, src, 0, 0, size,
 		    vdev_raidz_reconst_q_pre_func, NULL);
 		(void) abd_iterate_func(dst, size, xsize - size,
 		    vdev_raidz_reconst_q_pre_tail_func, NULL);
 	}
 
 	int exp = 255 - (rr->rr_cols - 1 - x);
 	struct reconst_q_struct rq = {
 	    abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd), exp
 	};
 
 	(void) abd_iterate_func(dst, 0, xsize,
 	    vdev_raidz_reconst_q_post_func, &rq);
 }
 
 /*
  * Rebuild two data columns, x = tgts[0] and y = tgts[1] (x < y), from P and
  * Q.
  *
  * The row's P and Q abds are temporarily replaced by scratch buffers and the
  * sizes of columns x and y are temporarily set to zero, so that
  * vdev_raidz_generate_parity_pq() produces Pxy and Qxy.  The original sizes
  * and parity abds are restored before returning and the scratch buffers are
  * freed.
  */
 static void
 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
 	abd_t *pdata, *qdata;
 	uint64_t xsize, ysize;
 	int x = tgts[0];
 	int y = tgts[1];
 	abd_t *xd, *yd;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
 
 	ASSERT(ntgts == 2);
 	ASSERT(x < y);
 	ASSERT(x >= rr->rr_firstdatacol);
 	ASSERT(y < rr->rr_cols);
 
 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
 
 	/*
 	 * Move the parity data aside -- we're going to compute parity as
 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 	 * reuse the parity generation mechanism without trashing the actual
 	 * parity so we make those columns appear to be full of zeros by
 	 * setting their lengths to zero.
 	 */
 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
 	xsize = rr->rr_col[x].rc_size;
 	ysize = rr->rr_col[y].rc_size;
 
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 	rr->rr_col[x].rc_size = 0;
 	rr->rr_col[y].rc_size = 0;
 
 	vdev_raidz_generate_parity_pq(rr);
 
 	/* Pxy/Qxy are computed; put the real column sizes back. */
 	rr->rr_col[x].rc_size = xsize;
 	rr->rr_col[y].rc_size = ysize;
 
 	p = abd_to_buf(pdata);
 	q = abd_to_buf(qdata);
 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 	xd = rr->rr_col[x].rc_abd;
 	yd = rr->rr_col[y].rc_abd;
 
 	/*
 	 * We now have:
 	 *	Pxy = P + D_x + D_y
 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 	 *
 	 * We can then solve for D_x:
 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
 	 * where
 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 	 *
 	 * With D_x in hand, we can easily solve for D_y:
 	 *	D_y = P + Pxy + D_x
 	 */
 
 	a = vdev_raidz_pow2[255 + x - y];
 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
 	tmp = 255 - vdev_raidz_log2[a ^ 1];
 
 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 
 	ASSERT3U(xsize, >=, ysize);
 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
 
 	/*
 	 * The first ysize bytes yield both columns; whatever remains of
 	 * column x beyond that is handled by the tail callback, which
 	 * continues from the same cursors in rpq.
 	 */
 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
 	    vdev_raidz_reconst_pq_func, &rpq);
 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
 	    vdev_raidz_reconst_pq_tail_func, &rpq);
 
 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
 
 	/*
 	 * Restore the saved parity data.
 	 */
 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 }
 
 /*
  * In the general case of reconstruction, we must solve the system of linear
  * equations defined by the coefficients used to generate parity as well as
  * the contents of the data and parity disks. This can be expressed with
  * vectors for the original data (D) and the actual data (d) and parity (p)
  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
  *
  *            __   __                     __     __
  *            |     |         __     __   |  p_0  |
  *            |  V  |         |  D_0  |   | p_m-1 |
  *            |     |    x    |   :   | = |  d_0  |
  *            |  I  |         | D_n-1 |   |   :   |
  *            |     |         ~~     ~~   | d_n-1 |
  *            ~~   ~~                     ~~     ~~
  *
  * I is simply a square identity matrix of size n, and V is a vandermonde
  * matrix defined by the coefficients we chose for the various parity columns
  * (1, 2, 4). Note that these values were chosen for simplicity, speedy
  * computation, and linear separability.
  *
  *      __               __               __     __
  *      |   1   ..  1 1 1 |               |  p_0  |
  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
  *      |   :       : : : |   |   :   |   |  d_2  |
  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
  *      |   0   ..  0 0 1 |               | d_n-1 |
  *      ~~               ~~               ~~     ~~
  *
  * Note that I, V, d, and p are known. To compute D, we must invert the
  * matrix and use the known data and parity values to reconstruct the unknown
  * data values. We begin by removing the rows in V|I and d|p that correspond
  * to failed or missing columns; we then make V|I square (n x n) and d|p
  * sized n by removing rows corresponding to unused parity from the bottom up
  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
  * using Gauss-Jordan elimination. In the example below we use m=3 parity
  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
  *           |  19 205 116  29  64  16  4   1  |      / /
  *           |  1   0   0   0   0   0   0   0  |     / /
  *           |  0   1   0   0   0   0   0   0  | <--' /
  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
  *           |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
  *           |  0   0   0   0   0   0   0   1  |
  *           ~~                               ~~
  *
  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
  * matrix is not singular.
  * __                                                                 __
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  * __                                                                 __
  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
  * ~~                                                                 ~~
  *                   __                               __
  *                   |  0   0   1   0   0   0   0   0  |
  *                   | 167 100  5   41 159 169 217 208 |
  *                   | 166 100  4   40 158 168 216 209 |
  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
  *                   |  0   0   0   0   1   0   0   0  |
  *                   |  0   0   0   0   0   1   0   0  |
  *                   |  0   0   0   0   0   0   1   0  |
  *                   |  0   0   0   0   0   0   0   1  |
  *                   ~~                               ~~
  *
  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
  * of the missing data.
  *
  * As is apparent from the example above, the only non-trivial rows in the
  * inverse matrix correspond to the data disks that we're trying to
  * reconstruct. Indeed, those are the only rows we need as the others would
  * only be useful for reconstructing data known or assumed to be valid. For
  * that reason, we only build the coefficients in the rows that correspond to
  * targeted columns.
  */
 
 static void
 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
     uint8_t **rows)
 {
 	int i, j;
 	int pow;
 
 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
 
 	/*
 	 * Fill in the missing rows of interest.
 	 */
 	for (i = 0; i < nmap; i++) {
 		ASSERT3S(0, <=, map[i]);
 		ASSERT3S(map[i], <=, 2);
 
 		pow = map[i] * n;
 		if (pow > 255)
 			pow -= 255;
 		ASSERT(pow <= 255);
 
 		for (j = 0; j < n; j++) {
 			pow -= map[i];
 			if (pow < 0)
 				pow += 255;
 			rows[i][j] = vdev_raidz_pow2[pow];
 		}
 	}
 }
 
 static void
 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, ii, jj;
 	uint8_t log;
 
 	/*
 	 * Assert that the first nmissing entries from the array of used
 	 * columns correspond to parity columns and that subsequent entries
 	 * correspond to data columns.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
 	}
 	for (; i < n; i++) {
 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
 	}
 
 	/*
 	 * First initialize the storage where we'll compute the inverse rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			invrows[i][j] = (i == j) ? 1 : 0;
 		}
 	}
 
 	/*
 	 * Subtract all trivial rows from the rows of consequence.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = nmissing; j < n; j++) {
 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
 			jj = used[j] - rr->rr_firstdatacol;
 			ASSERT3S(jj, <, n);
 			invrows[i][j] = rows[i][jj];
 			rows[i][jj] = 0;
 		}
 	}
 
 	/*
 	 * For each of the rows of interest, we must normalize it and subtract
 	 * a multiple of it from the other rows.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < missing[i]; j++) {
 			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
 		/*
 		 * Compute the inverse of the first element and multiply each
 		 * element in the row by that value.
 		 */
 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
 
 		for (j = 0; j < n; j++) {
 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
 		}
 
 		for (ii = 0; ii < nmissing; ii++) {
 			if (i == ii)
 				continue;
 
 			ASSERT3U(rows[ii][missing[i]], !=, 0);
 
 			log = vdev_raidz_log2[rows[ii][missing[i]]];
 
 			for (j = 0; j < n; j++) {
 				rows[ii][j] ^=
 				    vdev_raidz_exp2(rows[i][j], log);
 				invrows[ii][j] ^=
 				    vdev_raidz_exp2(invrows[i][j], log);
 			}
 		}
 	}
 
 	/*
 	 * Verify that the data that is left in the rows are properly part of
 	 * an identity matrix.
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
 				ASSERT0(rows[i][j]);
 			}
 		}
 	}
 }
 
 static void
 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
     int *missing, uint8_t **invrows, const uint8_t *used)
 {
 	int i, j, x, cc, c;
 	uint8_t *src;
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
 	uint8_t log = 0;
 	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 
 	psize = sizeof (invlog[0][0]) * n * nmissing;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing; i++) {
 		invlog[i] = pp;
 		pp += n;
 	}
 
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < n; j++) {
 			ASSERT3U(invrows[i][j], !=, 0);
 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
 		}
 	}
 
 	for (i = 0; i < n; i++) {
 		c = used[i];
 		ASSERT3U(c, <, rr->rr_cols);
 
 		ccount = rr->rr_col[c].rc_size;
 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
 		if (ccount == 0)
 			continue;
 		src = abd_to_buf(rr->rr_col[c].rc_abd);
 		for (j = 0; j < nmissing; j++) {
 			cc = missing[j] + rr->rr_firstdatacol;
 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
 			ASSERT3U(cc, <, rr->rr_cols);
 			ASSERT3U(cc, !=, c);
 
 			dcount[j] = rr->rr_col[cc].rc_size;
 			if (dcount[j] != 0)
 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
 		}
 
 		for (x = 0; x < ccount; x++, src++) {
 			if (*src != 0)
 				log = vdev_raidz_log2[*src];
 
 			for (cc = 0; cc < nmissing; cc++) {
 				if (x >= dcount[cc])
 					continue;
 
 				if (*src == 0) {
 					val = 0;
 				} else {
 					if ((ll = log + invlog[cc][i]) >= 255)
 						ll -= 255;
 					val = vdev_raidz_pow2[ll];
 				}
 
 				if (i == 0)
 					dst[cc][x] = val;
 				else
 					dst[cc][x] ^= val;
 			}
 		}
 	}
 
 	kmem_free(p, psize);
 }
 
 static void
 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
 {
 	int i, c, t, tt;
 	unsigned int n;
 	unsigned int nmissing_rows;
 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
 	int parity_map[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
 	size_t psize;
 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *used;
 
 	abd_t **bufs = NULL;
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
 	/*
 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
 	 * temporary linear ABDs if any non-linear ABDs are found.
 	 */
 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
 		ASSERT(rr->rr_col[i].rc_abd != NULL);
 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
 			    KM_PUSHPAGE);
 
 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 				raidz_col_t *col = &rr->rr_col[c];
 
 				bufs[c] = col->rc_abd;
 				if (bufs[c] != NULL) {
 					col->rc_abd = abd_alloc_linear(
 					    col->rc_size, B_TRUE);
 					abd_copy(col->rc_abd, bufs[c],
 					    col->rc_size);
 				}
 			}
 
 			break;
 		}
 	}
 
 	n = rr->rr_cols - rr->rr_firstdatacol;
 
 	/*
 	 * Figure out which data columns are missing.
 	 */
 	nmissing_rows = 0;
 	for (t = 0; t < ntgts; t++) {
 		if (tgts[t] >= rr->rr_firstdatacol) {
 			missing_rows[nmissing_rows++] =
 			    tgts[t] - rr->rr_firstdatacol;
 		}
 	}
 
 	/*
 	 * Figure out which parity columns to use to help generate the missing
 	 * data columns.
 	 */
 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
 		ASSERT(tt < ntgts);
 		ASSERT(c < rr->rr_firstdatacol);
 
 		/*
 		 * Skip any targeted parity columns.
 		 */
 		if (c == tgts[tt]) {
 			tt++;
 			continue;
 		}
 
 		parity_map[i] = c;
 		i++;
 	}
 
 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
 	    nmissing_rows * n + sizeof (used[0]) * n;
 	p = kmem_alloc(psize, KM_SLEEP);
 
 	for (pp = p, i = 0; i < nmissing_rows; i++) {
 		rows[i] = pp;
 		pp += n;
 		invrows[i] = pp;
 		pp += n;
 	}
 	used = pp;
 
 	for (i = 0; i < nmissing_rows; i++) {
 		used[i] = parity_map[i];
 	}
 
 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (tt < nmissing_rows &&
 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
 			tt++;
 			continue;
 		}
 
 		ASSERT3S(i, <, n);
 		used[i] = c;
 		i++;
 	}
 
 	/*
 	 * Initialize the interesting rows of the matrix.
 	 */
 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
 
 	/*
 	 * Invert the matrix.
 	 */
 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
 	    invrows, used);
 
 	/*
 	 * Reconstruct the missing data using the generated matrix.
 	 */
 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
 	    invrows, used);
 
 	kmem_free(p, psize);
 
 	/*
 	 * copy back from temporary linear abds and free them
 	 */
 	if (bufs) {
 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 			raidz_col_t *col = &rr->rr_col[c];
 
 			if (bufs[c] != NULL) {
 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
 				abd_free(col->rc_abd);
 			}
 			col->rc_abd = bufs[c];
 		}
 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
 	}
 }
 
 static void
 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
     const int *t, int nt)
 {
 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
 	int ntgts;
 	int i, c, ret;
 	int nbadparity, nbaddata;
 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
 
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
 		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
 		    (int)rr->rr_missingparity);
 	}
 
 	nbadparity = rr->rr_firstdatacol;
 	nbaddata = rr->rr_cols - nbadparity;
 	ntgts = 0;
 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
 		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
 			    "offset=%llx error=%u)",
 			    rr, c, (int)rr->rr_col[c].rc_devidx,
 			    (long long)rr->rr_col[c].rc_offset,
 			    (int)rr->rr_col[c].rc_error);
 		}
 		if (c < rr->rr_firstdatacol)
 			parity_valid[c] = B_FALSE;
 
 		if (i < nt && c == t[i]) {
 			tgts[ntgts++] = c;
 			i++;
 		} else if (rr->rr_col[c].rc_error != 0) {
 			tgts[ntgts++] = c;
 		} else if (c >= rr->rr_firstdatacol) {
 			nbaddata--;
 		} else {
 			parity_valid[c] = B_TRUE;
 			nbadparity--;
 		}
 	}
 
 	ASSERT(ntgts >= nt);
 	ASSERT(nbaddata >= 0);
 	ASSERT(nbaddata + nbadparity == ntgts);
 
 	dt = &tgts[nbadparity];
 
 	/* Reconstruct using the new math implementation */
 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
 	if (ret != RAIDZ_ORIGINAL_IMPL)
 		return;
 
 	/*
 	 * See if we can use any of our optimized reconstruction routines.
 	 */
 	switch (nbaddata) {
 	case 1:
 		if (parity_valid[VDEV_RAIDZ_P]) {
 			vdev_raidz_reconstruct_p(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_q(rr, dt, 1);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 		break;
 
 	case 2:
 		ASSERT(rr->rr_firstdatacol > 1);
 
 		if (parity_valid[VDEV_RAIDZ_P] &&
 		    parity_valid[VDEV_RAIDZ_Q]) {
 			vdev_raidz_reconstruct_pq(rr, dt, 2);
 			return;
 		}
 
 		ASSERT(rr->rr_firstdatacol > 2);
 
 		break;
 	}
 
 	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
 }
 
 static int
 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t nparity = vdrz->vd_nparity;
 	int c;
 	int lasterror = 0;
 	int numerrors = 0;
 
 	ASSERT(nparity > 0);
 
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error != 0) {
 			lasterror = cvd->vdev_open_error;
 			numerrors++;
 			continue;
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
 	}
 	for (c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error != 0)
 			continue;
 		*physical_ashift = vdev_best_ashift(*logical_ashift,
 		    *physical_ashift, cvd->vdev_physical_ashift);
 	}
 
 	if (vd->vdev_rz_expanding) {
 		*asize *= vd->vdev_children - 1;
 		*max_asize *= vd->vdev_children - 1;
 
 		vd->vdev_min_asize = *asize;
 	} else {
 		*asize *= vd->vdev_children;
 		*max_asize *= vd->vdev_children;
 	}
 
 	if (numerrors > nparity) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (lasterror);
 	}
 
 	return (0);
 }
 
 static void
 vdev_raidz_close(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c] != NULL)
 			vdev_close(vd->vdev_child[c]);
 	}
 }
 
 /*
  * Return the logical width to use, given the txg in which the allocation
  * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
  * BP was allocated.  Remapped BP's (that were relocated due to device
  * removal, see remap_blkptr_cb()), will have a more recent physical birth
  * which reflects when the BP was relocated, but we can ignore these because
  * they can't be on RAIDZ (device removal doesn't support RAIDZ).
  */
 static uint64_t
 vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
 {
 	reflow_node_t lookup = {
 		.re_txg = txg,
 	};
 	avl_index_t where;
 
 	uint64_t width;
 	mutex_enter(&vdrz->vd_expand_lock);
 	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
 	if (re != NULL) {
 		width = re->re_logical_width;
 	} else {
 		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
 		if (re != NULL)
 			width = re->re_logical_width;
 		else
 			width = vdrz->vd_original_width;
 	}
 	mutex_exit(&vdrz->vd_expand_lock);
 	return (width);
 }
+/*
+ * This code converts an asize into the largest psize that can safely be written
+ * to an allocation of that size for this vdev.
+ *
+ * Note that this function will not take into account the effect of gang
+ * headers, which also modify the ASIZE of the DVAs. It is purely a reverse of
+ * the psize_to_asize function.
+ */
+static uint64_t
+vdev_raidz_asize_to_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
+{
+	vdev_raidz_t *vdrz = vd->vdev_tsd;
+	uint64_t psize;
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t nparity = vdrz->vd_nparity;
+	/* Use the logical width that applies to allocations made in txg. */
+	uint64_t cols = vdev_raidz_get_logical_width(vdrz, txg);
+
+	ASSERT0(asize % (1ULL << ashift));
+
+	/* In sectors: drop nparity sectors per (partial) row of cols. */
+	psize = (asize >> ashift);
+	psize -= nparity * DIV_ROUND_UP(psize, cols);
+	psize <<= ashift;
+
+	return (psize);
+}
 
 /*
  * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
  * more space due to the lower data-to-parity ratio.  In this case it's
  * important to pass in the correct txg.  Note that vdev_gang_header_asize()
  * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
  * regardless of txg.  This is assured because for a single data sector, we
  * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
  */
 static uint64_t
-vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
+vdev_raidz_psize_to_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	uint64_t asize;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	uint64_t cols = vdrz->vd_original_width;
 	uint64_t nparity = vdrz->vd_nparity;
 
 	cols = vdev_raidz_get_logical_width(vdrz, txg);
 
 	asize = ((psize - 1) >> ashift) + 1;
 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
 	asize = roundup(asize, nparity + 1) << ashift;
 
 #ifdef ZFS_DEBUG
 	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
 	uint64_t ncols_new = vdrz->vd_physical_width;
 	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
 	    (ncols_new - nparity));
 	asize_new = roundup(asize_new, nparity + 1) << ashift;
 	VERIFY3U(asize_new, <=, asize);
 #endif
 
 	return (asize);
 }
 
 /*
  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
  * so each child must provide at least 1/Nth of its asize.
  */
 static uint64_t
 vdev_raidz_min_asize(vdev_t *vd)
 {
 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
 	    vd->vdev_children);
 }
 
 void
 vdev_raidz_child_done(zio_t *zio)
 {
 	raidz_col_t *rc = zio->io_private;
 
 	ASSERT3P(rc->rc_abd, !=, NULL);
 	rc->rc_error = zio->io_error;
 	rc->rc_tried = 1;
 	rc->rc_skipped = 0;
 }
 
 static void
 vdev_raidz_shadow_child_done(zio_t *zio)
 {
 	raidz_col_t *rc = zio->io_private;
 
 	rc->rc_shadow_error = zio->io_error;
 }
 
 static void
 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
 {
 	(void) rm;
 #ifdef ZFS_DEBUG
 	zfs_range_seg64_t logical_rs, physical_rs, remain_rs;
 	logical_rs.rs_start = rr->rr_offset;
 	logical_rs.rs_end = logical_rs.rs_start +
-	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
+	    vdev_raidz_psize_to_asize(zio->io_vd, rr->rr_size,
 	    BP_GET_BIRTH(zio->io_bp));
 
 	raidz_col_t *rc = &rr->rr_col[col];
 	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
 	ASSERT(vdev_xlate_is_empty(&remain_rs));
 	if (vdev_xlate_is_empty(&physical_rs)) {
 		/*
 		 * If we are in the middle of expansion, the
 		 * physical->logical mapping is changing so vdev_xlate()
 		 * can't give us a reliable answer.
 		 */
 		return;
 	}
 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
 	/*
 	 * It would be nice to assert that rs_end is equal
 	 * to rc_offset + rc_size but there might be an
 	 * optional I/O at the end that is not accounted in
 	 * rc_size.
 	 */
 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
 		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
 	} else {
 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
 	}
 #endif
 }
 
 static void
 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 	raidz_map_t *rm = zio->io_vsd;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 		/* Verify physical to logical translation */
 		vdev_raidz_io_verify(zio, rm, rr, c);
 
 		if (rc->rc_size == 0)
 			continue;
 
 		ASSERT3U(rc->rc_offset + rc->rc_size, <,
 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 		ASSERT3P(rc->rc_abd, !=, NULL);
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 		    rc->rc_offset, rc->rc_abd,
 		    abd_get_size(rc->rc_abd), zio->io_type,
 		    zio->io_priority, 0, vdev_raidz_child_done, rc));
 
 		if (rc->rc_shadow_devidx != INT_MAX) {
 			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
 
 			ASSERT3U(
 			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
 			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
 			    rc->rc_shadow_offset, rc->rc_abd,
 			    abd_get_size(rc->rc_abd),
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_shadow_child_done, rc));
 		}
 	}
 }
 
 /*
  * Generate optional I/Os for skip sectors to improve aggregation contiguity.
  * This only works for vdev_raidz_map_alloc() (not _expanded()).
  */
 static void
 raidz_start_skip_writes(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t ashift = vd->vdev_top->vdev_ashift;
 	raidz_map_t *rm = zio->io_vsd;
 	ASSERT3U(rm->rm_nrows, ==, 1);
 	raidz_row_t *rr = rm->rm_row[0];
 	for (int c = 0; c < rr->rr_scols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		if (rc->rc_size != 0)
 			continue;
 		ASSERT3P(rc->rc_abd, ==, NULL);
 
 		ASSERT3U(rc->rc_offset, <,
 		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
 
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
 		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 	}
 }
 
 static void
 vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
 {
 	vdev_t *vd = zio->io_vd;
 
 	/*
 	 * Iterate over the columns in reverse order so that we hit the parity
 	 * last -- any errors along the way will force us to read the parity.
 	 */
 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
 		raidz_col_t *rc = &rr->rr_col[c];
 		if (rc->rc_size == 0)
 			continue;
 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 		if (!vdev_readable(cvd)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;	/* don't even try */
 			rc->rc_skipped = 1;
 			continue;
 		}
 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 			if (c >= rr->rr_firstdatacol)
 				rr->rr_missingdata++;
 			else
 				rr->rr_missingparity++;
 			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 			continue;
 		}
 		if (forceparity ||
 		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, rc));
 		}
 	}
 }
 
 static void
 vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
 {
 	vdev_t *vd = zio->io_vd;
 
 	for (int i = 0; i < rm->rm_nphys_cols; i++) {
 		raidz_col_t *prc = &rm->rm_phys_col[i];
 		if (prc->rc_size == 0)
 			continue;
 
 		ASSERT3U(prc->rc_devidx, ==, i);
 		vdev_t *cvd = vd->vdev_child[i];
 		if (!vdev_readable(cvd)) {
 			prc->rc_error = SET_ERROR(ENXIO);
 			prc->rc_tried = 1;	/* don't even try */
 			prc->rc_skipped = 1;
 			continue;
 		}
 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
 			prc->rc_error = SET_ERROR(ESTALE);
 			prc->rc_skipped = 1;
 			continue;
 		}
 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 		    prc->rc_offset, prc->rc_abd, prc->rc_size,
 		    zio->io_type, zio->io_priority, 0,
 		    vdev_raidz_child_done, prc));
 	}
 }
 
 static void
 vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
 {
 	/*
 	 * If there are multiple rows, we will be hitting
 	 * all disks, so go ahead and read the parity so
 	 * that we are reading in decent size chunks.
 	 */
 	boolean_t forceparity = rm->rm_nrows > 1;
 
 	if (rm->rm_phys_col) {
 		vdev_raidz_io_start_read_phys_cols(zio, rm);
 	} else {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_start_read_row(zio, rr, forceparity);
 		}
 	}
 }
 
 /*
  * Start an IO operation on a RAIDZ VDev
  *
  * Outline:
  * - For write operations:
  *   1. Generate the parity data
  *   2. Create child zio write operations to each column's vdev, for both
  *      data and parity.
  *   3. If the column skips any sectors for padding, create optional dummy
  *      write zio children for those areas to improve aggregation continuity.
  * - For read operations:
  *   1. Create child zio read operations to each data column's vdev to read
  *      the range of data required for zio.
  *   2. If this is a scrub or resilver operation, or if any of the data
  *      vdevs have had errors, then create zio read operations to the parity
  *      columns' VDevs as well.
  */
 static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_t *tvd = vd->vdev_top;
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	raidz_map_t *rm;
 
 	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
 	    BP_GET_BIRTH(zio->io_bp));
 	if (logical_width != vdrz->vd_physical_width) {
 		zfs_locked_range_t *lr = NULL;
 		uint64_t synced_offset = UINT64_MAX;
 		uint64_t next_offset = UINT64_MAX;
 		boolean_t use_scratch = B_FALSE;
 		/*
 		 * Note: when the expansion is completing, we set
 		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
 		 * in a later txg than when we last update spa_ubsync's state
 		 * (see the end of spa_raidz_expand_thread()).  Therefore we
 		 * may see vre_state!=SCANNING before
 		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
 		 * on disk, but the copying progress has been synced to disk
 		 * (and reflected in spa_ubsync).  In this case it's fine to
 		 * treat the expansion as completed, since if we crash there's
 		 * no additional copying to do.
 		 */
 		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
 			    &vdrz->vn_vre);
 			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
 			    zio->io_offset, zio->io_size, RL_READER);
 			use_scratch =
 			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
 			    RRSS_SCRATCH_VALID);
 			synced_offset =
 			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
 			next_offset = vdrz->vn_vre.vre_offset;
 			/*
 			 * If we haven't resumed expanding since importing the
 			 * pool, vre_offset won't have been set yet.  In
 			 * this case the next offset to be copied is the same
 			 * as what was synced.
 			 */
 			if (next_offset == UINT64_MAX) {
 				next_offset = synced_offset;
 			}
 		}
 		if (use_scratch) {
 			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
 			    "%lld next_offset=%lld use_scratch=%u",
 			    zio,
 			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
 			    (long long)zio->io_offset,
 			    (long long)synced_offset,
 			    (long long)next_offset,
 			    use_scratch);
 		}
 
 		rm = vdev_raidz_map_alloc_expanded(zio,
 		    tvd->vdev_ashift, vdrz->vd_physical_width,
 		    logical_width, vdrz->vd_nparity,
 		    synced_offset, next_offset, use_scratch);
 		rm->rm_lr = lr;
 	} else {
 		rm = vdev_raidz_map_alloc(zio,
 		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
 	}
 	rm->rm_original_width = vdrz->vd_original_width;
 
 	zio->io_vsd = rm;
 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
 		}
 
 		if (logical_width == vdrz->vd_physical_width) {
 			raidz_start_skip_writes(zio);
 		}
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_READ);
 		vdev_raidz_io_start_read(zio, rm);
 	}
 
 	zio_execute(zio);
 }
 
 /*
  * Report a checksum error for a child of a RAID-Z device.
  */
 void
 vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
 {
 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 
 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
 		zio_bad_cksum_t zbc;
 		raidz_map_t *rm = zio->io_vsd;
 
 		zbc.zbc_has_cksum = 0;
 		zbc.zbc_injected = rm->rm_ecksuminjected;
 
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_checksum_errors++;
 		mutex_exit(&vd->vdev_stat_lock);
 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
 		    rc->rc_abd, bad_data, &zbc);
 	}
 }
 
 /*
  * We keep track of whether or not there were any injected errors, so that
  * any ereports we generate can note it.
  */
 static int
 raidz_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t zbc = {0};
 	raidz_map_t *rm = zio->io_vsd;
 
 	int ret = zio_checksum_error(zio, &zbc);
 	/*
 	 * Any Direct I/O read that has a checksum error must be treated as
 	 * suspicious as the contents of the buffer could be getting
 	 * manipulated while the I/O is taking place. The checksum verify error
 	 * will be reported to the top-level RAIDZ VDEV.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_READ && ret == ECKSUM) {
 		zio->io_error = ret;
 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 		zio_dio_chksum_verify_error_report(zio);
 		zio_checksum_verified(zio);
 		return (0);
 	}
 
 	if (ret != 0 && zbc.zbc_injected != 0)
 		rm->rm_ecksuminjected = 1;
 
 	return (ret);
 }
 
 /*
  * Generate the parity from the data columns. If we tried and were able to
  * read the parity without error, verify that the generated parity matches the
  * data we read. If it doesn't, we fire off a checksum error. Return the
  * number of such failures.
  */
 static int
 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
 {
 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
 	int c, ret = 0;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
 
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
 
 	if (checksum == ZIO_CHECKSUM_NOPARITY)
 		return (ret);
 
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		rc = &rr->rr_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		orig[c] = rc->rc_abd;
 		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
 		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
 	}
 
 	/*
 	 * Verify any empty sectors are zero filled to ensure the parity
 	 * is calculated correctly even if these non-data sectors are damaged.
 	 */
 	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
 		ret += vdev_draid_map_verify_empty(zio, rr);
 
 	/*
 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
 	 * isn't harmful but it does have the side effect of fixing stuff
 	 * we didn't realize was necessary (i.e. even if we return 0).
 	 */
 	vdev_raidz_generate_parity_row(rm, rr);
 
 	for (c = 0; c < rr->rr_firstdatacol; c++) {
 		rc = &rr->rr_col[c];
 
 		if (!rc->rc_tried || rc->rc_error != 0)
 			continue;
 
 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
 			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
 			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
 			vdev_raidz_checksum_error(zio, rc, orig[c]);
 			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
 		}
 		abd_free(orig[c]);
 	}
 
 	return (ret);
 }
 
 static int
 vdev_raidz_worst_error(raidz_row_t *rr)
 {
 	int error = 0;
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
 		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
 	}
 
 	return (error);
 }
 
 /*
  * Post-processing of one row of a read.  The callers in this file invoke
  * it only after raidz_checksum_verify() has returned 0.  It counts the
  * errors recorded on the row, optionally re-checks parity, and then issues
  * repair writes to damaged columns and to shadow locations.
  *
  * zio - the raidz read zio (asserted to be ZIO_TYPE_READ)
  * rr  - the row to examine and, if needed, repair
  */
 static void
 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
 {
 	int unexpected_errors = 0;
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int data_errors = 0;
 
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
 
 	/*
 	 * Classify each column: errors are split into parity vs. data, and
 	 * an error on a column that was not skipped counts as unexpected.
 	 * Parity columns with no error that were never tried are tallied
 	 * separately.  A column flagged rc_force_repair always counts as an
 	 * unexpected error so that the repair pass below runs.
 	 */
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error) {
 			if (c < rr->rr_firstdatacol)
 				parity_errors++;
 			else
 				data_errors++;
 
 			if (!rc->rc_skipped)
 				unexpected_errors++;
 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
 			parity_untried++;
 		}
 
 		if (rc->rc_force_repair)
 			unexpected_errors++;
 	}
 
 	/*
 	 * If we read more parity disks than were used for
 	 * reconstruction, confirm that the other parity disks produced
 	 * correct data.
 	 *
 	 * Note that we also regenerate parity when resilvering so we
 	 * can write it out to failed devices later.
 	 */
 	if (parity_errors + parity_untried <
 	    rr->rr_firstdatacol - data_errors ||
 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
 		/* The value returned by the verify is added to the tally. */
 		int n = raidz_parity_verify(zio, rr);
 		unexpected_errors += n;
 	}
 
 	/*
 	 * Repair pass: only when the zio itself has no error, the pool is
 	 * writeable, and either something unexpected was seen or this is a
 	 * resilver.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
 		/*
 		 * Use the good data we have in hand to repair damaged children.
 		 */
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
 
 			/*
 			 * Skip columns that may not be repaired, and columns
 			 * that are neither forced nor in error (or are empty).
 			 */
 			if (!rc->rc_allow_repair) {
 				continue;
 			} else if (!rc->rc_force_repair &&
 			    (rc->rc_error == 0 || rc->rc_size == 0)) {
 				continue;
 			}
 			/*
 			 * We do not allow self healing for Direct I/O reads.
 			 * See comment in vdev_raid_row_alloc().
 			 */
 			ASSERT0(zio->io_flags & ZIO_FLAG_DIO_READ);
 
 			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
 			    "offset=%llx",
 			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
 
 			/*
 			 * Rewrite the column in place.  The write keeps the
 			 * REBUILD priority if the read had it, and is tagged
 			 * SELF_HEAL only when unexpected errors were counted.
 			 */
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE,
 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
 	}
 
 	/*
 	 * Scrub or resilver i/o's: overwrite any shadow locations with the
 	 * good data.  This ensures that if we've already copied this sector,
 	 * it will be corrected if it was damaged.  This writes more than is
 	 * necessary, but since expansion is paused during scrub/resilver, at
 	 * most a single row will have a shadow location.
 	 */
 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
 	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			vdev_t *vd = zio->io_vd;
 
 			/* INT_MAX marks a column without a shadow location. */
 			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
 				continue;
 			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
 
 			/*
 			 * Note: We don't want to update the repair stats
 			 * because that would incorrectly indicate that there
 			 * was bad data to repair, which we aren't sure about.
 			 * By clearing the SCAN_THREAD flag, we prevent this
 			 * from happening, despite having the REPAIR flag set.
 			 * We need to set SELF_HEAL so that this i/o can't be
 			 * bypassed by zio_vdev_io_start().
 			 */
 			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
 			    NULL, NULL);
 			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
 			zio_nowait(cio);
 		}
 	}
 }
 
 /*
  * For every column in the map that is flagged rc_need_orig_restore, copy
  * the saved rc_orig_data back over rc_abd and clear the flag.
  */
 static void
 raidz_restore_orig_data(raidz_map_t *rm)
 {
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *row = rm->rm_row[r];
 
 		for (int c = 0; c < row->rr_cols; c++) {
 			raidz_col_t *col = &row->rr_col[c];
 
 			if (!col->rc_need_orig_restore)
 				continue;
 
 			abd_copy(col->rc_abd, col->rc_orig_data, col->rc_size);
 			col->rc_need_orig_restore = B_FALSE;
 		}
 	}
 }
 
 /*
  * During raidz_reconstruct() for an expanded VDEV, failure simulations need
  * special consideration.  See the note in raidz_reconstruct() on simulating
  * failure of a pre-expansion device.
  *
  * Treating logical child i as failed, return TRUE if the given column should
  * be treated as failed.  The idea of logical children allows us to imagine
  * that a disk silently failed before a RAIDZ expansion (reads from this disk
  * succeed but return the wrong data).  Since the expansion doesn't verify
  * checksums, the incorrect data will be moved to new locations spread among
  * the children (going diagonally across them).
  *
  * Higher "logical child failures" (values of `i`) indicate these
  * "pre-expansion failures".  The first physical_width values imagine that a
  * current child failed; the next physical_width-1 values imagine that a
  * child failed before the most recent expansion; the next physical_width-2
  * values imagine a child failed in the expansion before that, etc.
  */
 static boolean_t
 raidz_simulate_failure(int physical_width, int original_width, int ashift,
     int i, raidz_col_t *rc)
 {
 	/* Sector number of this column, counted across the current width. */
 	uint64_t sector_id =
 	    physical_width * (rc->rc_offset >> ashift) + rc->rc_devidx;
 	int logical = i;
 
 	/*
 	 * Walk the widths from the current one down to the original one.
 	 * Each width owns `width` consecutive logical child ids; skip past
 	 * whole widths until the requested id falls inside one of them.
 	 */
 	for (int width = physical_width; width >= original_width; width--) {
 		if (logical >= width) {
 			logical -= width;
 			continue;
 		}
 		return (sector_id % width == logical);
 	}
 
 	/* The id was larger than the total number of logical children. */
 	ASSERT(!"invalid logical child id");
 	return (B_FALSE);
 }
 
 /*
  * Attempt one reconstruction of the whole block, treating the ntgts logical
  * children listed in ltgts[] as failed (see raidz_simulate_failure()) in
  * addition to any columns that already have rc_error set.
  *
  * zio     - the raidz read zio; its io_vsd is the raidz_map_t
  * ltgts   - logical child ids to simulate as failed
  * ntgts   - number of valid entries in ltgts
  * nparity - number of parity columns
  *
  * returns EINVAL if reconstruction of the block will not be possible
  * returns ECKSUM if this specific reconstruction failed
  * returns 0 on successful reconstruction
  *
  * On EINVAL and ECKSUM, columns whose failure was simulated have their
  * data restored from rc_orig_data before returning.
  */
 static int
 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
 {
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
 
 	if (dbgmsg) {
 		/*
 		 * Only ltgts[0 .. ntgts-1] are targets chosen by the caller.
 		 * Print -1 for the unused slots instead of reading entries
 		 * that may never have been initialized.
 		 */
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%d,%d,%d "
 		    "ntgts=%u", zio,
 		    ntgts > 0 ? ltgts[0] : -1,
 		    ntgts > 1 ? ltgts[1] : -1,
 		    ntgts > 2 ? ltgts[2] : -1, ntgts);
 	}
 
 	/* Reconstruct each row */
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *rr = rm->rm_row[r];
 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
 		int t = 0;
 		int dead = 0;
 		int dead_data = 0;
 
 		if (dbgmsg)
 			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			raidz_col_t *rc = &rr->rr_col[c];
 			ASSERT0(rc->rc_need_orig_restore);
 			/* Known failures count as dead but are not targets. */
 			if (rc->rc_error != 0) {
 				dead++;
 				if (c >= nparity)
 					dead_data++;
 				continue;
 			}
 			if (rc->rc_size == 0)
 				continue;
 			for (int lt = 0; lt < ntgts; lt++) {
 				if (raidz_simulate_failure(physical_width,
 				    original_width,
 				    zio->io_vd->vdev_top->vdev_ashift,
 				    ltgts[lt], rc)) {
 					/*
 					 * Save the column's current data once
 					 * so it can be restored if this
 					 * attempt does not pan out.
 					 */
 					if (rc->rc_orig_data == NULL) {
 						rc->rc_orig_data =
 						    abd_alloc_linear(
 						    rc->rc_size, B_TRUE);
 						abd_copy(rc->rc_orig_data,
 						    rc->rc_abd, rc->rc_size);
 					}
 					rc->rc_need_orig_restore = B_TRUE;
 
 					dead++;
 					if (c >= nparity)
 						dead_data++;
 					/*
 					 * Note: simulating failure of a
 					 * pre-expansion device can hit more
 					 * than one column, in which case we
 					 * might try to simulate more failures
 					 * than can be reconstructed, which is
 					 * also more than the size of my_tgts.
 					 * This check prevents accessing past
 					 * the end of my_tgts.  The "dead >
 					 * nparity" check below will fail this
 					 * reconstruction attempt.
 					 */
 					if (t < VDEV_RAIDZ_MAXPARITY) {
 						my_tgts[t++] = c;
 						if (dbgmsg) {
 							zfs_dbgmsg("simulating "
 							    "failure of col %u "
 							    "devidx %u", c,
 							    (int)rc->rc_devidx);
 						}
 					}
 					break;
 				}
 			}
 		}
 		if (dead > nparity) {
 			/* reconstruction not possible */
 			if (dbgmsg) {
 				zfs_dbgmsg("reconstruction not possible; "
 				    "too many failures");
 			}
 			raidz_restore_orig_data(rm);
 			return (EINVAL);
 		}
 		/* Rows that lost only parity need no data reconstruction. */
 		if (dead_data > 0)
 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
 	}
 
 	/* Check for success */
 	if (raidz_checksum_verify(zio) == 0) {
 		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 			return (0);
 
 		/* Reconstruction succeeded - report errors */
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 
 			for (int c = 0; c < rr->rr_cols; c++) {
 				raidz_col_t *rc = &rr->rr_col[c];
 				if (rc->rc_need_orig_restore) {
 					/*
 					 * Note: if this is a parity column,
 					 * we don't really know if it's wrong.
 					 * We need to let
 					 * vdev_raidz_io_done_verified() check
 					 * it, and if we set rc_error, it will
 					 * think that it is a "known" error
 					 * that doesn't need to be checked
 					 * or corrected.
 					 */
 					if (rc->rc_error == 0 &&
 					    c >= rr->rr_firstdatacol) {
 						vdev_raidz_checksum_error(zio,
 						    rc, rc->rc_orig_data);
 						rc->rc_error =
 						    SET_ERROR(ECKSUM);
 					}
 					rc->rc_need_orig_restore = B_FALSE;
 				}
 			}
 
 			vdev_raidz_io_done_verified(zio, rr);
 		}
 
 		zio_checksum_verified(zio);
 
 		if (dbgmsg) {
 			zfs_dbgmsg("reconstruction successful "
 			    "(checksum verified)");
 		}
 		return (0);
 	}
 
 	/* Reconstruction failed - restore original data */
 	raidz_restore_orig_data(rm);
 	if (dbgmsg) {
 		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
 		    "failed", zio);
 	}
 	return (ECKSUM);
 }
 
 /*
  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
  * Note that the algorithm below is non-optimal because it doesn't take into
  * account how reconstruction is actually performed. For example, with
  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
  * is targeted as invalid as if columns 1 and 4 are targeted since in both
  * cases we'd only use parity information in column 0.
  *
  * The order that we find the various possible combinations of failed
  * disks is dictated by these rules:
  * - Examine each "slot" (the "i" in tgts[i])
  *   - Try to increment this slot (tgts[i] += 1)
  *   - if we can't increment because it runs into the next slot,
  *     reset our slot to the minimum, and examine the next slot
  *
  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
  *  3 columns to reconstruct), we will generate the following sequence:
  *
  *  STATE        ACTION
  *  0 1 2        special case: skip since these are all parity
  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
  *  0   2 3      first slot: increment to 1
  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
  *  0 1     4    first: reset to 0; middle: increment to 2
  *  0   2   4    first: increment to 1
  *    1 2   4    first: reset to 0; middle: increment to 3
  *  0     3 4    first: increment to 1
  *    1   3 4    first: increment to 2
  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
  *  0 1       5  first: reset to 0; middle: increment to 2
  *  0   2     5  first: increment to 1
  *    1 2     5  first: reset to 0; middle: increment to 3
  *  0     3   5  first: increment to 1
  *    1   3   5  first: increment to 2
  *      2 3   5  first: reset to 0; middle: increment to 4
  *  0       4 5  first: increment to 1
  *    1     4 5  first: increment to 2
  *      2   4 5  first: increment to 3
  *        3 4 5  done
  *
  * This strategy works for dRAID but is less efficient when there are a large
  * number of child vdevs and therefore permutations to check. Furthermore,
  * since the raidz_map_t rows likely do not overlap, reconstruction would be
  * possible as long as there are no more than nparity data errors per row.
  * These additional permutations are not currently checked but could be as
  * a future improvement.
  *
  * Returns 0 on success, ECKSUM on failure.
  */
 static int
 vdev_raidz_combrec(zio_t *zio)
 {
 	int nparity = vdev_get_nparity(zio->io_vd);
 	raidz_map_t *rm = zio->io_vsd;
 	int physical_width = zio->io_vd->vdev_children;
 	/* An rm_original_width of 0 is treated as "same as current width". */
 	int original_width = (rm->rm_original_width != 0) ?
 	    rm->rm_original_width : physical_width;
 
 	/*
 	 * If any single row already has more known errors than there are
 	 * parity columns, give up immediately and report that row's worst
 	 * error.
 	 */
 	for (int i = 0; i < rm->rm_nrows; i++) {
 		raidz_row_t *rr = rm->rm_row[i];
 		int total_errors = 0;
 
 		for (int c = 0; c < rr->rr_cols; c++) {
 			if (rr->rr_col[c].rc_error)
 				total_errors++;
 		}
 
 		if (total_errors > nparity)
 			return (vdev_raidz_worst_error(rr));
 	}
 
 	/* Try simulating 1, 2, ... nparity failures. */
 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
 		/*
 		 * tstore has one extra slot on each side of the targets:
 		 * ltgts[-1] and ltgts[num_failures] are sentinels used by
 		 * the increment logic below.
 		 */
 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
 		int *ltgts = &tstore[1]; /* value is logical child ID */
 
 
 		/*
 		 * Determine number of logical children, n.  See comment
 		 * above raidz_simulate_failure().
 		 */
 		int n = 0;
 		for (int w = physical_width;
 		    w >= original_width; w--) {
 			n += w;
 		}
 
 		ASSERT3U(num_failures, <=, nparity);
 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
 
 		/* Handle corner cases in combrec logic */
 		ltgts[-1] = -1;
 		/* Start from the lowest combination: 0, 1, ..., k-1. */
 		for (int i = 0; i < num_failures; i++) {
 			ltgts[i] = i;
 		}
 		/* Upper sentinel: the last slot may never reach n. */
 		ltgts[num_failures] = n;
 
 		for (;;) {
 			int err = raidz_reconstruct(zio, ltgts, num_failures,
 			    nparity);
 			if (err == EINVAL) {
 				/*
 				 * Reconstruction not possible with this #
 				 * failures; try more failures.
 				 */
 				break;
 			} else if (err == 0)
 				return (0);
 
 			/* Compute next targets to try */
 			for (int t = 0; ; t++) {
 				ASSERT3U(t, <, num_failures);
 				ltgts[t]++;
 				if (ltgts[t] == n) {
 					/* try more failures */
 					ASSERT3U(t, ==, num_failures - 1);
 					if (zfs_flags &
 					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
 						zfs_dbgmsg("reconstruction "
 						    "failed for num_failures="
 						    "%u; tried all "
 						    "combinations",
 						    num_failures);
 					}
 					break;
 				}
 
 				ASSERT3U(ltgts[t], <, n);
 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
 
 				/*
 				 * If that spot is available, we're done here.
 				 * Try the next combination.
 				 */
 				if (ltgts[t] != ltgts[t + 1])
 					break; // found next combination
 
 				/*
 				 * Otherwise, reset this tgt to the minimum,
 				 * and move on to the next tgt.
 				 */
 				ltgts[t] = ltgts[t - 1] + 1;
 				ASSERT3U(ltgts[t], ==, t);
 			}
 
 			/* Increase the number of failures and keep trying. */
 			if (ltgts[num_failures - 1] == n)
 				break;
 		}
 	}
 	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
 		zfs_dbgmsg("reconstruction failed for all num_failures");
 	return (ECKSUM);
 }
 
 /*
  * Run vdev_raidz_reconstruct_row() with the targets t[0 .. nt-1] on every
  * row of the map.
  */
 void
 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
 {
 	uint64_t r = 0;
 
 	while (r < rm->rm_nrows) {
 		vdev_raidz_reconstruct_row(rm, rm->rm_row[r], t, nt);
 		r++;
 	}
 }
 
 /*
  * Complete a write IO operation on a RAIDZ VDev
  *
  * Outline:
  *   1. Check for errors on the child IOs.
  *   2. Return, setting an error code if too few child VDevs were written
  *      to reconstruct the data later.  Note that partial writes are
  *      considered successful if they can be reconstructed at all.
  */
 static void
 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
 {
 	int failed_normal = 0;
 	int failed_shadow = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 
 	/*
 	 * Tally failed writes to the normal and to the shadow locations
 	 * separately.  A child has no bp, so ECKSUM is never expected.
 	 */
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *rc = &rr->rr_col[c];
 
 		if (rc->rc_error != 0) {
 			ASSERT(rc->rc_error != ECKSUM);
 			failed_normal++;
 		}
 
 		if (rc->rc_shadow_error != 0) {
 			ASSERT(rc->rc_shadow_error != ECKSUM);
 			failed_shadow++;
 		}
 	}
 
 	/*
 	 * A partial write still counts as a success as long as enough
 	 * columns landed for the data to be reconstructed later.  During a
 	 * raidz expansion a shadow write is issued as well, and after a
 	 * crash either the normal (old) or the shadow (new) location may
 	 * end up being the "real" copy, so each of the two must be
 	 * sufficiently redundant on its own.
 	 *
 	 * Now that write reallocation is supported, it would be better to
 	 * treat a partial failure as a real failure unless no non-degraded
 	 * top-level vdevs are left, and to not update DTLs when we intend
 	 * to reallocate.
 	 */
 	if (failed_normal <= rr->rr_firstdatacol &&
 	    failed_shadow <= rr->rr_firstdatacol)
 		return;
 
 	zio->io_error = zio_worst_error(zio->io_error,
 	    vdev_raidz_worst_error(rr));
 }
 
 /*
  * Reconstruct the data columns of one row that already have rc_error set,
  * provided the number of errors does not exceed the parity that was read.
  */
 static void
 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
     raidz_row_t *rr)
 {
 	int parity_errors = 0;
 	int parity_untried = 0;
 	int bad_data = 0;
 	int bad_total = 0;
 
 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 		int is_parity = (c < rr->rr_firstdatacol);
 
 		/*
 		 * An ECKSUM here comes from a scrub where a replacing or
 		 * sparing child vdev found that its own children do not all
 		 * hold identical data.  Report it, clear the error so the
 		 * column is handled like any other read, and force a repair
 		 * so the damage gets corrected.
 		 */
 		if (col->rc_error == ECKSUM) {
 			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
 			vdev_raidz_checksum_error(zio, col, col->rc_abd);
 			col->rc_force_repair = 1;
 			col->rc_error = 0;
 		}
 
 		if (col->rc_error == 0) {
 			if (is_parity && !col->rc_tried)
 				parity_untried++;
 			continue;
 		}
 
 		if (is_parity)
 			parity_errors++;
 		else
 			bad_data++;
 		bad_total++;
 	}
 
 	/*
 	 * Nothing to do unless some data column failed and the total number
 	 * of errors is no larger than the number of parity columns read.
 	 */
 	if (bad_data == 0 ||
 	    bad_total > rr->rr_firstdatacol - parity_untried)
 		return;
 
 	/*
 	 * Parity is read either in full or not at all, so getting this far
 	 * means every parity column was tried.  Likewise at least one
 	 * parity column must have been read without error.
 	 */
 	ASSERT(parity_untried == 0);
 	ASSERT(parity_errors < rr->rr_firstdatacol);
 
 	/* Collect the indices of the failed data columns. */
 	int n = 0;
 	int tgts[VDEV_RAIDZ_MAXPARITY];
 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
 		if (rr->rr_col[c].rc_error == 0)
 			continue;
 
 		ASSERT(n < VDEV_RAIDZ_MAXPARITY);
 		tgts[n++] = c;
 	}
 
 	ASSERT(rr->rr_firstdatacol >= n);
 
 	vdev_raidz_reconstruct_row(rm, rr, tgts, n);
 }
 
 /*
  * Issue a read for every non-empty column of the row that has not been
  * tried yet.  Return the number of reads issued.
  */
 static int
 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
 {
 	vdev_t *vd = zio->io_vd;
 	int issued = 0;
 
 	rr->rr_missingparity = 0;
 	rr->rr_missingdata = 0;
 
 	/*
 	 * Empty sectors are not needed for a normal read.  If this row has
 	 * some and no ABD exists for them yet, allocate one now so they can
 	 * be read, verified, and repaired if necessary.
 	 */
 	if (rr->rr_abd_empty == NULL && rr->rr_nempty != 0)
 		vdev_draid_map_alloc_empty(zio, rr);
 
 	for (int c = 0; c < rr->rr_cols; c++) {
 		raidz_col_t *col = &rr->rr_col[c];
 
 		if (!col->rc_tried && col->rc_size != 0) {
 			zio_t *cio = zio_vdev_child_io(zio, NULL,
 			    vd->vdev_child[col->rc_devidx],
 			    col->rc_offset, col->rc_abd, col->rc_size,
 			    zio->io_type, zio->io_priority, 0,
 			    vdev_raidz_child_done, col);
 
 			zio_nowait(cio);
 			issued++;
 		}
 	}
 
 	return (issued);
 }
 
 /*
  * We're here because either there were too many errors to even attempt
  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
  * failed. In either case, there is enough bad data to prevent reconstruction.
  * Start checksum ereports for all children which haven't failed.
  */
 static void
 vdev_raidz_io_done_unrecoverable(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	for (int r = 0; r < rm->rm_nrows; r++) {
 		raidz_row_t *row = rm->rm_row[r];
 
 		for (int c = 0; c < row->rr_cols; c++) {
 			raidz_col_t *col = &row->rr_col[c];
 			vdev_t *child = zio->io_vd->vdev_child[col->rc_devidx];
 
 			/* Columns that already reported an error are skipped. */
 			if (col->rc_error != 0)
 				continue;
 
 			zio_bad_cksum_t zbc;
 			zbc.zbc_has_cksum = 0;
 			zbc.zbc_injected = rm->rm_ecksuminjected;
 
 			/* Bump the child's checksum error counter. */
 			mutex_enter(&child->vdev_stat_lock);
 			child->vdev_stat.vs_checksum_errors++;
 			mutex_exit(&child->vdev_stat_lock);
 
 			/* Start a checksum ereport covering this column. */
 			(void) zfs_ereport_start_checksum(zio->io_spa,
 			    child, &zio->io_bookmark, zio, col->rc_offset,
 			    col->rc_size, &zbc);
 		}
 	}
 }
 
 /*
  * Completion handler for a raidz zio.
  *
  * Writes: each row is checked by vdev_raidz_io_done_write_impl().
  *
  * Reads: known-missing columns are reconstructed and the checksum is
  * verified.  If that fails, all columns that have not been read yet are
  * read and the zio is redone; once nothing is left to read, combinatorial
  * reconstruction is attempted.
  *
  * In all cases except the "redo" early return, the range lock held in
  * rm_lr (if any) is released before returning.
  */
 void
 vdev_raidz_io_done(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
 	ASSERT(zio->io_bp != NULL);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
 		}
 	} else {
 		if (rm->rm_phys_col) {
 			/*
 			 * This is an aggregated read.  Copy the data and status
 			 * from the aggregate abd's to the individual rows.
 			 */
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 
 				for (int c = 0; c < rr->rr_cols; c++) {
 					raidz_col_t *rc = &rr->rr_col[c];
 					if (rc->rc_tried || rc->rc_size == 0)
 						continue;
 
 					/* Aggregate column for this child. */
 					raidz_col_t *prc =
 					    &rm->rm_phys_col[rc->rc_devidx];
 					rc->rc_error = prc->rc_error;
 					rc->rc_tried = prc->rc_tried;
 					rc->rc_skipped = prc->rc_skipped;
 					/* Only data columns have data copied. */
 					if (c >= rr->rr_firstdatacol) {
 						/*
 						 * Note: this is slightly faster
 						 * than using abd_copy_off().
 						 */
 						char *physbuf = abd_to_buf(
 						    prc->rc_abd);
 						void *physloc = physbuf +
 						    rc->rc_offset -
 						    prc->rc_offset;
 
 						abd_copy_from_buf(rc->rc_abd,
 						    physloc, rc->rc_size);
 					}
 				}
 			}
 		}
 
 		for (int i = 0; i < rm->rm_nrows; i++) {
 			raidz_row_t *rr = rm->rm_row[i];
 			vdev_raidz_io_done_reconstruct_known_missing(zio,
 			    rm, rr);
 		}
 
 		if (raidz_checksum_verify(zio) == 0) {
 			/* Skip the per-row verified pass for this flag. */
 			if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 				goto done;
 
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				raidz_row_t *rr = rm->rm_row[i];
 				vdev_raidz_io_done_verified(zio, rr);
 			}
 			zio_checksum_verified(zio);
 		} else {
 			/*
 			 * A sequential resilver has no checksum which makes
 			 * combinatorial reconstruction impossible. This code
 			 * path is unreachable since raidz_checksum_verify()
 			 * has no checksum to verify and must succeed.
 			 */
 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
 
 			/*
 			 * This isn't a typical situation -- either we got a
 			 * read error or a child silently returned bad data.
 			 * Read every block so we can try again with as much
 			 * data and parity as we can track down. If we've
 			 * already been through once before, all children will
 			 * be marked as tried so we'll proceed to combinatorial
 			 * reconstruction.
 			 */
 			int nread = 0;
 			for (int i = 0; i < rm->rm_nrows; i++) {
 				nread += vdev_raidz_read_all(zio,
 				    rm->rm_row[i]);
 			}
 			if (nread != 0) {
 				/*
 				 * Normally our stage is VDEV_IO_DONE, but if
 				 * we've already called redone(), it will have
 				 * changed to VDEV_IO_START, in which case we
 				 * don't want to call redone() again.
 				 */
 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
 					zio_vdev_io_redone(zio);
 				/* rm_lr is intentionally left held here. */
 				return;
 			}
 			/*
 			 * It would be too expensive to try every possible
 			 * combination of failed sectors in every row, so
 			 * instead we try every combination of failed current or
 			 * past physical disk. This means that if the incorrect
 			 * sectors were all on Nparity disks at any point in the
 			 * past, we will find the correct data.  The only known
 			 * case where this is less durable than a non-expanded
 			 * RAIDZ, is if we have a silent failure during
 			 * expansion.  In that case, one block could be
 			 * partially in the old format and partially in the
 			 * new format, so we'd lose some sectors from the old
 			 * format and some from the new format.
 			 *
 			 * e.g. logical_width=4 physical_width=6
 			 * the 15 (6+5+4) possible failed disks are:
 			 * width=6 child=0
 			 * width=6 child=1
 			 * width=6 child=2
 			 * width=6 child=3
 			 * width=6 child=4
 			 * width=6 child=5
 			 * width=5 child=0
 			 * width=5 child=1
 			 * width=5 child=2
 			 * width=5 child=3
 			 * width=5 child=4
 			 * width=4 child=0
 			 * width=4 child=1
 			 * width=4 child=2
 			 * width=4 child=3
 			 * And we will try every combination of Nparity of these
 			 * failing.
 			 *
 			 * As a first pass, we can generate every combo,
 			 * and try reconstructing, ignoring any known
 			 * failures.  If any row has too many known + simulated
 			 * failures, then we bail on reconstructing with this
 			 * number of simulated failures.  As an improvement,
 			 * we could detect the number of whole known failures
 			 * (i.e. we have known failures on these disks for
 			 * every row; the disks never succeeded), and
 			 * subtract that from the max # failures to simulate.
 			 * We could go even further like the current
 			 * combrec code, but that doesn't seem like it
 			 * gains us very much.  If we simulate a failure
 			 * that is also a known failure, that's fine.
 			 */
 			zio->io_error = vdev_raidz_combrec(zio);
 			/* Speculative reads do not generate ereports. */
 			if (zio->io_error == ECKSUM &&
 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 				vdev_raidz_io_done_unrecoverable(zio);
 			}
 		}
 	}
 done:
 	/* Release the range lock recorded in the map, if there is one. */
 	if (rm->rm_lr != NULL) {
 		zfs_rangelock_exit(rm->rm_lr);
 		rm->rm_lr = NULL;
 	}
 }
 
 /*
  * Set the raidz vdev's state from its child counts: CANT_OPEN when more
  * children are faulted than there is parity, DEGRADED when any child is
  * faulted or degraded, HEALTHY otherwise.
  */
 static void
 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	if (faulted > vdrz->vd_nparity) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded + faulted == 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 }
 
 /*
  * Determine if any portion of the provided block resides on a child vdev
  * with a dirty DTL and therefore needs to be resilvered.  The function
  * assumes that at least one DTL is dirty which implies that full stripe
  * width blocks must be resilvered.
  */
 static boolean_t
 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * While a RAIDZ expansion is in progress the block may live in the
 	 * old location, the new one, or both.  Keep it simple and always
 	 * resilver.
 	 */
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
 		return (B_TRUE);
 
 	uint64_t width = vd->vdev_children;
 	uint64_t parity = vdrz->vd_nparity;
 	uint64_t shift = vd->vdev_top->vdev_ashift;
 	/* First sector of the block on the RAIDZ (parent) vdev. */
 	uint64_t first_sector = DVA_GET_OFFSET(dva) >> shift;
 	/* Block size in units of the vdev's minimum sector size. */
 	uint64_t nsectors = ((psize - 1) >> shift) + 1;
 	/* Child holding the first column of this stripe. */
 	uint64_t first_col = first_sector % width;
 	/* Number of columns (data plus parity) the block occupies. */
 	uint64_t ncols = nsectors + parity;
 
 	/* Unreachable by sequential resilver. */
 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
 
 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
 		return (B_FALSE);
 
 	/* A block spanning every child always needs resilvering. */
 	if (ncols >= width)
 		return (B_TRUE);
 
 	for (uint64_t c = 0; c < ncols; c++) {
 		vdev_t *cvd = vd->vdev_child[(first_col + c) % width];
 
 		/*
 		 * dsl_scan_need_resilver() has already checked vd with
 		 * vdev_dtl_contains(), so checking cvd with the cheaper
 		 * vdev_dtl_empty() is a good enough approximation.
 		 */
 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Translate a logical range on the raidz vdev into the physical range it
  * occupies on child cvd, storing the result in physical_rs.  remain_rs is
  * only written (zeroed) while an expansion is in progress.
  */
 static void
 vdev_raidz_xlate(vdev_t *cvd, const zfs_range_seg64_t *logical_rs,
     zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
 {
 	vdev_t *raidvd = cvd->vdev_parent;
 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
 
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 		/*
 		 * An expansion is under way, so the translation is in flux
 		 * and any answer could be stale by the time we return.  It
 		 * would not be safe for the caller to act on it, so report
 		 * the range as not present on any child.  The only consumers
 		 * are "zpool initialize" and trimming, which are both "best
 		 * effort" anyway.
 		 */
 		physical_rs->rs_start = 0;
 		physical_rs->rs_end = 0;
 		remain_rs->rs_start = 0;
 		remain_rs->rs_end = 0;
 		return;
 	}
 
 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
 	uint64_t ncols = vdrz->vd_physical_width;
 	uint64_t col = cvd->vdev_id;
 
 	/* make sure the offsets are block-aligned */
 	ASSERT0(logical_rs->rs_start % (1 << ashift));
 	ASSERT0(logical_rs->rs_end % (1 << ashift));
 
 	uint64_t first_blk = logical_rs->rs_start >> ashift;
 	uint64_t last_blk = logical_rs->rs_end >> ashift;
 
 	/*
 	 * Number of this child's blocks that lie before the given logical
 	 * block.  The comparison guards the subtraction against underflow.
 	 */
 	uint64_t start_row = (first_blk > col) ?
 	    ((first_blk - col - 1) / ncols) + 1 : 0;
 	uint64_t end_row = (last_blk > col) ?
 	    ((last_blk - col - 1) / ncols) + 1 : 0;
 
 	physical_rs->rs_start = start_row << ashift;
 	physical_rs->rs_end = end_row << ashift;
 
 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
 	    logical_rs->rs_end - logical_rs->rs_start);
 }
 
 /*
  * Sync task that commits reflow progress for the txg of tx: it advances
  * the reflow offset stored in the in-core uberblock and persists the
  * running count of bytes copied.
  *
  * arg - the spa_t
  * tx  - transaction of the syncing txg
  */
 static void
 raidz_reflow_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	/* Slot of this txg in the per-txg arrays. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/*
 	 * Ensure there are no i/os to the range that is being committed.
 	 */
 	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
 
 	/*
 	 * The offset to commit is this txg's recorded offset, capped at the
 	 * lowest offset at which a copy failed.
 	 */
 	mutex_enter(&vre->vre_lock);
 	uint64_t new_offset =
 	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
 	/*
 	 * We should not have committed anything that failed.
 	 */
 	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
 	mutex_exit(&vre->vre_lock);
 
 	/* Hold [old_offset, new_offset) as writer across the update. */
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    old_offset, new_offset - old_offset,
 	    RL_WRITER);
 
 	/*
 	 * Update the uberblock that will be written when this txg completes.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
 	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
 	vre->vre_offset_pertxg[txgoff] = 0;
 	zfs_rangelock_exit(lr);
 
 	/* Fold this txg's byte count into the total and reset the slot. */
 	mutex_enter(&vre->vre_lock);
 	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
 	vre->vre_bytes_copied_pertxg[txgoff] = 0;
 	mutex_exit(&vre->vre_lock);
 
 	/* Persist the total in the top-level vdev's ZAP. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
 	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
 }
 
 /*
  * Sync task run when the reflow has finished copying.  It records the new
  * width in vd_expand_txgs, marks the expansion DSS_FINISHED in core and in
  * the vdev's ZAP, clears the reflow info in the uberblock, restarts
  * initialize/trim activity, and optionally starts a scrub.
  *
  * arg - the spa_t
  * tx  - transaction of the syncing txg
  */
 static void
 raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 
 	/* No per-txg progress may still be pending. */
 	for (int i = 0; i < TXG_SIZE; i++)
 		VERIFY0(vre->vre_offset_pertxg[i]);
 
 	/* Record the txg from which the new logical width applies. */
 	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
 	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
 	re->re_logical_width = vdrz->vd_physical_width;
 	mutex_enter(&vdrz->vd_expand_lock);
 	avl_add(&vdrz->vd_expand_txgs, re);
 	mutex_exit(&vdrz->vd_expand_lock);
 
 	/* Same lookup as raidvd above. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/*
 	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
 	 * will get written (based on vd_expand_txgs).
 	 */
 	vdev_config_dirty(vd);
 
 	/*
 	 * Before we change vre_state, the on-disk state must reflect that we
 	 * have completed all copying, so that vdev_raidz_io_start() can use
 	 * vre_state to determine if the reflow is in progress.  See also the
 	 * end of spa_raidz_expand_thread().
 	 */
 	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
 	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
 
 	vre->vre_end_time = gethrestime_sec();
 	vre->vre_state = DSS_FINISHED;
 
 	/* Persist the state and end time as 64-bit values in the ZAP. */
 	uint64_t state = vre->vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t end_time = vre->vre_end_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
 	    sizeof (end_time), 1, &end_time, tx));
 
 	spa->spa_uberblock.ub_raidz_reflow_info = 0;
 
 	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)vd->vdev_id,
 	    (unsigned long long)vd->vdev_children);
 
 	/* The pool no longer has an expansion in progress. */
 	spa->spa_raidz_expand = NULL;
 	raidvd->vdev_rz_expanding = B_FALSE;
 
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	spa_notify_waiters(spa);
 
 	/*
 	 * While we're in syncing context take the opportunity to
 	 * setup a scrub. All the data has been successfully copied
 	 * but we have not validated any checksums.
 	 */
 	setup_sync_arg_t setup_sync_arg = {
 		.func = POOL_SCAN_SCRUB,
 		.txgstart = 0,
 		.txgend = 0,
 	};
 	if (zfs_scrub_after_expand &&
 	    dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0) {
 		dsl_scan_setup_sync(&setup_sync_arg, tx);
 	}
 }
 
 /*
  * State of one copy batch.  It is shared by all read and write ZIOs of the
  * batch and freed by the last write to complete.
  */
 typedef struct raidz_reflow_arg {
 	vdev_raidz_expand_t *rra_vre;	/* Global expansion state. */
 	zfs_locked_range_t *rra_lr;	/* Range lock of this batch. */
 	uint64_t rra_txg;	/* TXG of this batch. */
 	uint_t rra_ashift;	/* Ashift of the vdev. */
 	uint32_t rra_tbd;	/* Number of in-flight ZIOs. */
 	uint32_t rra_writes;	/* Number of write ZIOs. */
 	zio_t *rra_zio[];	/* Write ZIO pointers. */
 } raidz_reflow_arg_t;
 
 /*
  * Completion callback for the write of one child's share of a copy batch.
  * Accounts for the finished bytes and, when this was the last outstanding
  * write of the batch, drops SCL_STATE and the range lock and frees the
  * batch state.
  */
 static void
 raidz_reflow_write_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 	uint64_t batch_start = rra->rra_lr->lr_offset;
 	uint64_t batch_end = batch_start + rra->rra_lr->lr_length;
 
 	abd_free(zio->io_abd);
 
 	mutex_enter(&vre->vre_lock);
 
 	/* A failed write forces the reflow to pause at this batch. */
 	if (zio->io_error != 0 && batch_start < vre->vre_failed_offset)
 		vre->vre_failed_offset = batch_start;
 
 	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
 	vre->vre_outstanding_bytes -= zio->io_size;
 
 	/* Only batches that end below the failure point count as copied. */
 	if (batch_end < vre->vre_failed_offset) {
 		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
 		    zio->io_size;
 	}
 
 	cv_signal(&vre->vre_cv);
 	rra->rra_tbd--;
 	boolean_t last_write = (rra->rra_tbd == 0);
 	mutex_exit(&vre->vre_lock);
 
 	if (last_write) {
 		/* Everything taken for this batch can now be released. */
 		spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 		zfs_rangelock_exit(rra->rra_lr);
 		kmem_free(rra,
 		    sizeof (*rra) + sizeof (zio_t *) * rra->rra_writes);
 	}
 }
 
 /*
  * Completion callback for the read of one child's old location.  The read
  * that brings the batch's outstanding count to zero issues all of the
  * batch's write ZIOs, which by then hold the data.
  */
 static void
 raidz_reflow_read_done(zio_t *zio)
 {
 	raidz_reflow_arg_t *rra = zio->io_private;
 	vdev_raidz_expand_t *vre = rra->rra_vre;
 
 	/*
 	 * A read of a single block was done straight into a write ABD, which
 	 * must be kept.  Larger reads used a gang ABD, which is freed here.
 	 */
 	if (zio->io_size > (1 << rra->rra_ashift))
 		abd_free(zio->io_abd);
 
 	/*
 	 * The data cannot be trusted if the read failed or the vdev it came
 	 * from is not fully healthy (e.g. a child being resilvered).  Letting
 	 * the write go ahead is still fine: it may write garbage, but the
 	 * location is otherwise unused and vre_failed_offset makes us retry
 	 * it later.
 	 */
 	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
 		uint64_t batch_start = rra->rra_lr->lr_offset;
 
 		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
 		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
 		    (long long)batch_start,
 		    (long long)rra->rra_lr->lr_length,
 		    (long long)rra->rra_txg,
 		    zio->io_error,
 		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
 		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
 
 		/* Force a reflow pause on errors */
 		mutex_enter(&vre->vre_lock);
 		if (batch_start < vre->vre_failed_offset)
 			vre->vre_failed_offset = batch_start;
 		mutex_exit(&vre->vre_lock);
 	}
 
 	/* Other reads of this batch are still outstanding. */
 	if (atomic_dec_32_nv(&rra->rra_tbd) != 0)
 		return;
 
 	/*
 	 * Last read: re-arm the counter for the writes and issue them.  The
 	 * count is kept in a local for the loop bound.
 	 */
 	uint32_t nwrites = rra->rra_writes;
 	rra->rra_tbd = nwrites;
 	for (uint64_t w = 0; w < nwrites; w++)
 		zio_nowait(rra->rra_zio[w]);
 }
 
 /*
  * Record that the in-memory reflow progress has reached `offset` and make
  * sure raidz_reflow_sync() is scheduled for tx's txg.  An offset of zero is
  * ignored.
  */
 static void
 raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
     dmu_tx_t *tx)
 {
 	int slot = dmu_tx_get_txg(tx) & TXG_MASK;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	if (offset != 0) {
 		mutex_enter(&vre->vre_lock);
 		ASSERT3U(vre->vre_offset, <=, offset);
 		vre->vre_offset = offset;
 		mutex_exit(&vre->vre_lock);
 
 		/* First progress in this txg: schedule the sync task once. */
 		if (vre->vre_offset_pertxg[slot] == 0) {
 			dsl_sync_task_nowait(dmu_tx_pool(tx),
 			    raidz_reflow_sync, spa, tx);
 		}
 		vre->vre_offset_pertxg[slot] = offset;
 	}
 }
 
 /*
  * Return B_TRUE if any child of the raidz vdev is not a leaf, which is
  * taken to mean that the child is being replaced.
  */
 static boolean_t
 vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
 {
 	int c = 0;
 
 	while (c < raidz_vd->vdev_children) {
 		vdev_t *child = raidz_vd->vdev_child[c];
 
 		/* Quick check if a child is being replaced */
 		if (!child->vdev_ops->vdev_op_leaf)
 			return (B_TRUE);
 		c++;
 	}
 	return (B_FALSE);
 }
 
 /*
  * Copy one batch of the reflow: the start of the first segment in `rt`,
  * clamped by the copy-size limits and by how far the on-disk progress
  * (spa_ubsync) allows writes to go without overlapping unrecorded blocks.
  *
  * Returns B_TRUE when no copy I/O was issued and the caller should wait for
  * tx's txg to sync before trying again (the on-disk progress must catch up
  * first, or a replacing child forced a pause).  Returns B_FALSE when the
  * reads for the batch were issued, or when `rt` has no segments.
  */
 static boolean_t
 raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt,
     dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint_t ashift = vd->vdev_top->vdev_ashift;
 
 	/*
 	 * Nothing to do if the tree has no segments.  The segment is what
 	 * must be tested here: `rt` was already handed to the lookup, while
 	 * `rs` is what gets dereferenced below.
 	 */
 	zfs_range_seg_t *rs = zfs_range_tree_first(rt);
 	if (rs == NULL)
 		return (B_FALSE);
 	uint64_t offset = zfs_rs_get_start(rs, rt);
 	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
 	uint64_t size = zfs_rs_get_end(rs, rt) - offset;
 	ASSERT3U(size, >=, 1 << ashift);
 	ASSERT(IS_P2ALIGNED(size, 1 << ashift));
 
 	uint64_t blkid = offset >> ashift;
 	uint_t old_children = vd->vdev_children - 1;
 
 	/*
 	 * We can only progress to the point that writes will not overlap
 	 * with blocks whose progress has not yet been recorded on disk.
 	 * Since partially-copied rows are still read from the old location,
 	 * we need to stop one row before the sector-wise overlap, to prevent
 	 * row-wise overlap.
 	 *
 	 * Note that even if we are skipping over a large unallocated region,
 	 * we can't move the on-disk progress to `offset`, because concurrent
 	 * writes/allocations could still use the currently-unallocated
 	 * region.
 	 */
 	uint64_t ubsync_blkid =
 	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
 	uint64_t next_overwrite_blkid = ubsync_blkid +
 	    ubsync_blkid / old_children - old_children;
 	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
 	if (blkid >= next_overwrite_blkid) {
 		raidz_reflow_record_progress(vre,
 		    next_overwrite_blkid << ashift, tx);
 		return (B_TRUE);
 	}
 
 	/* Clamp the batch: tunable limit, record size, at least one sector. */
 	size = MIN(size, raidz_expand_max_copy_bytes);
 	size = MIN(size, (uint64_t)old_children *
 	    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE));
 	size = MAX(size, 1 << ashift);
 	uint_t blocks = MIN(size >> ashift, next_overwrite_blkid - blkid);
 	size = (uint64_t)blocks << ashift;
 
 	zfs_range_tree_remove(rt, offset, size);
 
 	uint_t reads = MIN(blocks, old_children);
 	uint_t writes = MIN(blocks, vd->vdev_children);
 	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra) +
 	    sizeof (zio_t *) * writes, KM_SLEEP);
 	rra->rra_vre = vre;
 	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    offset, size, RL_WRITER);
 	rra->rra_txg = dmu_tx_get_txg(tx);
 	rra->rra_ashift = ashift;
 	rra->rra_tbd = reads;
 	rra->rra_writes = writes;
 
 	raidz_reflow_record_progress(vre, offset + size, tx);
 
 	/*
 	 * SCL_STATE will be released when the read and write are done,
 	 * by raidz_reflow_write_done().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	/* check if a replacing vdev was added, if so treat it as an error */
 	if (vdev_raidz_expand_child_replacing(vd)) {
 		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
 		    "offset=%llu txg=%llu",
 		    (long long)rra->rra_lr->lr_offset,
 		    (long long)rra->rra_txg);
 
 		mutex_enter(&vre->vre_lock);
 		vre->vre_failed_offset =
 		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
 		cv_signal(&vre->vre_cv);
 		mutex_exit(&vre->vre_lock);
 
 		/* drop everything we acquired */
 		spa_config_exit(spa, SCL_STATE, spa);
 		zfs_rangelock_exit(rra->rra_lr);
 		kmem_free(rra, sizeof (*rra) + sizeof (zio_t *) * writes);
 		return (B_TRUE);
 	}
 
 	mutex_enter(&vre->vre_lock);
 	vre->vre_outstanding_bytes += size;
 	mutex_exit(&vre->vre_lock);
 
 	/* Allocate ABD and ZIO for each child we write. */
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	zio_t *pio = spa->spa_txg_zio[txgoff];
 	uint_t b = blocks / vd->vdev_children;
 	uint_t bb = blocks % vd->vdev_children;
 	for (uint_t i = 0; i < writes; i++) {
 		/* the first `bb` children get one extra block each */
 		uint_t n = b + (i < bb);
 		abd_t *abd = abd_alloc_for_io(n << ashift, B_FALSE);
 		rra->rra_zio[i] = zio_vdev_child_io(pio, NULL,
 		    vd->vdev_child[(blkid + i) % vd->vdev_children],
 		    ((blkid + i) / vd->vdev_children) << ashift,
 		    abd, n << ashift, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_reflow_write_done, rra);
 	}
 
 	/*
 	 * Allocate and issue ZIO for each child we read.  For reads of only
 	 * one block we can use respective writer ABDs, since they will also
 	 * have only one block.  For bigger reads create gang ABDs and fill
 	 * them with respective blocks from writer ABDs.
 	 */
 	b = blocks / old_children;
 	bb = blocks % old_children;
 	for (uint_t i = 0; i < reads; i++) {
 		uint_t n = b + (i < bb);
 		abd_t *abd;
 		if (n > 1) {
 			abd = abd_alloc_gang();
 			for (uint_t j = 0; j < n; j++) {
 				/* index of this block within the batch */
 				uint_t blk = j * old_children + i;
 				abd_t *cabd = abd_get_offset_size(
 				    rra->rra_zio[blk %
 				    vd->vdev_children]->io_abd,
 				    (blk / vd->vdev_children) << ashift,
 				    1 << ashift);
 				abd_gang_add(abd, cabd, B_TRUE);
 			}
 		} else {
 			abd = rra->rra_zio[i]->io_abd;
 		}
 		zio_nowait(zio_vdev_child_io(pio, NULL,
 		    vd->vdev_child[(blkid + i) % old_children],
 		    ((blkid + i) / old_children) << ashift, abd,
 		    n << ashift, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_reflow_read_done, rra));
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * For testing (ztest specific): spin in one-second sleeps for as long as
  * raidz_expand_pause_point is set and does not exceed `pause_point`.
  */
 static void
 raidz_expand_pause(uint_t pause_point)
 {
 	for (;;) {
 		if (raidz_expand_pause_point == 0)
 			break;
 		if (raidz_expand_pause_point > pause_point)
 			break;
 		delay(hz);
 	}
 }
 
 /*
  * Completion callback for a scratch-area child I/O: fold the child's error
  * into the parent zio passed as io_private.
  */
 static void
 raidz_scratch_child_done(zio_t *zio)
 {
 	zio_t *parent = zio->io_private;
 
 	mutex_enter(&parent->io_lock);
 	parent->io_error =
 	    zio_worst_error(parent->io_error, zio->io_error);
 	mutex_exit(&parent->io_lock);
 }
 
 /*
  * Reflow the beginning portion of the vdev into an intermediate scratch area
  * in memory and on disk. This operation must be persisted on disk before we
  * proceed to overwrite the beginning portion with the reflowed data.
  *
  * This multi-step task can fail to complete if disk errors are encountered
  * and we can return here after a pause (waiting for disk to become healthy).
  *
  * On an I/O error the function returns without advancing vre_offset.  The
  * uberblock updates are performed with VERIFY0() rather than ASSERT0() so
  * that the write is issued and checked regardless of whether assertions
  * are compiled in.
  */
 static void
 raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_raidz_expand_t *vre = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	zio_t *pio;
 	int error;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	int ashift = raidvd->vdev_ashift;
 	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
 	    uint64_t);
 	uint64_t logical_size = write_size * raidvd->vdev_children;
 	uint64_t read_size =
 	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
 	    1 << ashift);
 
 	/*
 	 * The scratch space must be large enough to get us to the point
 	 * that one row does not overlap itself when moved.  This is checked
 	 * by vdev_raidz_attach_check().
 	 */
 	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
 	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
 	VERIFY3U(write_size, <=, read_size);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
 	    0, logical_size, RL_WRITER);
 
 	/* One buffer per child, sized for the (larger) read. */
 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		abds[i] = abd_alloc_linear(read_size, B_FALSE);
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
 
 	/*
 	 * If we have already written the scratch area then we must read from
 	 * there, since new writes were redirected there while we were paused
 	 * or the original location may have been partially overwritten with
 	 * reflowed data.
 	 */
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
 		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
 		/*
 		 * Read from scratch space.
 		 */
 		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (int i = 0; i < raidvd->vdev_children; i++) {
 			/*
 			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
 			 * to the offset to calculate the physical offset to
 			 * write to.  Passing in a negative offset makes us
 			 * access the scratch area.
 			 */
 			zio_nowait(zio_vdev_child_io(pio, NULL,
 			    raidvd->vdev_child[i],
 			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
 			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 		}
 		error = zio_wait(pio);
 		if (error != 0) {
 			zfs_dbgmsg("reflow: error %d reading scratch location",
 			    error);
 			goto io_error_exit;
 		}
 		goto overwrite;
 	}
 
 	/*
 	 * Read from original location.
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
 		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], read_size, ZIO_TYPE_READ,
 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d reading original location", error);
 io_error_exit:
 		/* Common failure exit: release buffers and both locks. */
 		for (int i = 0; i < raidvd->vdev_children; i++)
 			abd_free(abds[i]);
 		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 		zfs_rangelock_exit(lr);
 		spa_config_exit(spa, SCL_STATE, FTAG);
 		return;
 	}
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
 
 	/*
 	 * Reflow in memory.
 	 */
 	uint64_t logical_sectors = logical_size >> ashift;
 	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
 		int oldchild = i % (raidvd->vdev_children - 1);
 		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
 
 		int newchild = i % raidvd->vdev_children;
 		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
 
 		/* a single sector should not be copying over itself */
 		ASSERT(!(newchild == oldchild && newoff == oldoff));
 
 		abd_copy_off(abds[newchild], abds[oldchild],
 		    newoff, oldoff, 1 << ashift);
 	}
 
 	/*
 	 * Verify that we filled in everything we intended to (write_size on
 	 * each child).
 	 */
 	VERIFY0(logical_sectors % raidvd->vdev_children);
 	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
 	    write_size);
 
 	/*
 	 * Write to scratch location (boot area).
 	 */
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		/*
 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 		 * the offset to calculate the physical offset to write to.
 		 * Passing in a negative offset lets us access the boot area.
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
 		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		zfs_dbgmsg("reflow: error %d writing scratch location", error);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
 	    (long long)logical_size);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
 
 	/*
 	 * Update uberblock to indicate that scratch space is valid.  This is
 	 * needed because after this point, the real location may be
 	 * overwritten.  If we crash, we need to get the data from the
 	 * scratch space, rather than the real location.
 	 *
 	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
 	 * will prefer this uberblock.
 	 *
 	 * The sync is a required side effect, so it must not live inside an
 	 * ASSERT*() expression.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
 
 	/*
 	 * Overwrite with reflow'ed data.
 	 */
 overwrite:
 	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
 		    raidz_scratch_child_done, pio));
 	}
 	error = zio_wait(pio);
 	if (error != 0) {
 		/*
 		 * When we exit early here and drop the range lock, new
 		 * writes will go into the scratch area so we'll need to
 		 * read from there when we return after pausing.
 		 */
 		zfs_dbgmsg("reflow: error %d writing real location", error);
 		/*
 		 * Update the uberblock that is written when this txg completes.
 		 */
 		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
 		    logical_size);
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
 	    (long long)logical_size);
 	for (int i = 0; i < raidvd->vdev_children; i++)
 		abd_free(abds[i]);
 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
 
 	/*
 	 * Update uberblock to indicate that the initial part has been
 	 * reflow'ed.  This is needed because after this point (when we exit
 	 * the rangelock), we allow regular writes to this region, which will
 	 * be written to the new location only (because reflow_offset_next ==
 	 * reflow_offset_synced).  If we crashed and re-copied from the
 	 * scratch space, we would lose the regular writes.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
 	    logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
 
 	/*
 	 * Update progress.
 	 */
 	vre->vre_offset = logical_size;
 	zfs_rangelock_exit(lr);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note - raidz_reflow_sync() will update the uberblock state to
 	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
 }
 
 /*
  * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
  * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
  *
  * The logical size to recover is taken from the reflow info in
  * spa_uberblock, whose state is expected to be RRSS_SCRATCH_VALID.  The
  * scratch (boot) area of every child is read back, written to the real
  * location at offset 0, and the uberblock is then updated to
  * RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT before the progress is handed to
  * raidz_reflow_sync().
  */
 void
 vdev_raidz_reflow_copy_scratch(spa_t *spa)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
 	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	ASSERT0(logical_size % raidvd->vdev_children);
 	/* Each child holds an equal share of the logical size. */
 	uint64_t write_size = logical_size / raidvd->vdev_children;
 
 	zio_t *pio;
 
 	/*
 	 * Read from scratch space.
 	 */
 	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
 	    KM_SLEEP);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		abds[i] = abd_alloc_linear(write_size, B_FALSE);
 	}
 
 	/*
 	 * Unlike raidz_reflow_scratch_sync(), the root and child zios here
 	 * are created without ZIO_FLAG_CANFAIL and the result of zio_wait()
 	 * is not examined.
 	 */
 	pio = zio_root(spa, NULL, NULL, 0);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		/*
 		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 		 * the offset to calculate the physical offset to write to.
 		 * Passing in a negative offset lets us access the boot area.
 		 */
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
 		    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
 
 	/*
 	 * Overwrite real location with reflow'ed data.
 	 */
 	pio = zio_root(spa, NULL, NULL, 0);
 	for (int i = 0; i < raidvd->vdev_children; i++) {
 		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
 		    0, abds[i], write_size, ZIO_TYPE_WRITE,
 		    ZIO_PRIORITY_REMOVAL, 0,
 		    raidz_scratch_child_done, pio));
 	}
 	zio_wait(pio);
 	/* Flush the raidz vdev before the uberblock is updated below. */
 	pio = zio_root(spa, NULL, NULL, 0);
 	zio_flush(pio, raidvd);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
 	    "to real location", (long long)logical_size);
 
 	for (int i = 0; i < raidvd->vdev_children; i++)
 		abd_free(abds[i]);
 	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
 
 	/*
 	 * Update uberblock.
 	 */
 	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
 	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
 	spa->spa_ubsync.ub_timestamp++;
 	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
 	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
 	if (spa_multihost(spa))
 		mmp_update_uberblock(spa, &spa->spa_ubsync);
 
 	zfs_dbgmsg("reflow recovery: uberblock updated "
 	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
 	    (long long)spa->spa_ubsync.ub_txg,
 	    (long long)logical_size,
 	    (long long)spa->spa_ubsync.ub_timestamp);
 
 	/*
 	 * Record the recovered progress in the slots of the pool's first
 	 * txg and run the reflow sync task directly in that tx.
 	 */
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 	    spa_first_txg(spa));
 	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
 	vre->vre_offset = logical_size;
 	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
 	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
 	/*
 	 * Note that raidz_reflow_sync() will update the uberblock once more
 	 */
 	raidz_reflow_sync(spa, tx);
 
 	dmu_tx_commit(tx);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
 
 /*
  * zthr check function: the expansion thread has work to do only when an
  * expansion exists and it is not waiting for a resilver.
  */
 static boolean_t
 spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	(void) zthr;
 
 	if (spa->spa_raidz_expand == NULL)
 		return (B_FALSE);
 	if (spa->spa_raidz_expand->vre_waiting_for_resilver)
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * RAIDZ expansion background thread
  *
  * Can be called multiple times if the reflow is paused
  *
  * Starts from the progress recorded in spa_ubsync (or from 0 if the scratch
  * area is still marked valid), reflows the beginning of the vdev through
  * the scratch area if needed, and then walks the remaining metaslabs,
  * copying everything that is not allocatable space one batch at a time via
  * raidz_reflow_impl().  When the whole vdev has been processed the
  * completion sync task is run; otherwise the pause is logged and, if an
  * I/O failed, progress is rewound to the failed offset.
  */
 static void
 spa_raidz_expand_thread(void *arg, zthr_t *zthr)
 {
 	spa_t *spa = arg;
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
 		vre->vre_offset = 0;
 	else
 		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
 
 	/* Reflow the beginning portion using the scratch area */
 	if (vre->vre_offset == 0) {
 		VERIFY0(dsl_sync_task(spa_name(spa),
 		    NULL, raidz_reflow_scratch_sync,
 		    vre, 0, ZFS_SPACE_CHECK_NONE));
 
 		/* if we encountered errors then pause */
 		if (vre->vre_offset == 0) {
 			mutex_enter(&vre->vre_lock);
 			vre->vre_waiting_for_resilver = B_TRUE;
 			mutex_exit(&vre->vre_lock);
 			return;
 		}
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 	/* Saved now; used for vdev_online() after the config lock is dropped. */
 	uint64_t guid = raidvd->vdev_guid;
 
 	/* Iterate over all the remaining metaslabs */
 	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
 	    i < raidvd->vdev_ms_count &&
 	    !zthr_iscancelled(zthr) &&
 	    vre->vre_failed_offset == UINT64_MAX; i++) {
 		metaslab_t *msp = raidvd->vdev_ms[i];
 
 		metaslab_disable(msp);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * The metaslab may be newly created (for the expanded
 		 * space), in which case its trees won't exist yet,
 		 * so we need to bail out early.
 		 */
 		if (msp->ms_new) {
 			mutex_exit(&msp->ms_lock);
 			metaslab_enable(msp, B_FALSE, B_FALSE);
 			continue;
 		}
 
 		VERIFY0(metaslab_load(msp));
 
 		/*
 		 * We want to copy everything except the free (allocatable)
 		 * space.  Note that there may be a little bit more free
 		 * space (e.g. in ms_defer), and it's fine to copy that too.
 		 */
 		uint64_t shift, start;
 		zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(
 		    raidvd, msp, &start, &shift);
 		zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL,
 		    start, shift);
 		zfs_range_tree_add(rt, msp->ms_start, msp->ms_size);
 		zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove,
 		    rt);
 		mutex_exit(&msp->ms_lock);
 
 		/*
 		 * Force the last sector of each metaslab to be copied.  This
 		 * ensures that we advance the on-disk progress to the end of
 		 * this metaslab while the metaslab is disabled.  Otherwise, we
 		 * could move past this metaslab without advancing the on-disk
 		 * progress, and then an allocation to this metaslab would not
 		 * be copied.
 		 */
 		int sectorsz = 1 << raidvd->vdev_ashift;
 		uint64_t ms_last_offset = msp->ms_start +
 		    msp->ms_size - sectorsz;
 		if (!zfs_range_tree_contains(rt, ms_last_offset, sectorsz)) {
 			zfs_range_tree_add(rt, ms_last_offset, sectorsz);
 		}
 
 		/*
 		 * When we are resuming from a paused expansion (i.e.
 		 * when importing a pool with a expansion in progress),
 		 * discard any state that we have already processed.
 		 */
 		if (vre->vre_offset > msp->ms_start) {
 			zfs_range_tree_clear(rt, msp->ms_start,
 			    vre->vre_offset - msp->ms_start);
 		}
 
 		/*
 		 * Copy this metaslab one batch per iteration.  SCL_CONFIG is
 		 * held on entry to and on exit from every iteration.
 		 */
 		while (!zthr_iscancelled(zthr) &&
 		    !zfs_range_tree_is_empty(rt) &&
 		    vre->vre_failed_offset == UINT64_MAX) {
 
 			/*
 			 * We need to periodically drop the config lock so that
 			 * writers can get in.  Additionally, we can't wait
 			 * for a txg to sync while holding a config lock
 			 * (since a waiting writer could cause a 3-way deadlock
 			 * with the sync thread, which also gets a config
 			 * lock for reader).  So we can't hold the config lock
 			 * while calling dmu_tx_assign().
 			 */
 			spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 			/*
 			 * If requested, pause the reflow when the amount
 			 * specified by raidz_expand_max_reflow_bytes is reached
 			 *
 			 * This pause is only used during testing or debugging.
 			 */
 			while (raidz_expand_max_reflow_bytes != 0 &&
 			    raidz_expand_max_reflow_bytes <=
 			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
 				delay(hz);
 			}
 
 			/*
 			 * Throttle: wait until the bytes in flight drop to
 			 * raidz_expand_max_copy_bytes or below.  The I/O
 			 * completion callbacks signal vre_cv.
 			 */
 			mutex_enter(&vre->vre_lock);
 			while (vre->vre_outstanding_bytes >
 			    raidz_expand_max_copy_bytes) {
 				cv_wait(&vre->vre_cv, &vre->vre_lock);
 			}
 			mutex_exit(&vre->vre_lock);
 
 			dmu_tx_t *tx =
 			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 			VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
 			uint64_t txg = dmu_tx_get_txg(tx);
 
 			/*
 			 * Reacquire the vdev_config lock.  Theoretically, the
 			 * vdev_t that we're expanding may have changed.
 			 */
 			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 
 			boolean_t needsync =
 			    raidz_reflow_impl(raidvd, vre, rt, tx);
 
 			dmu_tx_commit(tx);
 
 			/*
 			 * No copy was issued for this batch; wait for the txg
 			 * to sync (without the config lock) before retrying.
 			 */
 			if (needsync) {
 				spa_config_exit(spa, SCL_CONFIG, FTAG);
 				txg_wait_synced(spa->spa_dsl_pool, txg);
 				spa_config_enter(spa, SCL_CONFIG, FTAG,
 				    RW_READER);
 			}
 		}
 
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 		metaslab_enable(msp, B_FALSE, B_FALSE);
 		zfs_range_tree_vacate(rt, NULL, NULL);
 		zfs_range_tree_destroy(rt);
 
 		/* Retake the lock and re-look-up the vdev for the next pass. */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	}
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/*
 	 * The txg_wait_synced() here ensures that all reflow zio's have
 	 * completed, and vre_failed_offset has been set if necessary.  It
 	 * also ensures that the progress of the last raidz_reflow_sync() is
 	 * written to disk before raidz_reflow_complete_sync() changes the
 	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
 	 * determine if a reflow is in progress, in which case we may need to
 	 * write to both old and new locations.  Therefore we can only change
 	 * vre_state once this is not necessary, which is once the on-disk
 	 * progress (in spa_ubsync) has been set past any possible writes (to
 	 * the end of the last metaslab).
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	if (!zthr_iscancelled(zthr) &&
 	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
 		/*
 		 * We are not being canceled or paused, so the reflow must be
 		 * complete. In that case also mark it as completed on disk.
 		 */
 		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 		    raidz_reflow_complete_sync, spa,
 		    0, ZFS_SPACE_CHECK_NONE));
 		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
 	} else {
 		/*
 		 * Wait for all copy zio's to complete and for all the
 		 * raidz_reflow_sync() synctasks to be run.
 		 */
 		spa_history_log_internal(spa, "reflow pause",
 		    NULL, "offset=%llu failed_offset=%lld",
 		    (long long)vre->vre_offset,
 		    (long long)vre->vre_failed_offset);
 		mutex_enter(&vre->vre_lock);
 		if (vre->vre_failed_offset != UINT64_MAX) {
 			/*
 			 * Reset progress so that we will retry everything
 			 * after the point that something failed.
 			 */
 			vre->vre_offset = vre->vre_failed_offset;
 			vre->vre_failed_offset = UINT64_MAX;
 			vre->vre_waiting_for_resilver = B_TRUE;
 		}
 		mutex_exit(&vre->vre_lock);
 	}
 }
 
 /*
  * Create the "raidz_expand" zthr for this pool.  The thread must not
  * already exist.
  */
 void
 spa_start_raidz_expansion_thread(spa_t *spa)
 {
 	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
 
 	spa->spa_raidz_expand_zthr =
 	    zthr_create("raidz_expand", spa_raidz_expand_thread_check,
 	    spa_raidz_expand_thread, spa, defclsyspri);
 }
 
 /*
  * Called when a vdev's DTL has been reassessed.  If an expansion of this
  * vdev's top-level vdev is paused waiting for a resilver, and no child is
  * being replaced any more, resume the expansion thread.
  */
 void
 raidz_dtl_reassessed(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (spa->spa_raidz_expand == NULL)
 		return;
 
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	/*
 	 * we get called often from vdev_dtl_reassess() so make
 	 * sure it's our vdev and any replacing is complete
 	 */
 	if (vd->vdev_top->vdev_id != vre->vre_vdev_id)
 		return;
 	if (vdev_raidz_expand_child_replacing(vd->vdev_top))
 		return;
 
 	mutex_enter(&vre->vre_lock);
 	if (vre->vre_waiting_for_resilver) {
 		vdev_dbgmsg(vd, "DTL reassessed, "
 		    "continuing raidz expansion");
 		vre->vre_waiting_for_resilver = B_FALSE;
 		zthr_wakeup(spa->spa_raidz_expand_zthr);
 	}
 	mutex_exit(&vre->vre_lock);
 }
 
 /*
  * Check whether a raidz vdev can be expanded with `new_child`.  Returns
  * EINVAL if one sector per child does not fit in the boot area, 0 otherwise.
  */
 int
 vdev_raidz_attach_check(vdev_t *new_child)
 {
 	vdev_t *raidvd = new_child->vdev_parent;
 
 	/*
 	 * We use the "boot" space as scratch space to handle overwriting the
 	 * initial part of the vdev.  If it is too small, then this expansion
 	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
 	 * >200 children).
 	 */
 	uint64_t row_bytes = raidvd->vdev_children << raidvd->vdev_ashift;
 
 	return (row_bytes > VDEV_BOOT_SIZE ? EINVAL : 0);
 }
 
 /*
  * Sync task run after a new child has been attached to a raidz vdev (`arg`
  * is the new child).  It bumps the RAIDZ_EXPANSION feature refcount and the
  * physical width, initializes the in-memory expansion state, publishes it in
  * spa_raidz_expand and wakes the expansion thread, then persists the
  * expansion state and start time in the top-level vdev ZAP and logs the
  * start of the expansion.
  */
 void
 vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
 {
 	vdev_t *new_child = arg;
 	spa_t *spa = new_child->vdev_spa;
 	vdev_t *raidvd = new_child->vdev_parent;
 	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
 	/* The new child must already be the last child of a top-level raidz. */
 	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
 	ASSERT3P(raidvd->vdev_top, ==, raidvd);
 	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
 	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
 	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
 	    new_child);
 
 	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
 
 	vdrz->vd_physical_width++;
 
 	/* No reflow may already be recorded in the uberblock. */
 	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
 	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
 	vdrz->vn_vre.vre_offset = 0;
 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
 	spa->spa_raidz_expand = &vdrz->vn_vre;
 	zthr_wakeup(spa->spa_raidz_expand_zthr);
 
 	/*
 	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
 	 * written to the config.
 	 */
 	vdev_config_dirty(raidvd);
 
 	vdrz->vn_vre.vre_start_time = gethrestime_sec();
 	vdrz->vn_vre.vre_end_time = 0;
 	vdrz->vn_vre.vre_state = DSS_SCANNING;
 	vdrz->vn_vre.vre_bytes_copied = 0;
 
 	/* Persist the new state and start time in the top-level vdev ZAP. */
 	uint64_t state = vdrz->vn_vre.vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
 	    sizeof (state), 1, &state, tx));
 
 	uint64_t start_time = vdrz->vn_vre.vre_start_time;
 	VERIFY0(zap_update(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
 	    sizeof (start_time), 1, &start_time, tx));
 
 	/*
 	 * Remove the end time and bytes-copied entries; the results are
 	 * deliberately ignored.
 	 */
 	(void) zap_remove(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
 	(void) zap_remove(spa->spa_meta_objset,
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
 
 	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
 	    "%s vdev %llu new width %llu", spa_name(spa),
 	    (unsigned long long)raidvd->vdev_id,
 	    (unsigned long long)raidvd->vdev_children);
 }
 
 /*
  * Load the persisted expansion status of a RAIDZ vdev.
  *
  * Reads the expansion state, start time, end time and bytes-copied values
  * from the top-level vdev ZAP (when the vdev has one) and stores them in
  * the in-core vdev_raidz_expand_t.  An entry that does not exist (ENOENT)
  * keeps its default of DSS_NONE / 0.  Any other lookup error is returned
  * immediately, leaving the in-core state unmodified.
  */
 int
 vdev_raidz_load(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	vdev_raidz_expand_t *vre = &vdrz->vn_vre;
 	uint64_t expand_state = DSS_NONE;
 	uint64_t started = 0;
 	uint64_t ended = 0;
 	uint64_t copied = 0;
 
 	/* ZAP entries to fetch, in lookup order, with their destinations. */
 	const struct {
 		const char *name;
 		uint64_t *valp;
 	} entries[] = {
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, &expand_state },
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, &started },
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, &ended },
 		{ VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, &copied },
 	};
 	const size_t nentries = sizeof (entries) / sizeof (entries[0]);
 
 	/* Nothing is looked up when the vdev has no top-level ZAP. */
 	for (size_t i = 0; vd->vdev_top_zap != 0 && i < nentries; i++) {
 		int rc = zap_lookup(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, entries[i].name,
 		    sizeof (uint64_t), 1, entries[i].valp);
 
 		if (rc != ENOENT && rc != 0)
 			return (rc);
 	}
 
 	/*
 	 * If we are in the middle of expansion, vre_state should have
 	 * already been set by vdev_raidz_init().
 	 */
 	EQUIV(vre->vre_state == DSS_SCANNING, expand_state == DSS_SCANNING);
 	vre->vre_state = (dsl_scan_state_t)expand_state;
 	vre->vre_start_time = started;
 	vre->vre_end_time = ended;
 	vre->vre_bytes_copied = copied;
 
 	return (0);
 }
 
 /*
  * Fill in 'pres' with the status of the pool's RAIDZ expansion.
  *
  * If an expansion is currently registered in spa_raidz_expand, its status
  * is reported.  Otherwise the children of the root vdev are searched for
  * the RAIDZ vdev whose expansion finished most recently (largest non-zero
  * vre_end_time).  Returns ENOENT if neither exists, 0 otherwise.
  */
 int
 spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
 {
 	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 
 	if (vre == NULL) {
 		/* no expansion in progress; find most recent completed */
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		/* vdev_children is unsigned; use a matching index type. */
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *cvd = rvd->vdev_child[c];
 			vdev_raidz_t *vdrz;
 
 			if (cvd->vdev_ops != &vdev_raidz_ops)
 				continue;
 
 			vdrz = cvd->vdev_tsd;
 			if (vdrz->vn_vre.vre_end_time != 0 &&
 			    (vre == NULL ||
 			    vdrz->vn_vre.vre_end_time >
 			    vre->vre_end_time)) {
 				vre = &vdrz->vn_vre;
 			}
 		}
 	}
 
 	if (vre == NULL) {
 		return (SET_ERROR(ENOENT));
 	}
 
 	pres->pres_state = vre->vre_state;
 	pres->pres_expanding_vdev = vre->vre_vdev_id;
 
 	/* The amount to reflow is the vdev's currently allocated space. */
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
 	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
 
 	/*
 	 * Progress is the vre_bytes_copied total plus the per-txg
 	 * counters, read together under vre_lock.
 	 */
 	mutex_enter(&vre->vre_lock);
 	pres->pres_reflowed = vre->vre_bytes_copied;
 	for (int i = 0; i < TXG_SIZE; i++)
 		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
 	mutex_exit(&vre->vre_lock);
 
 	pres->pres_start_time = vre->vre_start_time;
 	pres->pres_end_time = vre->vre_end_time;
 	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
 
 	return (0);
 }
 
 /*
  * Initialize private RAIDZ specific fields from the nvlist.
  *
  * Validates the parity level against the pool version, allocates the
  * vdev_raidz_t, and rebuilds the expansion history (one reflow_node_t per
  * entry of ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS).  On success *tsd is set to the
  * new vdev_raidz_t and 0 is returned.  EINVAL is returned if the children
  * array is missing or the parity configuration is not acceptable.
  */
 static int
 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
 {
 	uint_t children;
 	nvlist_t **child;
 	int error = nvlist_lookup_nvlist_array(nv,
 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
 	if (error != 0)
 		return (SET_ERROR(EINVAL));
 
 	uint64_t nparity;
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Previous versions could only support 1 or 2 parity
 		 * devices.
 		 */
 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
 			return (SET_ERROR(EINVAL));
 	} else {
 		/*
 		 * We require the parity to be specified for SPAs that
 		 * support multiple parity levels.
 		 */
 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 			return (SET_ERROR(EINVAL));
 
 		/*
 		 * Otherwise, we default to 1 parity device for RAID-Z.
 		 */
 		nparity = 1;
 	}
 
 	/*
 	 * Start from sentinel values; vre_vdev_id is replaced below if the
 	 * config carries an ID.
 	 */
 	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
 	vdrz->vn_vre.vre_vdev_id = -1;
 	vdrz->vn_vre.vre_offset = UINT64_MAX;
 	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
 	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
 	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
 	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
 	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
 
 	vdrz->vd_physical_width = children;
 	vdrz->vd_nparity = nparity;
 
 	/* note, the ID does not exist when creating a pool */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
 	    &vdrz->vn_vre.vre_vdev_id);
 
 	/*
 	 * The presence of ZPOOL_CONFIG_RAIDZ_EXPANDING means an expansion
 	 * of this vdev was in progress when the config was written.
 	 */
 	boolean_t reflow_in_progress =
 	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	if (reflow_in_progress) {
 		spa->spa_raidz_expand = &vdrz->vn_vre;
 		vdrz->vn_vre.vre_state = DSS_SCANNING;
 	}
 
 	/*
 	 * Without any recorded expansions the original width is simply the
 	 * current child count; it is adjusted below otherwise.
 	 */
 	vdrz->vd_original_width = children;
 	uint64_t *txgs;
 	unsigned int txgs_size = 0;
 	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 	    &txgs, &txgs_size);
 	if (error == 0) {
 		/*
 		 * txgs[] is consumed from its last entry to its first: the
 		 * i-th entry from the end is paired with a logical width of
 		 * (physical width - i), or one less than that while a
 		 * reflow is still in progress.
 		 *
 		 * NOTE(review): txgs_size is not checked against 'children'
 		 * here; presumably the config is trusted -- confirm.
 		 */
 		for (int i = 0; i < txgs_size; i++) {
 			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
 			re->re_txg = txgs[txgs_size - i - 1];
 			re->re_logical_width = vdrz->vd_physical_width - i;
 
 			if (reflow_in_progress)
 				re->re_logical_width--;
 
 			avl_add(&vdrz->vd_expand_txgs, re);
 		}
 
 		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
 	}
 	if (reflow_in_progress) {
 		/* The child being reflowed onto is not part of the original. */
 		vdrz->vd_original_width--;
 		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
 		    children, txgs_size);
 	}
 
 	*tsd = vdrz;
 
 	return (0);
 }
 
 /*
  * Tear down the private RAIDZ state created by vdev_raidz_init(): clear
  * the pool's expansion pointer if it refers to this vdev, free every
  * reflow_node_t in the expansion tree, and destroy the locks, condition
  * variable and range lock before freeing the vdev_raidz_t itself.
  */
 static void
 vdev_raidz_fini(vdev_t *vd)
 {
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t *expand_txgs = &vdrz->vd_expand_txgs;
 	void *cookie = NULL;
 	reflow_node_t *node;
 
 	/* Don't leave the pool pointing at state that is about to go away. */
 	if (spa->spa_raidz_expand == &vdrz->vn_vre)
 		spa->spa_raidz_expand = NULL;
 
 	for (node = avl_destroy_nodes(expand_txgs, &cookie); node != NULL;
 	    node = avl_destroy_nodes(expand_txgs, &cookie)) {
 		kmem_free(node, sizeof (*node));
 	}
 	avl_destroy(expand_txgs);
 
 	mutex_destroy(&vdrz->vd_expand_lock);
 	mutex_destroy(&vdrz->vn_vre.vre_lock);
 	cv_destroy(&vdrz->vn_vre.vre_cv);
 	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
 
 	kmem_free(vdrz, sizeof (*vdrz));
 }
 
 /*
  * Add RAIDZ specific fields to the config nvlist.
  *
  * ZPOOL_CONFIG_NPARITY is always added.  ZPOOL_CONFIG_RAIDZ_EXPANDING is
  * added while vre_state is DSS_SCANNING, and
  * ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS is added (in AVL traversal order) when
  * the expansion tree is not empty.
  */
 static void
 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
 {
 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
 	vdev_raidz_t *vdrz = vd->vdev_tsd;
 
 	/*
 	 * Make sure someone hasn't managed to sneak a fancy new vdev
 	 * into a crufty old storage pool.
 	 */
 	ASSERT(vdrz->vd_nparity == 1 ||
 	    (vdrz->vd_nparity <= 2 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
 	    (vdrz->vd_nparity <= 3 &&
 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
 
 	/*
 	 * Note that we'll add these even on storage pools where they
 	 * aren't strictly required -- older software will just ignore
 	 * it.
 	 */
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
 
 	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
 		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
 	}
 
 	/* Copy the txgs out of the tree while holding vd_expand_lock. */
 	mutex_enter(&vdrz->vd_expand_lock);
 	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
 		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
 		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
 		    KM_SLEEP);
 		uint64_t i = 0;
 
 		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
 		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
 			txgs[i++] = re->re_txg;
 		}
 
 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
 		    txgs, count);
 
 		/* The temporary array is freed once it has been added. */
 		kmem_free(txgs, sizeof (uint64_t) * count);
 	}
 	mutex_exit(&vdrz->vd_expand_lock);
 }
 
 /*
  * Return the parity level stored in the vdev's private RAIDZ state.
  */
 static uint64_t
 vdev_raidz_nparity(vdev_t *vd)
 {
 	const vdev_raidz_t *raidz = vd->vdev_tsd;
 
 	return (raidz->vd_nparity);
 }
 
 /*
  * Return the number of disks in the RAIDZ vdev, i.e. its child count.
  */
 static uint64_t
 vdev_raidz_ndisks(vdev_t *vd)
 {
 	uint64_t nchildren = vd->vdev_children;
 
 	return (nchildren);
 }
 
 /*
  * Operations vector for RAIDZ vdevs.  Entries set to NULL are operations
  * for which RAIDZ supplies no implementation.
  */
 vdev_ops_t vdev_raidz_ops = {
 	.vdev_op_init = vdev_raidz_init,
 	.vdev_op_fini = vdev_raidz_fini,
 	.vdev_op_open = vdev_raidz_open,
 	.vdev_op_close = vdev_raidz_close,
-	.vdev_op_asize = vdev_raidz_asize,
+	.vdev_op_psize_to_asize = vdev_raidz_psize_to_asize,
+	.vdev_op_asize_to_psize = vdev_raidz_asize_to_psize,
 	.vdev_op_min_asize = vdev_raidz_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = vdev_raidz_io_start,
 	.vdev_op_io_done = vdev_raidz_io_done,
 	.vdev_op_state_change = vdev_raidz_state_change,
 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_raidz_xlate,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = vdev_raidz_config_generate,
 	.vdev_op_nparity = vdev_raidz_nparity,
 	.vdev_op_ndisks = vdev_raidz_ndisks,
 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
 };
 
 /* Module parameters for the RAIDZ expansion tunables. */
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
 	"For testing, pause RAIDZ expansion after reflowing this many bytes");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
 	"Max amount of concurrent i/o for RAIDZ expansion");
 ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
 	"For expanded RAIDZ, aggregate reads that have more rows than this");
 ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
 	"For expanded RAIDZ, automatically start a pool scrub when expansion "
 	"completes");
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
index ea6f86993088..21cb57e38b12 100644
--- a/module/zfs/vdev_rebuild.c
+++ b/module/zfs/vdev_rebuild.c
@@ -1,1182 +1,1182 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  *
  * Copyright (c) 2018, Intel Corporation.
  * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2024 by Delphix. All rights reserved.
  */
 
 #include <sys/vdev_impl.h>
 #include <sys/vdev_draid.h>
 #include <sys/dsl_scan.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/zio.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/arc_impl.h>
 #include <sys/zap.h>
 
 /*
  * This file contains the sequential reconstruction implementation for
  * resilvering.  This form of resilvering is internally referred to as device
  * rebuild to avoid conflating it with the traditional healing reconstruction
  * performed by the dsl scan code.
  *
  * When replacing a device, or scrubbing the pool, ZFS has historically used
  * a process called resilvering which is a form of healing reconstruction.
  * This approach has the advantage that as blocks are read from disk their
  * checksums can be immediately verified and the data repaired.  Unfortunately,
  * it also results in a random IO pattern to the disk even when extra care
  * is taken to sequentialize the IO as much as possible.  This substantially
  * increases the time required to resilver the pool and restore redundancy.
  *
  * For mirrored devices it's possible to implement an alternate sequential
  * reconstruction strategy when resilvering.  Sequential reconstruction
  * behaves like a traditional RAID rebuild and reconstructs a device in LBA
  * order without verifying the checksum.  After this phase completes a second
  * scrub phase is started to verify all of the checksums.  This two phase
  * process will take longer than the healing reconstruction described above.
  * However, it has the advantage that after the first reconstruction phase
  * completes redundancy has been restored.  At this point the pool can incur
  * another device failure without risking data loss.
  *
  * There are a few noteworthy limitations and other advantages of resilvering
  * using sequential reconstruction vs healing reconstruction.
  *
  * Limitations:
  *
  *   - Sequential reconstruction is not possible on RAIDZ due to its
  *     variable stripe width.  Note dRAID uses a fixed stripe width which
  *     avoids this issue, but comes at the expense of some usable capacity.
  *
  *   - Block checksums are not verified during sequential reconstruction.
  *     Similar to traditional RAID the parity/mirror data is reconstructed
  *     but cannot be immediately double checked.  For this reason when the
  *     last active resilver completes the pool is automatically scrubbed
  *     by default.
  *
  *   - Deferred resilvers using sequential reconstruction are not currently
  *     supported.  When adding another vdev to an active top-level resilver
  *     it must be restarted.
  *
  * Advantages:
  *
  *   - Sequential reconstruction is performed in LBA order which may be faster
  *     than healing reconstruction particularly when using HDDs (or
  *     especially with SMR devices).  Only allocated capacity is resilvered.
  *
  *   - Sequential reconstruction is not constrained by ZFS block boundaries.
  *     This allows it to issue larger IOs to disk which span multiple blocks
  *     allowing all of these logical blocks to be repaired with a single IO.
  *
  *   - Unlike a healing resilver or scrub which are pool wide operations,
  *     sequential reconstruction is handled by the top-level vdevs.  This
  *     allows for it to be started or canceled on a top-level vdev without
  *     impacting any other top-level vdevs in the pool.
  *
  *   - Data only referenced by a pool checkpoint will be repaired because
  *     that space is reflected in the space maps.  This differs for a
  *     healing resilver or scrub which will not repair that data.
  */
 
 
 /*
  * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
  * SPA_MAXBLOCKSIZE.
  */
 static uint64_t zfs_rebuild_max_segment = 1024 * 1024;
 
 /*
  * Maximum number of bytes per leaf vdev that a sequential resilver may have
  * in flight concurrently.  We attempt to strike a balance here between
  * keeping the vdev queues full of I/Os at all times and not overflowing the
  * queues to cause long latency, which would cause long txg sync times.
  *
  * A large default value can be safely used here because the default target
  * segment size is also large (zfs_rebuild_max_segment=1M).  This helps keep
  * the queue depth short.
  *
  * 64MB was observed to deliver the best performance and set as the default.
  * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c)
  * and a rebuild rate of 1.2GB/s was measured to the distributed spare.
  * Smaller values were unable to fully saturate the available pool I/O.
  */
 static uint64_t zfs_rebuild_vdev_limit = 64 << 20;
 
 /*
  * Automatically start a pool scrub when the last active sequential resilver
  * completes in order to verify the checksums of all blocks which have been
  * resilvered. This option is enabled by default and is strongly recommended.
  */
 static int zfs_rebuild_scrub_enabled = 1;
 
 /*
  * Forward declarations needed by vdev_rebuild_initiate_sync() and
  * vdev_rebuild_reset_sync() (which create the rebuild thread) and by
  * vdev_rebuild_complete_sync() (which may call the reset sync task).
  */
 static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg);
 static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx);
 
 /*
  * Recursively zero vs_rebuild_processed for 'vd' and every vdev below it.
  * Children are handled first; each vdev's counter is reset while holding
  * that vdev's vdev_stat_lock.
  */
 static void
 clear_rebuild_bytes(vdev_t *vd)
 {
 	uint64_t c = 0;
 
 	while (c < vd->vdev_children) {
 		clear_rebuild_bytes(vd->vdev_child[c]);
 		c++;
 	}
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_rebuild_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Decide whether a vdev_rebuild_thread() has to stop: the vdev is no
  * longer writeable, is being removed, or an exit, cancel or reset of the
  * rebuild has been requested.
  */
 static boolean_t
 vdev_rebuild_should_stop(vdev_t *vd)
 {
 	if (!vdev_writeable(vd))
 		return (B_TRUE);
 	if (vd->vdev_removing)
 		return (B_TRUE);
 	if (vd->vdev_rebuild_exit_wanted)
 		return (B_TRUE);
 	if (vd->vdev_rebuild_cancel_wanted)
 		return (B_TRUE);
 	if (vd->vdev_rebuild_reset_wanted)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Report whether the rebuild should be canceled, which is the case once
  * vdev_resilver_needed() says no resilver is needed any more.  This may
  * happen when all vdevs with MISSING DTLs are detached.
  */
 static boolean_t
 vdev_rebuild_should_cancel(vdev_t *vd)
 {
 	vdev_rebuild_phys_t *phys = &vd->vdev_rebuild_config.vr_rebuild_phys;
 	boolean_t needed;
 
 	needed = vdev_resilver_needed(vd, &phys->vrp_min_txg,
 	    &phys->vrp_max_txg);
 
 	return (needed ? B_FALSE : B_TRUE);
 }
 
 /*
  * The sync task for updating the on-disk state of a rebuild.  This is
  * scheduled by vdev_rebuild_range().  'arg' carries the id of the
  * top-level vdev being rebuilt.
  */
 static void
 vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 
 	/*
 	 * Consume the offset recorded for this txg, if any, and reset the
 	 * slot so it can be reused.
 	 */
 	if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
 		vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
 		vr->vr_scan_offset[txg & TXG_MASK] = 0;
 	}
 
 	/* Total scan time = earlier passes + time spent in this pass. */
 	vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
 	    NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);
 
 	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp, tx));
 
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * Initialize the on-disk state for a new rebuild, start the rebuild thread.
  * 'arg' carries the id of the top-level vdev.  The caller must already
  * have set vdev_rebuilding (asserted below).
  */
 static void
 vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	ASSERT(vd->vdev_rebuilding);
 
 	spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	/* Start from an all-zero physical state, then fill in the fields. */
 	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
 	vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
 	vrp->vrp_min_txg = 0;
 	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
 	vrp->vrp_start_time = gethrestime_sec();
 	vrp->vrp_scan_time_ms = 0;
 	vr->vr_prev_scan_time_ms = 0;
 
 	/*
 	 * Rebuilds are currently only used when replacing a device, in which
 	 * case there must be DTL_MISSING entries.  In the future, we could
 	 * allow rebuilds to be used in a way similar to a scrub.  This would
 	 * be useful because it would allow us to rebuild the space used by
 	 * pool checkpoints.
 	 */
 	VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
 
 	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp, tx));
 
 	spa_history_log_internal(spa, "rebuild", tx,
 	    "vdev_id=%llu vdev_guid=%llu started",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
 
 	/* The thread is created while vdev_rebuild_lock is still held. */
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 	vd->vdev_rebuild_thread = thread_create(NULL, 0,
 	    vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * Post the event 'name' for 'vd', attaching an nvlist that tags the
  * resilver type as "sequential".  The nvlist is freed before returning.
  */
 static void
 vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name)
 {
 	nvlist_t *payload;
 
 	payload = fnvlist_alloc();
 	fnvlist_add_string(payload, ZFS_EV_RESILVER_TYPE, "sequential");
 
 	spa_event_notify(spa, vd, payload, name);
 
 	nvlist_free(payload);
 }
 
 /*
  * Called to request that a new rebuild be started.  The feature will remain
  * active for the duration of the rebuild, then revert to the enabled state.
  *
  * 'vd' must be a top-level vdev that is not already rebuilding, and the
  * caller must hold vdev_rebuild_lock (all asserted below).
  */
 static void
 vdev_rebuild_initiate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
 	ASSERT(!vd->vdev_rebuilding);
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
 
 	/*
 	 * Mark the vdev as rebuilding before the sync task is queued;
 	 * vdev_rebuild_initiate_sync() asserts that the flag is set.
 	 */
 	vd->vdev_rebuilding = B_TRUE;
 
 	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
 	    (void *)(uintptr_t)vd->vdev_id, tx);
 	dmu_tx_commit(tx);
 
 	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
 }
 
 /*
  * Update the on-disk state to completed when a rebuild finishes.
  * 'arg' carries the id of the top-level vdev.  If a reset was requested in
  * the meantime, the rebuild is restarted via vdev_rebuild_reset_sync()
  * instead of being marked complete.
  */
 static void
 vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 
 	/*
 	 * Handle a second device failure if it occurs after all rebuild I/O
 	 * has completed but before this sync task has been executed.
 	 */
 	if (vd->vdev_rebuild_reset_wanted) {
 		/* The reset task takes vdev_rebuild_lock itself. */
 		mutex_exit(&vd->vdev_rebuild_lock);
 		vdev_rebuild_reset_sync(arg, tx);
 		return;
 	}
 
 	vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
 	vrp->vrp_end_time = gethrestime_sec();
 
 	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp, tx));
 
 	vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
 	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
 
 	spa_history_log_internal(spa, "rebuild",  tx,
 	    "vdev_id=%llu vdev_guid=%llu complete",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
 	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
 
 	/* Handles detaching of spares */
 	spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
 	vd->vdev_rebuilding = B_FALSE;
 	mutex_exit(&vd->vdev_rebuild_lock);
 
 	/*
 	 * While we're in syncing context take the opportunity to
 	 * setup the scrub when there are no more active rebuilds.
 	 * The scrub is only set up if dsl_scan_setup_check() succeeds and
 	 * zfs_rebuild_scrub_enabled is non-zero.
 	 */
 	setup_sync_arg_t setup_sync_arg = {
 		.func = POOL_SCAN_SCRUB,
 		.txgstart = 0,
 		.txgend = 0,
 	};
 	if (dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0 &&
 	    zfs_rebuild_scrub_enabled) {
 		dsl_scan_setup_sync(&setup_sync_arg, tx);
 	}
 
 	/* Wake anyone waiting on vdev_rebuild_cv. */
 	cv_broadcast(&vd->vdev_rebuild_cv);
 
 	/* Clear recent error events (i.e. duplicate events tracking) */
 	zfs_ereport_clear(spa, NULL);
 }
 
 /*
  * Update the on-disk state to canceled when a rebuild is canceled.
  * 'arg' carries the id of the top-level vdev.  Also drops the
  * SPA_FEATURE_DEVICE_REBUILD refcount, clears the cancel request and the
  * rebuilding flag, and wakes up waiters.
  */
 static void
 vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
 	vrp->vrp_end_time = gethrestime_sec();
 
 	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp, tx));
 
 	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
 
 	spa_history_log_internal(spa, "rebuild",  tx,
 	    "vdev_id=%llu vdev_guid=%llu canceled",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
 	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
 
 	/* The cancel request has now been honored. */
 	vd->vdev_rebuild_cancel_wanted = B_FALSE;
 	vd->vdev_rebuilding = B_FALSE;
 	mutex_exit(&vd->vdev_rebuild_lock);
 
 	spa_notify_waiters(spa);
 	cv_broadcast(&vd->vdev_rebuild_cv);
 }
 
 /*
  * Resets the progress of a running rebuild.  This will occur when a new
  * vdev is added to rebuild.  'arg' carries the id of the top-level vdev.
  *
  * The rebuild must be in the ACTIVE state with no rebuild thread running
  * (asserted below); a new rebuild thread is created before returning.
  */
 static void
 vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 
 	ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
 	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 
 	/* Discard all progress and statistics gathered so far. */
 	vrp->vrp_last_offset = 0;
 	vrp->vrp_min_txg = 0;
 	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
 	vrp->vrp_bytes_scanned = 0;
 	vrp->vrp_bytes_issued = 0;
 	vrp->vrp_bytes_rebuilt = 0;
 	vrp->vrp_bytes_est = 0;
 	vrp->vrp_scan_time_ms = 0;
 	vr->vr_prev_scan_time_ms = 0;
 
 	/* See vdev_rebuild_initiate_sync comment */
 	VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
 
 	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp, tx));
 
 	spa_history_log_internal(spa, "rebuild",  tx,
 	    "vdev_id=%llu vdev_guid=%llu reset",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
 
 	/* The reset request has now been honored. */
 	vd->vdev_rebuild_reset_wanted = B_FALSE;
 	ASSERT(vd->vdev_rebuilding);
 
 	vd->vdev_rebuild_thread = thread_create(NULL, 0,
 	    vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
 
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * Clear the last rebuild status.  'arg' carries the id of the top-level
  * vdev.  This is a no-op if the device rebuild feature is not enabled or
  * if a rebuild is currently active on the vdev.
  */
 void
 vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
 {
 	int vdev_id = (uintptr_t)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	objset_t *mos = spa_meta_objset(spa);
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
 	    vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
 		mutex_exit(&vd->vdev_rebuild_lock);
 		return;
 	}
 
 	/* Reset the per-vdev counters and the in-core physical state. */
 	clear_rebuild_bytes(vd);
 	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
 
 	/* Only rewrite the on-disk entry if one already exists. */
 	if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
 		VERIFY0(zap_update(mos, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 		    REBUILD_PHYS_ENTRIES, vrp, tx));
 	}
 
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * The zio_done_func_t callback for each rebuild I/O issued.  It's responsible
  * for updating the rebuild stats and limiting the number of in flight I/Os.
  * It also frees the I/O's buffer and drops the SCL_STATE_ALL hold that
  * vdev_rebuild_range() took when issuing the I/O.
  */
 static void
 vdev_rebuild_cb(zio_t *zio)
 {
 	vdev_rebuild_t *vr = zio->io_private;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	vdev_t *vd = vr->vr_top_vdev;
 
 	mutex_enter(&vr->vr_io_lock);
 	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
 		/*
 		 * The I/O failed because the top-level vdev was unavailable.
 		 * Attempt to roll back to the last completed offset, in order
 		 * to resume from the correct location if the pool is resumed.
 		 * (This works because spa_sync waits on spa_txg_zio before
 		 * it runs sync tasks.)
 		 */
 		uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
 		*off = MIN(*off, zio->io_offset);
 	} else if (zio->io_error) {
 		/* Any other failure is only counted. */
 		vrp->vrp_errors++;
 	}
 
 	abd_free(zio->io_abd);
 
 	/* Return the bytes to the budget and wake any throttled issuer. */
 	ASSERT3U(vr->vr_bytes_inflight, >, 0);
 	vr->vr_bytes_inflight -= zio->io_size;
 	cv_broadcast(&vr->vr_io_cv);
 	mutex_exit(&vr->vr_io_lock);
 
 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 }
 
 /*
  * Initialize a block pointer that can be used to read the given segment
  * for sequential rebuild.
  *
  * 'vd' must be a dRAID, mirror, replacing or spare vdev (asserted below).
  * The DVA covers 'asize' bytes at offset 'start' on that vdev.  The
  * logical and physical sizes are equal to 'asize', except on dRAID where
  * they are derived from it by vdev_draid_asize_to_psize().  Compression
  * and checksumming are turned off in the block pointer.
  */
 static void
 vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
     uint64_t asize)
 {
 	ASSERT(vd->vdev_ops == &vdev_draid_ops ||
 	    vd->vdev_ops == &vdev_mirror_ops ||
 	    vd->vdev_ops == &vdev_replacing_ops ||
 	    vd->vdev_ops == &vdev_spare_ops);
 
 	uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
-	    vdev_draid_asize_to_psize(vd, asize) : asize;
+	    vdev_draid_asize_to_psize(vd, asize, 0) : asize;
 
 	BP_ZERO(bp);
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], start);
 	DVA_SET_GANG(&bp->blk_dva[0], 0);
 	DVA_SET_ASIZE(&bp->blk_dva[0], asize);
 
 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
 	BP_SET_LSIZE(bp, psize);
 	BP_SET_PSIZE(bp, psize);
 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 }
 
 /*
  * Issues a rebuild I/O and takes care of rate limiting the number of queued
  * rebuild I/Os.  The provided start and size must be properly aligned for the
  * top-level vdev type being rebuilt.
  *
  * The range must lie entirely within the metaslab currently being scanned
  * (asserted below).  Returns 0 if the I/O was issued or the range was
  * skipped because it does not need resilvering, or EINTR if the rebuild
  * should stop.
  */
 static int
 vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
 {
 	uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
 	vdev_t *vd = vr->vr_top_vdev;
 	spa_t *spa = vd->vdev_spa;
 	blkptr_t blk;
 
 	ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
 	ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
 
 	vr->vr_pass_bytes_scanned += size;
 	vr->vr_rebuild_phys.vrp_bytes_scanned += size;
 
 	/*
 	 * Rebuild the data in this range by constructing a special block
 	 * pointer.  It has no relation to any existing blocks in the pool.
 	 * However, by disabling checksum verification and issuing a scrub IO
 	 * we can reconstruct and repair any children with missing data.
 	 */
 	vdev_rebuild_blkptr_init(&blk, vd, start, size);
 	uint64_t psize = BP_GET_PSIZE(&blk);
 
 	/* Ranges that need no resilvering are only counted as skipped. */
 	if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
 		vr->vr_pass_bytes_skipped += size;
 		return (0);
 	}
 
 	mutex_enter(&vr->vr_io_lock);
 
 	/* Limit in flight rebuild I/Os */
 	while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
 		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
 
 	/* Charged here; released in vdev_rebuild_cb() or on early exit. */
 	vr->vr_bytes_inflight += psize;
 	mutex_exit(&vr->vr_io_lock);
 
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * This config hold is kept across the I/O; on the success path it
 	 * is dropped by vdev_rebuild_cb() when the zio completes.
 	 */
 	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
 	mutex_enter(&vd->vdev_rebuild_lock);
 
 	/* This is the first I/O for this txg. */
 	if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
 		vr->vr_scan_offset[txg & TXG_MASK] = start;
 		dsl_sync_task_nowait(spa_get_dsl(spa),
 		    vdev_rebuild_update_sync,
 		    (void *)(uintptr_t)vd->vdev_id, tx);
 	}
 
 	/* When exiting write out our progress. */
 	if (vdev_rebuild_should_stop(vd)) {
 		/* Undo the in-flight charge and the config hold taken above. */
 		mutex_enter(&vr->vr_io_lock);
 		vr->vr_bytes_inflight -= psize;
 		mutex_exit(&vr->vr_io_lock);
 		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
 		mutex_exit(&vd->vdev_rebuild_lock);
 		dmu_tx_commit(tx);
 		return (SET_ERROR(EINTR));
 	}
 	mutex_exit(&vd->vdev_rebuild_lock);
 	dmu_tx_commit(tx);
 
 	/*
 	 * Record how far this txg's I/O extends; vdev_rebuild_update_sync()
 	 * copies this value into vrp_last_offset.
 	 */
 	vr->vr_scan_offset[txg & TXG_MASK] = start + size;
 	vr->vr_pass_bytes_issued += size;
 	vr->vr_rebuild_phys.vrp_bytes_issued += size;
 
 	zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
 	    abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
 	    ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_RESILVER, NULL));
 
 	return (0);
 }
 
 /*
  * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
  *
  * Each segment of vr->vr_scan_tree is split into chunks whose size is
  * chosen by the vdev's vdev_op_rebuild_asize() callback, and every chunk
  * is passed to vdev_rebuild_range().  Returns 0 once all segments have
  * been processed, or the first error returned by vdev_rebuild_range().
  */
 static int
 vdev_rebuild_ranges(vdev_rebuild_t *vr)
 {
 	vdev_t *vd = vr->vr_top_vdev;
 	zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
 	zfs_btree_index_t idx;
 	int error;
 
 	for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
 	    rs = zfs_btree_next(t, &idx, &idx)) {
 		uint64_t start = zfs_rs_get_start(rs, vr->vr_scan_tree);
 		uint64_t size = zfs_rs_get_end(rs, vr->vr_scan_tree) - start;
 
 		/*
 		 * zfs_scan_suspend_progress can be set to disable rebuild
 		 * progress for testing.  See comment in dsl_scan_sync().
 		 */
 		while (zfs_scan_suspend_progress &&
 		    !vdev_rebuild_should_stop(vd)) {
 			delay(hz);
 		}
 
 		while (size > 0) {
 			uint64_t chunk_size;
 
 			/*
 			 * Split range into legally-sized logical chunks
 			 * given the constraints of the top-level vdev
 			 * being rebuilt (dRAID or mirror).
 			 */
 			ASSERT3P(vd->vdev_ops, !=, NULL);
 			chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
 			    start, size, zfs_rebuild_max_segment);
 
 			error = vdev_rebuild_range(vr, start, chunk_size);
 			if (error != 0)
 				return (error);
 
 			/* Advance past the chunk that was just handled. */
 			size -= chunk_size;
 			start += chunk_size;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Refreshes vrp_bytes_est, the estimated capacity the rebuild must cover.
  * Because the pool is traversed in metaslab order, only allocated capacity
  * beyond vrp_last_offset needs to be added; lower offsets must already
  * have been rebuilt and are therefore part of vrp_bytes_scanned.  The
  * estimate is left untouched when vrp_last_offset is below the start of
  * metaslab ms_id.
  */
 static void
 vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id)
 {
 	vdev_rebuild_phys_t *vrp = &vd->vdev_rebuild_config.vr_rebuild_phys;
 	uint64_t total = vrp->vrp_bytes_scanned;
 
 	if (vd->vdev_ms[ms_id]->ms_start > vrp->vrp_last_offset)
 		return;
 
 	/* Sum the allocated space of metaslab ms_id and all later ones. */
 	for (uint64_t m = ms_id; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		mutex_enter(&msp->ms_lock);
 		total += metaslab_allocated_space(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
 	vrp->vrp_bytes_est = total;
 }
 
 /*
  * Load from disk the top-level vdev's rebuild information, stored under
  * VDEV_TOP_ZAP_VDEV_REBUILD_PHYS in the vdev's top-level ZAP.
  *
  * Returns ENOTSUP when the device_rebuild feature is not enabled, the
  * zap_lookup() error for any failure other than ENOENT, EOVERFLOW or
  * ECKSUM, and 0 otherwise.  vdev_rebuilding is always cleared.
  */
 int
 vdev_rebuild_load(vdev_t *vd)
 {
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	spa_t *spa = vd->vdev_spa;
 	int err;
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	vd->vdev_rebuilding = B_FALSE;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
 		memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
 		err = SET_ERROR(ENOTSUP);
 		goto out;
 	}
 
 	ASSERT(vd->vdev_top == vd);
 
 	err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 	    REBUILD_PHYS_ENTRIES, vrp);
 
 	if (err != 0) {
 		/* Any other lookup failure is reported to the caller. */
 		if (err != ENOENT && err != EOVERFLOW && err != ECKSUM)
 			goto out;
 
 		/*
 		 * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS must
 		 * not prevent a pool from being imported.  Clear the rebuild
 		 * status so a new resilver/rebuild can be started.
 		 */
 		memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
 		err = 0;
 	}
 
 	vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
 	vr->vr_top_vdev = vd;
 
 out:
 	mutex_exit(&vd->vdev_rebuild_lock);
 
 	return (err);
 }
 
 /*
  * Each scan thread is responsible for rebuilding a top-level vdev.  The
  * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
  *
  * 'arg' is the top-level vdev_t to rebuild.  The thread walks the vdev's
  * metaslabs in order, issues rebuild I/O for their allocated ranges, waits
  * for in-flight I/O to drain and then schedules the sync task matching the
  * outcome (complete, cancel or reset) or marks the rebuild as suspended.
  * It never returns: it clears vd->vdev_rebuild_thread, broadcasts
  * vdev_rebuild_cv and calls thread_exit().
  */
 static __attribute__((noreturn)) void
 vdev_rebuild_thread(void *arg)
 {
 	vdev_t *vd = arg;
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int error = 0;
 
 	/*
 	 * If there's a scrub in process request that it be stopped.  This
 	 * is not required for a correct rebuild, but we do want rebuilds to
 	 * emulate the resilver behavior as much as possible.
 	 */
 	dsl_pool_t *dsl = spa_get_dsl(spa);
 	if (dsl_scan_scrubbing(dsl))
 		dsl_scan_cancel(dsl);
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	mutex_enter(&vd->vdev_rebuild_lock);
 
 	ASSERT3P(vd->vdev_top, ==, vd);
 	ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
 	ASSERT(vd->vdev_rebuilding);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
 	ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
 
 	/* Per-pass in-core state: scan tree, I/O lock/cv and pass counters. */
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 	vr->vr_top_vdev = vd;
 	vr->vr_scan_msp = NULL;
 	vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL,
 	    0, 0);
 	mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
 
 	vr->vr_pass_start_time = gethrtime();
 	vr->vr_pass_bytes_scanned = 0;
 	vr->vr_pass_bytes_issued = 0;
 	vr->vr_pass_bytes_skipped = 0;
 
 	uint64_t update_est_time = gethrtime();
 	vdev_rebuild_update_bytes_est(vd, 0);
 
 	clear_rebuild_bytes(vr->vr_top_vdev);
 
 	mutex_exit(&vd->vdev_rebuild_lock);
 
 	/*
 	 * Systematically walk the metaslabs and issue rebuild I/Os for
 	 * all ranges in the allocated space map.
 	 */
 	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
 		metaslab_t *msp = vd->vdev_ms[i];
 		vr->vr_scan_msp = msp;
 
 		/*
 		 * Calculate the max number of in-flight bytes for top-level
 		 * vdev scanning operations (minimum 1MB, maximum 1/2 of
 		 * arc_c_max shared by all top-level vdevs).  Limits for the
 		 * issuing phase are done per top-level vdev and are handled
 		 * separately.
 		 */
 		uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1);
 		vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
 		    zfs_rebuild_vdev_limit * vd->vdev_children));
 
 		/*
 		 * Removal of vdevs from the vdev tree may eliminate the need
 		 * for the rebuild, in which case it should be canceled.  The
 		 * vdev_rebuild_cancel_wanted flag is set until the sync task
 		 * completes.  This may be after the rebuild thread exits.
 		 */
 		if (vdev_rebuild_should_cancel(vd)) {
 			vd->vdev_rebuild_cancel_wanted = B_TRUE;
 			error = EINTR;
 			break;
 		}
 
 		ASSERT0(zfs_range_tree_space(vr->vr_scan_tree));
 
 		/* Disable any new allocations to this metaslab */
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		metaslab_disable(msp);
 
 		mutex_enter(&msp->ms_sync_lock);
 		mutex_enter(&msp->ms_lock);
 
 		/*
 		 * If there are outstanding allocations wait for them to be
 		 * synced.  This is needed to ensure all allocated ranges are
 		 * on disk and therefore will be rebuilt.
 		 */
 		for (int j = 0; j < TXG_SIZE; j++) {
 			if (zfs_range_tree_space(msp->ms_allocating[j])) {
 				/* Both locks are dropped across the wait. */
 				mutex_exit(&msp->ms_lock);
 				mutex_exit(&msp->ms_sync_lock);
 				txg_wait_synced(dsl, 0);
 				mutex_enter(&msp->ms_sync_lock);
 				mutex_enter(&msp->ms_lock);
 				break;
 			}
 		}
 
 		/*
 		 * When a metaslab has been allocated from read its allocated
 		 * ranges from the space map object into the vr_scan_tree.
 		 * Then add inflight / unflushed ranges and remove inflight /
 		 * unflushed frees.  This is the minimum range to be rebuilt.
 		 */
 		if (msp->ms_sm != NULL) {
 			VERIFY0(space_map_load(msp->ms_sm,
 			    vr->vr_scan_tree, SM_ALLOC));
 
 			/*
 			 * NOTE(review): this 'i' shadows the outer metaslab
 			 * index for the duration of the loop only.
 			 */
 			for (int i = 0; i < TXG_SIZE; i++) {
 				ASSERT0(zfs_range_tree_space(
 				    msp->ms_allocating[i]));
 			}
 
 			zfs_range_tree_walk(msp->ms_unflushed_allocs,
 			    zfs_range_tree_add, vr->vr_scan_tree);
 			zfs_range_tree_walk(msp->ms_unflushed_frees,
 			    zfs_range_tree_remove, vr->vr_scan_tree);
 
 			/*
 			 * Remove ranges which have already been rebuilt based
 			 * on the last offset.  This can happen when restarting
 			 * a scan after exporting and re-importing the pool.
 			 */
 			zfs_range_tree_clear(vr->vr_scan_tree, 0,
 			    vrp->vrp_last_offset);
 		}
 
 		mutex_exit(&msp->ms_lock);
 		mutex_exit(&msp->ms_sync_lock);
 
 		/*
 		 * To provide an accurate estimate re-calculate the estimated
 		 * size every 5 minutes to account for recent allocations and
 		 * frees made to space maps which have not yet been rebuilt.
 		 */
 		if (gethrtime() > update_est_time + SEC2NSEC(300)) {
 			update_est_time = gethrtime();
 			vdev_rebuild_update_bytes_est(vd, i);
 		}
 
 		/*
 		 * Walk the allocated space map and issue the rebuild I/O.
 		 */
 		error = vdev_rebuild_ranges(vr);
 		zfs_range_tree_vacate(vr->vr_scan_tree, NULL, NULL);
 
 		/*
 		 * SCL_CONFIG is re-taken before the error check so that it
 		 * is held on every exit from the loop.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		metaslab_enable(msp, B_FALSE, B_FALSE);
 
 		if (error != 0)
 			break;
 	}
 
 	zfs_range_tree_destroy(vr->vr_scan_tree);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Wait for any remaining rebuild I/O to complete */
 	mutex_enter(&vr->vr_io_lock);
 	while (vr->vr_bytes_inflight > 0)
 		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
 
 	mutex_exit(&vr->vr_io_lock);
 
 	mutex_destroy(&vr->vr_io_lock);
 	cv_destroy(&vr->vr_io_cv);
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	/* Obtained the same way as 'dsl' above: spa_get_dsl(spa). */
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	if (error == 0) {
 		/*
 		 * After a successful rebuild clear the DTLs of all ranges
 		 * which were missing when the rebuild was started.  These
 		 * ranges must have been rebuilt as a consequence of rebuilding
 		 * all allocated space.  Note that unlike a scrub or resilver
 		 * the rebuild operation will reconstruct data only referenced
 		 * by a pool checkpoint.  See the dsl_scan_done() comments.
 		 */
 		dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
 		    (void *)(uintptr_t)vd->vdev_id, tx);
 	} else if (vd->vdev_rebuild_cancel_wanted) {
 		/*
 		 * The rebuild operation was canceled.  This will occur when
 		 * a device participating in the rebuild is detached.
 		 */
 		dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
 		    (void *)(uintptr_t)vd->vdev_id, tx);
 	} else if (vd->vdev_rebuild_reset_wanted) {
 		/*
 		 * Reset the running rebuild without canceling and restarting
 		 * it.  This will occur when a new device is attached and must
 		 * participate in the rebuild.
 		 */
 		dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
 		    (void *)(uintptr_t)vd->vdev_id, tx);
 	} else {
 		/*
 		 * The rebuild operation should be suspended.  This may occur
 		 * when detaching a child vdev or when exporting the pool.  The
 		 * rebuild is left in the active state so it will be resumed.
 		 */
 		ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
 		vd->vdev_rebuilding = B_FALSE;
 	}
 
 	dmu_tx_commit(tx);
 
 	vd->vdev_rebuild_thread = NULL;
 	mutex_exit(&vd->vdev_rebuild_lock);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	/* Wake anyone waiting on vdev_rebuild_cv for this thread. */
 	cv_broadcast(&vd->vdev_rebuild_cv);
 
 	thread_exit();
 }
 
 /*
  * Returns B_TRUE if any top-level vdev is rebuilding.  Called on the root
  * vdev it checks each child in turn and stops at the first active one.
  * Called on a vdev with a top-level ZAP it reports whether that vdev's
  * rebuild state is VDEV_REBUILD_ACTIVE.  Any other vdev yields B_FALSE.
  */
 boolean_t
 vdev_rebuild_active(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd == spa->spa_root_vdev) {
 		for (uint64_t c = 0; c < vd->vdev_children; c++) {
 			if (vdev_rebuild_active(vd->vdev_child[c]))
 				return (B_TRUE);
 		}
 		return (B_FALSE);
 	}
 
 	if (vd->vdev_top_zap == 0)
 		return (B_FALSE);
 
 	vdev_rebuild_phys_t *vrp = &vd->vdev_rebuild_config.vr_rebuild_phys;
 	boolean_t active;
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	active = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
 	mutex_exit(&vd->vdev_rebuild_lock);
 
 	return (active);
 }
 
 /*
  * Start a rebuild operation on the given top-level vdev.  If the vdev is
  * already rebuilding, the running rebuild is asked to restart instead of
  * a new one being initiated.
  */
 void
 vdev_rebuild(vdev_t *vd)
 {
 	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys;
 
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(!vd->vdev_removing);
 	ASSERT(spa_feature_is_enabled(vd->vdev_spa,
 	    SPA_FEATURE_DEVICE_REBUILD));
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	if (!vd->vdev_rebuilding) {
 		vdev_rebuild_initiate(vd);
 	} else {
 		ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE);
 
 		/*
 		 * A new device was attached, so tell the running rebuild to
 		 * restart from the beginning.  vdev_rebuild_reset_wanted
 		 * stays set until the sync task completes, which may be
 		 * after the rebuild thread exits.
 		 */
 		if (!vd->vdev_rebuild_reset_wanted)
 			vd->vdev_rebuild_reset_wanted = B_TRUE;
 	}
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * Recursive worker for vdev_rebuild_restart().  For the root vdev every
  * child is visited.  For a vdev with a top-level ZAP a rebuild thread is
  * created when the on-disk state is VDEV_REBUILD_ACTIVE, the vdev is
  * writeable and no rebuild is currently running on it.
  */
 static void
 vdev_rebuild_restart_impl(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	if (vd == spa->spa_root_vdev) {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			vdev_rebuild_restart_impl(vd->vdev_child[c]);
 		return;
 	}
 
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	vdev_rebuild_phys_t *vrp = &vd->vdev_rebuild_config.vr_rebuild_phys;
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE &&
 	    vdev_writeable(vd) && !vd->vdev_rebuilding) {
 		ASSERT(spa_feature_is_active(spa,
 		    SPA_FEATURE_DEVICE_REBUILD));
 		vd->vdev_rebuilding = B_TRUE;
 		vd->vdev_rebuild_thread = thread_create(NULL, 0,
 		    vdev_rebuild_thread, vd, 0, &p0, TS_RUN,
 		    maxclsyspri);
 	}
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * Conditionally restart all of the vdev_rebuild_thread's for a pool.  A
  * thread is only started for a top-level vdev whose rebuild is already in
  * the active state (with the feature flag active), so this cannot be used
  * to start a new rebuild.  The caller must hold spa_namespace_lock or be
  * the pool's load thread.
  */
 void
 vdev_rebuild_restart(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_load_thread == curthread);
 
 	vdev_rebuild_restart_impl(rvd);
 }
 
 /*
  * Stop and wait for all of the vdev_rebuild_thread's associated with the
  * vdev tree provided to be terminated (canceled or stopped).  The caller
  * must hold spa_namespace_lock or be the pool's export thread.
  */
 void
 vdev_rebuild_stop_wait(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
 	    spa->spa_export_thread == curthread);
 
 	if (vd == spa->spa_root_vdev) {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			vdev_rebuild_stop_wait(vd->vdev_child[c]);
 		return;
 	}
 
 	if (vd->vdev_top_zap == 0)
 		return;
 
 	ASSERT(vd == vd->vdev_top);
 
 	mutex_enter(&vd->vdev_rebuild_lock);
 	if (vd->vdev_rebuild_thread != NULL) {
 		/* Request the exit, then sleep until rebuilding clears. */
 		vd->vdev_rebuild_exit_wanted = B_TRUE;
 		while (vd->vdev_rebuilding) {
 			cv_wait(&vd->vdev_rebuild_cv,
 			    &vd->vdev_rebuild_lock);
 		}
 		vd->vdev_rebuild_exit_wanted = B_FALSE;
 	}
 	mutex_exit(&vd->vdev_rebuild_lock);
 }
 
 /*
  * Stop every rebuild operation in the pool by running
  * vdev_rebuild_stop_wait() from the root vdev.  The rebuilds are left in
  * the active state so they will be resumed when importing the pool.
  */
 void
 vdev_rebuild_stop_all(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	vdev_rebuild_stop_wait(rvd);
 }
 
 /*
  * Rebuild statistics reported per top-level vdev.
  *
  * Returns ENOTSUP when the device_rebuild feature is not enabled, EINVAL
  * when tvd is not a top-level vdev with a top-level ZAP, and otherwise
  * the zap_contains() result.  A missing VDEV_TOP_ZAP_VDEV_REBUILD_PHYS
  * entry is not an error: vrs is zeroed, its state set to
  * VDEV_REBUILD_NONE, and 0 is returned.
  */
 int
 vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
 {
 	spa_t *spa = tvd->vdev_spa;
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
 		return (SET_ERROR(ENOTSUP));
 
 	if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0)
 		return (SET_ERROR(EINVAL));
 
 	int error = zap_contains(spa_meta_objset(spa),
 	    tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);
 
 	if (error == ENOENT) {
 		memset(vrs, 0, sizeof (vdev_rebuild_stat_t));
 		vrs->vrs_state = VDEV_REBUILD_NONE;
 		return (0);
 	}
 
 	if (error != 0)
 		return (error);
 
 	vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
 	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
 
 	/* Copy the stats out under vdev_rebuild_lock. */
 	mutex_enter(&tvd->vdev_rebuild_lock);
 	vrs->vrs_state = vrp->vrp_rebuild_state;
 	vrs->vrs_start_time = vrp->vrp_start_time;
 	vrs->vrs_end_time = vrp->vrp_end_time;
 	vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms;
 	vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned;
 	vrs->vrs_bytes_issued = vrp->vrp_bytes_issued;
 	vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt;
 	vrs->vrs_bytes_est = vrp->vrp_bytes_est;
 	vrs->vrs_errors = vrp->vrp_errors;
 	vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() -
 	    vr->vr_pass_start_time);
 	vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
 	vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
 	vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
 	mutex_exit(&tvd->vdev_rebuild_lock);
 
 	return (0);
 }
 
 /*
  * Module parameters for the rebuild tunables.  rebuild_max_segment and
  * rebuild_vdev_limit presumably map onto the zfs_rebuild_max_segment and
  * zfs_rebuild_vdev_limit variables used by the rebuild thread above
  * (prefix + name) -- confirm against the ZFS_MODULE_PARAM definition.
  * The trailing string is the parameter description.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW,
 	"Max segment size in bytes of rebuild reads");
 
 ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW,
 	"Max bytes in flight per leaf vdev for sequential resilvers");
 
 ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
 	"Automatically scrub after sequential resilver completes");
diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c
index 8f6e49f25e1e..21a81d6d25b9 100644
--- a/module/zfs/vdev_root.c
+++ b/module/zfs/vdev_root.c
@@ -1,168 +1,169 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 
 /*
  * Virtual device vector for the pool's root vdev.
  */
 
 /*
  * Count the children of vd that are neither holes, log devices nor
  * indirect vdevs.
  */
 static uint64_t
 vdev_root_core_tvds(vdev_t *vd)
 {
 	uint64_t count = 0;
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
 		vdev_t *child = vd->vdev_child[i];
 
 		if (child->vdev_ishole || child->vdev_islog)
 			continue;
 		if (child->vdev_ops == &vdev_indirect_ops)
 			continue;
 
 		count++;
 	}
 
 	return (count);
 }
 
 /*
  * Decide whether numerrors failed top-level vdevs is more than the root
  * can tolerate.  No errors is always acceptable.  Losing every core
  * top-level vdev never is.  Otherwise the limit is whatever
  * spa_missing_tvds_allowed() reports for the pool.
  *
  * We should be able to tolerate one failure with absolutely no damage
  * to our metadata.  Two failures will take out space maps, a bunch of
  * indirect block trees, meta dnodes, dnodes, etc.  Probably not a happy
  * place to live.  When we get smarter, we can liberalize this policy.
  * e.g. If we haven't lost two consecutive top-level vdevs, then we are
  * probably fine.  Adding bean counters during alloc/free can make this
  * future guesswork more accurate.
  */
 static boolean_t
 too_many_errors(vdev_t *vd, uint64_t numerrors)
 {
 	if (numerrors == 0)
 		return (B_FALSE);
 
 	uint64_t core_tvds = vdev_root_core_tvds(vd);
 	ASSERT3U(numerrors, <=, core_tvds);
 
 	return (numerrors == core_tvds ||
 	    numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
 }
 
 /*
  * Open the root vdev by opening all of its children.  Open errors on
  * children that are neither log devices nor indirect vdevs are counted.
  * While the pool is being loaded that count is recorded via
  * spa_set_missing_tvds().  Returns EINVAL when the root has no children,
  * the last child open error when too_many_errors() says the failures
  * cannot be tolerated, and 0 otherwise with all four size/shift outputs
  * set to zero.
  */
 static int
 vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
     uint64_t *ashift, uint64_t *pshift)
 {
 	spa_t *spa = vd->vdev_spa;
 	int failed = 0;
 	int last_err = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		if (cvd->vdev_open_error == 0)
 			continue;
 		if (cvd->vdev_islog || cvd->vdev_ops == &vdev_indirect_ops)
 			continue;
 
 		last_err = cvd->vdev_open_error;
 		failed++;
 	}
 
 	if (spa_load_state(spa) != SPA_LOAD_NONE)
 		spa_set_missing_tvds(spa, failed);
 
 	if (too_many_errors(vd, failed)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 		return (last_err);
 	}
 
 	*asize = 0;
 	*max_asize = 0;
 	*ashift = 0;
 	*pshift = 0;
 
 	return (0);
 }
 
 /*
  * Close the root vdev by closing each of its children in index order.
  */
 static void
 vdev_root_close(vdev_t *vd)
 {
 	int c = 0;
 
 	while (c < vd->vdev_children) {
 		vdev_close(vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Recompute the root vdev's state from the faulted and degraded child
  * counts: CANT_OPEN (no replicas) when too_many_errors() rejects the
  * faulted count, HEALTHY when both counts are zero, DEGRADED otherwise.
  */
 static void
 vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
 {
 	if (too_many_errors(vd, faulted)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 		return;
 	}
 
 	if (degraded == 0 && faulted == 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 		return;
 	}
 
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
 }
 
 /*
  * Operations vector for the root vdev.  Only open, close, the size
  * conversion callbacks, min_asize and state_change are provided; every
  * other callback is NULL.
  */
 vdev_ops_t vdev_root_ops = {
 	.vdev_op_init = NULL,
 	.vdev_op_fini = NULL,
 	.vdev_op_open = vdev_root_open,
 	.vdev_op_close = vdev_root_close,
-	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_psize_to_asize = vdev_default_asize,
+	.vdev_op_asize_to_psize = vdev_default_psize,
 	.vdev_op_min_asize = vdev_default_min_asize,
 	.vdev_op_min_alloc = NULL,
 	.vdev_op_io_start = NULL,	/* not applicable to the root */
 	.vdev_op_io_done = NULL,	/* not applicable to the root */
 	.vdev_op_state_change = vdev_root_state_change,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = NULL,
 	.vdev_op_rele = NULL,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = NULL,
 	.vdev_op_rebuild_asize = NULL,
 	.vdev_op_metaslab_init = NULL,
 	.vdev_op_config_generate = NULL,
 	.vdev_op_nparity = NULL,
 	.vdev_op_ndisks = NULL,
 	.vdev_op_type = VDEV_TYPE_ROOT,	/* name of this vdev type */
 	.vdev_op_leaf = B_FALSE		/* not a leaf vdev */
 };
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index eb08a6eac3ed..1769606ebb8a 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1,5842 +1,5881 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2022 by Delphix. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, 2023, 2024, 2025, Klara, Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2021, Datto, Inc.
  * Copyright (c) 2021, 2024 by George Melikov. All rights reserved.
  */
 
 #include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_trim.h>
 #include <sys/zio_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/dsl_crypt.h>
 #include <cityhash.h>
 
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
 /*
  * Short name for each zio type; the table is sized by ZIO_TYPES and is
  * presumably indexed by zio_type_t -- confirm against its users.
  */
 const char *const zio_type_name[ZIO_TYPES] = {
 	/*
 	 * Note: Linux kernel thread name length is limited
 	 * so these names will differ from upstream open zfs.
 	 */
 	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_flush", "z_trim"
 };
 
 int zio_dva_throttle_enabled = B_TRUE;
 static int zio_deadman_log_all = B_FALSE;
 
 /*
  * ==========================================================================
  * I/O kmem caches
  * ==========================================================================
  */
 static kmem_cache_t *zio_cache;
 static kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 #endif
 
 /* Mark IOs as "slow" if they take longer than 30 seconds */
 static uint_t zio_slow_io_ms = (30 * MILLISEC);
 
 #define	BP_SPANB(indblkshift, level) \
 	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
 #define	COMPARE_META_LEVEL	0x80000000ul
 /*
  * The following actions directly affect the spa's sync-to-convergence logic.
  * The values below define the sync pass when we start performing the action.
  * Care should be taken when changing these values as they directly impact
  * spa_sync() performance. Tuning these values may introduce subtle performance
  * pathologies and should only be done in the context of performance analysis.
  * These tunables will eventually be removed and replaced with #defines once
  * enough analysis has been done to determine optimal values.
  *
  * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
  * regular blocks are not deferred.
  *
  * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
  * compression (including of metadata).  In practice, we don't have this
  * many sync passes, so this has no effect.
  *
  * The original intent was that disabling compression would help the sync
  * passes to converge. However, in practice disabling compression increases
  * the average number of sync passes, because when we turn compression off, a
  * lot of block's size will change and thus we have to re-allocate (not
  * overwrite) them. It also increases the number of 128KB allocations (e.g.
  * for indirect blocks and spacemaps) because these will not be compressed.
  * The 128K allocations are especially detrimental to performance on highly
  * fragmented systems, which may have very few free segments of this size,
  * and may need to load new metaslabs to satisfy 128K allocations.
  */
 
 /* defer frees starting in this pass */
 uint_t zfs_sync_pass_deferred_free = 2;
 
 /* don't compress starting in this pass */
 static uint_t zfs_sync_pass_dont_compress = 8;
 
 /* rewrite new bps starting in this pass */
 static uint_t zfs_sync_pass_rewrite = 2;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
 #define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
 
 /*
  * Enable smaller cores by excluding metadata
  * allocations as well.
  */
 int zio_exclude_metadata = 0;
 static int zio_requeue_io_start_cut_in_line = 1;
 
 #ifdef ZFS_DEBUG
 static const int zio_buf_debug_limit = 16384;
 #else
 static const int zio_buf_debug_limit = 0;
 #endif
 
 /*
  * Named kstat entries published as "zio_stats".  The values are filled in
  * by zio_kstats_update() from the matching ziostat_sums counters.
  */
 typedef struct zio_stats {
 	kstat_named_t ziostat_total_allocations;
 	kstat_named_t ziostat_alloc_class_fallbacks;
 	kstat_named_t ziostat_gang_writes;
 	kstat_named_t ziostat_gang_multilevel;
 } zio_stats_t;
 
 /*
  * Positional initializer: the entries must stay in the same order as the
  * fields of zio_stats_t above.
  */
 static zio_stats_t zio_stats = {
 	{ "total_allocations",	KSTAT_DATA_UINT64 },
 	{ "alloc_class_fallbacks",	KSTAT_DATA_UINT64 },
 	{ "gang_writes",	KSTAT_DATA_UINT64 },
 	{ "gang_multilevel",	KSTAT_DATA_UINT64 },
 };
 
 /*
  * Backing counters, one wmsum_t per kstat entry, using the same field
  * names as zio_stats_t so ZIOSTAT_BUMP() can address them by name.
  */
 struct {
 	wmsum_t ziostat_total_allocations;
 	wmsum_t ziostat_alloc_class_fallbacks;
 	wmsum_t ziostat_gang_writes;
 	wmsum_t ziostat_gang_multilevel;
 } ziostat_sums;
 
 /*
  * Add one to the named ziostat_sums counter.
  * NOTE(review): the expansion already ends in ';', so "ZIOSTAT_BUMP(x);"
  * yields an extra empty statement and the macro cannot be used as the
  * unbraced body of an if/else.  Dropping the ';' needs a check of all
  * callers first.
  */
 #define	ZIOSTAT_BUMP(stat)	wmsum_add(&ziostat_sums.stat, 1);
 
 static kstat_t *zio_ksp;
 
 static inline void __zio_execute(zio_t *zio);
 
 static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 
 /*
  * kstat update callback for "zio_stats".  Writes are rejected with EACCES;
  * on a read each named value is refreshed from its ziostat_sums counter
  * and 0 is returned.
  */
 static int
 zio_kstats_update(kstat_t *ksp, int rw)
 {
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 	zio_stats_t *stats = ksp->ks_data;
 
 	stats->ziostat_total_allocations.value.ui64 =
 	    wmsum_value(&ziostat_sums.ziostat_total_allocations);
 	stats->ziostat_alloc_class_fallbacks.value.ui64 =
 	    wmsum_value(&ziostat_sums.ziostat_alloc_class_fallbacks);
 	stats->ziostat_gang_writes.value.ui64 =
 	    wmsum_value(&ziostat_sums.ziostat_gang_writes);
 	stats->ziostat_gang_multilevel.value.ui64 =
 	    wmsum_value(&ziostat_sums.ziostat_gang_multilevel);
 
 	return (0);
 }
 
 /*
  * One-time setup for the zio layer: create the zio and zio_link kmem
  * caches, initialize the statistics counters and register the "zio_stats"
  * kstat, build the zio_buf/zio_data_buf cache tables, and finally
  * initialize fault injection and lz4.
  */
 void
 zio_init(void)
 {
 	size_t c;
 
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	wmsum_init(&ziostat_sums.ziostat_total_allocations, 0);
 	wmsum_init(&ziostat_sums.ziostat_alloc_class_fallbacks, 0);
 	wmsum_init(&ziostat_sums.ziostat_gang_writes, 0);
 	wmsum_init(&ziostat_sums.ziostat_gang_multilevel, 0);
 	zio_ksp = kstat_create("zfs", 0, "zio_stats",
 	    "misc", KSTAT_TYPE_NAMED, sizeof (zio_stats) /
 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 	/* A failed kstat_create() is tolerated; stats just aren't exported. */
 	if (zio_ksp != NULL) {
 		zio_ksp->ks_data = &zio_stats;
 		zio_ksp->ks_update = zio_kstats_update;
 		kstat_install(zio_ksp);
 	}
 
 	/* Slot c covers buffers of (c + 1) << SPA_MINBLOCKSHIFT bytes. */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t align, cflags, data_cflags;
 		char name[32];
 
 		/*
 		 * Create cache for each half-power of 2 size, starting from
 		 * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
 		 * of ~7/8, sufficient for transient allocations mostly using
 		 * these caches.
 		 */
 		/* Reduce p2 to the highest set bit of size. */
 		size_t p2 = size;
 		while (!ISP2(p2))
 			p2 &= p2 - 1;
 		if (!IS_P2ALIGNED(size, p2 / 2))
 			continue;
 
 #ifndef _KERNEL
 		/*
 		 * If we are using watchpoints, put each buffer on its own page,
 		 * to eliminate the performance overhead of trapping to the
 		 * kernel when modifying a non-watched buffer that shares the
 		 * page with a watched buffer.
 		 */
 		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
 			continue;
 #endif
 
 		if (IS_P2ALIGNED(size, PAGESIZE))
 			align = PAGESIZE;
 		else
 			align = 1 << (highbit64(size ^ (size - 1)) - 1);
 
 		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
 		    KMC_NODEBUG : 0;
 		data_cflags = KMC_NODEBUG;
 		if (abd_size_alloc_linear(size)) {
 			cflags |= KMC_RECLAIMABLE;
 			data_cflags |= KMC_RECLAIMABLE;
 		}
 		if (cflags == data_cflags) {
 			/*
 			 * Resulting kmem caches would be identical.
 			 * Save memory by creating only one.
 			 */
 			(void) snprintf(name, sizeof (name),
 			    "zio_buf_comb_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size, align,
 			    NULL, NULL, NULL, NULL, NULL, cflags);
 			zio_data_buf_cache[c] = zio_buf_cache[c];
 			continue;
 		}
 		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
 		    (ulong_t)size);
 		zio_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, cflags);
 
 		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
 		    (ulong_t)size);
 		zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
 		    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}
 
 	/*
 	 * Walk the tables backwards so that every slot skipped above ends up
 	 * sharing the cache of the next larger size that has one.
 	 */
 	while (--c != 0) {
 		ASSERT(zio_buf_cache[c] != NULL);
 		if (zio_buf_cache[c - 1] == NULL)
 			zio_buf_cache[c - 1] = zio_buf_cache[c];
 
 		ASSERT(zio_data_buf_cache[c] != NULL);
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
 	zio_inject_init();
 
 	lz4_init();
 }
 
 /*
  * Tear down the zio layer state: destroy every distinct buffer cache
  * exactly once, delete the kstat, finalize the statistics counters,
  * destroy the zio_link and zio caches, then finalize fault injection
  * and lz4.
  */
 void
 zio_fini(void)
 {
 	size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	/* Report any buffer size whose alloc and free counts disagree. */
 	for (size_t i = 0; i < n; i++) {
 		if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
 			(void) printf("zio_fini: [%d] %llu != %llu\n",
 			    (int)((i + 1) << SPA_MINBLOCKSHIFT),
 			    (long long unsigned)zio_buf_cache_allocs[i],
 			    (long long unsigned)zio_buf_cache_frees[i]);
 	}
 #endif
 
 	/*
 	 * The same kmem cache can show up multiple times in both zio_buf_cache
 	 * and zio_data_buf_cache. Do a wasteful but trivially correct scan to
 	 * sort it out.
 	 */
 	for (size_t i = 0; i < n; i++) {
 		kmem_cache_t *cache = zio_buf_cache[i];
 		if (cache == NULL)
 			continue;
 		/* Clear every later alias in both tables before destroying. */
 		for (size_t j = i; j < n; j++) {
 			if (cache == zio_buf_cache[j])
 				zio_buf_cache[j] = NULL;
 			if (cache == zio_data_buf_cache[j])
 				zio_data_buf_cache[j] = NULL;
 		}
 		kmem_cache_destroy(cache);
 	}
 
 	/* What remains in zio_data_buf_cache is not shared with zio_buf. */
 	for (size_t i = 0; i < n; i++) {
 		kmem_cache_t *cache = zio_data_buf_cache[i];
 		if (cache == NULL)
 			continue;
 		for (size_t j = i; j < n; j++) {
 			if (cache == zio_data_buf_cache[j])
 				zio_data_buf_cache[j] = NULL;
 		}
 		kmem_cache_destroy(cache);
 	}
 
 	/* Every slot in both tables must have been cleared by now. */
 	for (size_t i = 0; i < n; i++) {
 		VERIFY3P(zio_buf_cache[i], ==, NULL);
 		VERIFY3P(zio_data_buf_cache[i], ==, NULL);
 	}
 
 	if (zio_ksp != NULL) {
 		kstat_delete(zio_ksp);
 		zio_ksp = NULL;
 	}
 
 	wmsum_fini(&ziostat_sums.ziostat_total_allocations);
 	wmsum_fini(&ziostat_sums.ziostat_alloc_class_fallbacks);
 	wmsum_fini(&ziostat_sums.ziostat_gang_writes);
 	wmsum_fini(&ziostat_sums.ziostat_gang_multilevel);
 
 	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
 
 	lz4_fini();
 }
 
 /*
  * ==========================================================================
  * Allocate and free I/O buffers
  * ==========================================================================
  */
 
 /* Buffer canaries are compiled in only for debug kernel builds. */
 #if defined(ZFS_DEBUG) && defined(_KERNEL)
 #define	ZFS_ZIO_BUF_CANARY	1
 #endif
 
 #ifdef ZFS_ZIO_BUF_CANARY
 /* Pattern stored by zio_buf_put_canary() and verified on free. */
 static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b;
 
 /*
  * Overflow detection through trailing canaries.
  *
  * zio_init() creates kmem caches only for a certain set of buffer sizes,
  * so a request is often served from a buffer larger than what was asked
  * for.  The slack that follows the (word-rounded) end of the payload is
  * stamped with zio_buf_canary here, at allocation time, and verified when
  * the buffer is freed, which lets us detect some buffer overflows.
  *
  * p is the buffer, size the requested length, cache the cache table the
  * buffer came from and c the slot index that was used in that table.
  */
 static void
 zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
 	size_t off;
 
 	/* A cache shared with the next slot implies a larger buffer. */
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		asize = (c + 2) << SPA_MINBLOCKSHIFT;
 
 	off = P2ROUNDUP(size, sizeof (ulong_t));
 	for (ulong_t *canary = p + off / sizeof (ulong_t); off < asize;
 	    off += sizeof (ulong_t))
 		*canary++ = zio_buf_canary;
 }
 
 /*
  * Walk the slack space after the payload of buffer p (requested length
  * size, taken from slot c of the given cache table) and PANIC on the
  * first word that no longer holds zio_buf_canary.
  */
 static void
 zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c)
 {
 	size_t asize = (c + 1) << SPA_MINBLOCKSHIFT;
 	size_t off = P2ROUNDUP(size, sizeof (ulong_t));
 
 	/* A cache shared with the next slot implies a larger buffer. */
 	if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT &&
 	    cache[c] == cache[c + 1])
 		asize = (c + 2) << SPA_MINBLOCKSHIFT;
 
 	ulong_t *canary = p + off / sizeof (ulong_t);
 	while (off < asize) {
 		if (unlikely(*canary != zio_buf_canary)) {
 			PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx",
 			    p, size, (canary - p) * sizeof (ulong_t),
 			    *canary, zio_buf_canary);
 		}
 		canary++;
 		off += sizeof (ulong_t);
 	}
 }
 #endif
 
 /*
  * Allocate a buffer for ZFS metadata.  Memory obtained here will appear
  * in a crashdump if the kernel panics, so use it judiciously: being able
  * to inspect ZFS metadata is useful, but excess / transient data should
  * not be kept in-core during a crashdump when that can be avoided.
  *
  * The request is served from slot (size - 1) >> SPA_MINBLOCKSHIFT of
  * zio_buf_cache[]; that slot index is verified to be in range.
  */
 void *
 zio_buf_alloc(size_t size)
 {
 	void *buf;
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_allocs[c], 1);
 #endif
 
 	buf = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE);
 #ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_put_canary(buf, size, zio_buf_cache, c);
 #endif
 	return (buf);
 }
 
 /*
  * Allocate a buffer for data.  Memory obtained here will not appear in a
  * crashdump if the kernel panics; the separate allocator exists to limit
  * the amount of ZFS data (and therefore kernel heap) that is dumped to
  * disk when the kernel panics.
  *
  * The request is served from slot (size - 1) >> SPA_MINBLOCKSHIFT of
  * zio_data_buf_cache[]; that slot index is verified to be in range.
  */
 void *
 zio_data_buf_alloc(size_t size)
 {
 	void *buf;
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 	buf = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE);
 #ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_put_canary(buf, size, zio_data_buf_cache, c);
 #endif
 	return (buf);
 }
 
 /*
  * Return a buffer of the given size to the zio_buf_cache[] slot selected
  * by (size - 1) >> SPA_MINBLOCKSHIFT.  When canaries are compiled in, the
  * slack after the payload is checked before the buffer is released.
  */
 void
 zio_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
 #if defined(ZFS_DEBUG) && !defined(_KERNEL)
 	atomic_add_64(&zio_buf_cache_frees[c], 1);
 #endif
 #ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_check_canary(buf, size, zio_buf_cache, c);
 #endif
 
 	kmem_cache_free(zio_buf_cache[c], buf);
 }
 
 /*
  * Return a buffer of the given size to the zio_data_buf_cache[] slot
  * selected by (size - 1) >> SPA_MINBLOCKSHIFT.  When canaries are compiled
  * in, the slack after the payload is checked before the buffer is released.
  */
 void
 zio_data_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
 	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 #ifdef ZFS_ZIO_BUF_CANARY
 	zio_buf_check_canary(buf, size, zio_data_buf_cache, c);
 #endif
 
 	kmem_cache_free(zio_data_buf_cache[c], buf);
 }
 
 static void
 zio_abd_free(void *abd, size_t size)
 {
 	(void) size;
 	abd_free((abd_t *)abd);
 }
 
 /*
  * ==========================================================================
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
 /*
  * Push a transform record onto zio's transform stack and switch the zio
  * over to the new buffer.  The record remembers the zio's current abd and
  * size, the size of the new buffer (bufsize) and the optional callback to
  * run when the transform is popped.
  */
 void
 zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
     zio_transform_func_t *transform)
 {
 	zio_transform_t *entry = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
 	/* Record the caller's parameters and the zio's current state. */
 	entry->zt_transform = transform;
 	entry->zt_bufsize = bufsize;
 	entry->zt_orig_size = zio->io_size;
 	entry->zt_orig_abd = zio->io_abd;
 
 	/* Link the record in as the new top of the stack. */
 	entry->zt_next = zio->io_transform_stack;
 	zio->io_transform_stack = entry;
 
 	/* From here on the zio operates on the transformed buffer. */
 	zio->io_size = size;
 	zio->io_abd = data;
 }
 
 /*
  * Unwind the whole transform stack of zio.  For every record, newest
  * first: run its callback (if any) with the saved abd and size, free the
  * zio's current abd when the record has a non-zero bufsize, restore the
  * saved abd and size, and release the record.
  */
 void
 zio_pop_transforms(zio_t *zio)
 {
 	for (zio_transform_t *zt = zio->io_transform_stack; zt != NULL;
 	    zt = zio->io_transform_stack) {
 		if (zt->zt_transform != NULL) {
 			zt->zt_transform(zio,
 			    zt->zt_orig_abd, zt->zt_orig_size);
 		}
 
 		if (zt->zt_bufsize != 0)
 			abd_free(zio->io_abd);
 
 		/* Put the zio back the way it was before the push. */
 		zio->io_abd = zt->zt_orig_abd;
 		zio->io_size = zt->zt_orig_size;
 		zio->io_transform_stack = zt->zt_next;
 
 		kmem_free(zt, sizeof (zio_transform_t));
 	}
 }
 
 /*
  * ==========================================================================
  * I/O transform callbacks for subblocks, decompression, and decryption
  * ==========================================================================
  */
 /*
  * Transform callback for subblock I/O: for reads, copy the first size
  * bytes of the zio's current buffer into data.  Other I/O types need no
  * copy.
  */
 static void
 zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
 {
 	ASSERT(zio->io_size > size);
 
 	if (zio->io_type != ZIO_TYPE_READ)
 		return;
 
 	abd_copy(data, zio->io_abd, size);
 }
 
 /*
  * Transform callback that decompresses the zio's current buffer into data
  * (size bytes of output).  Nothing is done when the zio already carries an
  * error.  A decompression failure, or an injected fault, is reported as EIO
  * in zio->io_error.
  */
 static void
 zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int ret;
 
 	if (zio->io_error != 0)
 		return;
 
 	ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 	    zio->io_abd, data, zio->io_size, size,
 	    &zio->io_prop.zp_complevel);
 
 	/* Give fault injection a chance only if decompression succeeded. */
 	if (ret == 0 && zio_injection_enabled)
 		ret = zio_handle_fault_injection(zio, EINVAL);
 
 	if (ret != 0)
 		zio->io_error = SET_ERROR(EIO);
 }
 
 /*
  * Transform callback for blocks whose bp uses encryption/authentication.
  * Reads from zio->io_abd and fills data with size bytes.  Three cases are
  * handled in order:
  *   - bps carrying an indirect MAC checksum: verify it and copy the data;
  *   - authenticated bps: verify the MAC and copy the data;
  *   - everything else: decrypt through spa_do_crypt_abd().
  * Nothing is done if the zio already has an error.  On failure
  * zio->io_error is set (ECKSUM is reported as EIO).
  */
 static void
 zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
 {
 	int ret;
 	void *tmp;
 	blkptr_t *bp = zio->io_bp;
 	spa_t *spa = zio->io_spa;
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	uint64_t lsize = BP_GET_LSIZE(bp);
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	ASSERT(BP_USES_CRYPT(bp));
 	ASSERT3U(size, !=, 0);
 
 	if (zio->io_error != 0)
 		return;
 
 	/*
 	 * Verify the cksum of MACs stored in an indirect bp. It will always
 	 * be possible to verify this since it does not require an encryption
 	 * key.
 	 */
 	if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 			/*
 			 * We haven't decompressed the data yet, but
 			 * zio_crypt_do_indirect_mac_checksum() requires
 			 * decompressed data to be able to parse out the MACs
 			 * from the indirect block. We decompress it now and
 			 * throw away the result after we are finished.
 			 */
 			abd_t *abd = abd_alloc_linear(lsize, B_TRUE);
 			ret = zio_decompress_data(BP_GET_COMPRESS(bp),
 			    zio->io_abd, abd, zio->io_size, lsize,
 			    &zio->io_prop.zp_complevel);
 			if (ret != 0) {
 				/* Release the scratch buffer before bailing. */
 				abd_free(abd);
 				ret = SET_ERROR(EIO);
 				goto error;
 			}
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    abd, lsize, BP_SHOULD_BYTESWAP(bp), mac);
 			abd_free(abd);
 		} else {
 			ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
 			    zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
 		}
 		/* The data is copied out regardless of the check's result. */
 		abd_copy(data, zio->io_abd, size);
 
 		if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
 			ret = zio_handle_decrypt_injection(spa,
 			    &zio->io_bookmark, ot, ECKSUM);
 		}
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	/*
 	 * If this is an authenticated block, just check the MAC. It would be
 	 * nice to separate this out into its own flag, but when this was done,
 	 * we had run out of bits in what is now zio_flag_t. Future cleanup
 	 * could make this a flag bit.
 	 */
 	if (BP_IS_AUTHENTICATED(bp)) {
 		if (ot == DMU_OT_OBJSET) {
 			ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
 			    dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
 		} else {
 			zio_crypt_decode_mac_bp(bp, mac);
 			ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
 			    zio->io_abd, size, mac);
 			if (zio_injection_enabled && ret == 0) {
 				ret = zio_handle_decrypt_injection(spa,
 				    &zio->io_bookmark, ot, ECKSUM);
 			}
 		}
 		/* The data is copied out regardless of the check's result. */
 		abd_copy(data, zio->io_abd, size);
 
 		if (ret != 0)
 			goto error;
 
 		return;
 	}
 
 	/* Regular encrypted block: fetch salt, IV and MAC, then decrypt. */
 	zio_crypt_decode_params_bp(bp, salt, iv);
 
 	if (ot == DMU_OT_INTENT_LOG) {
 		/* For intent log blocks the MAC is decoded from the data. */
 		tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
 		zio_crypt_decode_mac_zil(tmp, mac);
 		abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
 	} else {
 		zio_crypt_decode_mac_bp(bp, mac);
 	}
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
 	    BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
 	    zio->io_abd, &no_crypt);
 	/* If spa_do_crypt_abd() reported no_crypt, copy the data verbatim. */
 	if (no_crypt)
 		abd_copy(data, zio->io_abd, size);
 
 	if (ret != 0)
 		goto error;
 
 	return;
 
 error:
 	/* assert that the key was found unless this was speculative */
 	ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	/*
 	 * If there was a decryption / authentication error return EIO as
 	 * the io_error. If this was not a speculative zio, create an ereport.
 	 */
 	if (ret == ECKSUM) {
 		zio->io_error = SET_ERROR(EIO);
 		if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 			spa_log_error(spa, &zio->io_bookmark,
 			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 			    spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	} else {
 		/* Any other error is passed through unchanged. */
 		zio->io_error = ret;
 	}
 }
 
 /*
  * ==========================================================================
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
 /*
  * Iterate over the parents of cio.  *zl is the cursor: pass it in as NULL
  * to start; each call advances it to the next link and returns that link's
  * parent, or NULL once the list is exhausted.
  */
 zio_t *
 zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
 	list_t *pl = &cio->io_parent_list;
 
 	if (*zl == NULL)
 		*zl = list_head(pl);
 	else
 		*zl = list_next(pl, *zl);
 
 	if (*zl != NULL) {
 		ASSERT((*zl)->zl_child == cio);
 		return ((*zl)->zl_parent);
 	}
 
 	return (NULL);
 }
 
 /*
  * Iterate over the children of pio; the caller must hold pio->io_lock.
  * *zl is the cursor: pass it in as NULL to start; each call advances it to
  * the next link and returns that link's child, or NULL once the list is
  * exhausted.
  */
 zio_t *
 zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
 	list_t *cl = &pio->io_child_list;
 
 	ASSERT(MUTEX_HELD(&pio->io_lock));
 
 	if (*zl == NULL)
 		*zl = list_head(cl);
 	else
 		*zl = list_next(cl, *zl);
 
 	if (*zl != NULL) {
 		ASSERT((*zl)->zl_parent == pio);
 		return ((*zl)->zl_child);
 	}
 
 	return (NULL);
 }
 
 /*
  * Return the first parent of cio (NULL if it has none) and verify that
  * there is no second one.
  */
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
 	zio_link_t *zl = NULL;
 	zio_t *pio;
 
 	pio = zio_walk_parents(cio, &zl);
 
 	/* A further step of the walk must come up empty. */
 	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 
 	return (pio);
 }
 
 /*
  * Make cio a child of pio: allocate a link, account for every wait type
  * the child has not reached yet in the parent's counters, and insert the
  * link into both the parent's child list and the child's parent list.
  * Both zios' io_lock are taken, parent first.
  */
 void
 zio_add_child(zio_t *pio, zio_t *cio)
 {
 	/*
 	 * Allowed relationships:
 	 *   logical I/Os may have logical, gang, or vdev children;
 	 *   gang I/Os may have gang or vdev children;
 	 *   vdev I/Os may only have vdev children.
 	 * The single ASSERT below captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zio_link_t *link = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	link->zl_child = cio;
 	link->zl_parent = pio;
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* One more outstanding child for each state cio has not reached. */
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
 		if (!cio->io_state[w])
 			pio->io_children[cio->io_child_type][w]++;
 	}
 
 	list_insert_head(&pio->io_child_list, link);
 	list_insert_head(&cio->io_parent_list, link);
 
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Variant of zio_add_child() for a child that has no parents yet (this is
  * asserted).  The child's parent list is updated without taking the
  * child's io_lock; only the parent's io_lock is held while its counters
  * and child list are updated.
  */
 void
 zio_add_child_first(zio_t *pio, zio_t *cio)
 {
 	/*
 	 * Allowed relationships:
 	 *   logical I/Os may have logical, gang, or vdev children;
 	 *   gang I/Os may have gang or vdev children;
 	 *   vdev I/Os may only have vdev children.
 	 * The single ASSERT below captures all of these constraints.
 	 */
 	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
 
 	/* Parent should not have READY stage if child doesn't have it. */
 	IMPLY((cio->io_pipeline & ZIO_STAGE_READY) == 0 &&
 	    (cio->io_child_type != ZIO_CHILD_VDEV),
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0);
 
 	zio_link_t *link = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 	link->zl_child = cio;
 	link->zl_parent = pio;
 
 	ASSERT(list_is_empty(&cio->io_parent_list));
 	list_insert_head(&cio->io_parent_list, link);
 
 	mutex_enter(&pio->io_lock);
 
 	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 
 	/* One more outstanding child for each state cio has not reached. */
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
 		if (!cio->io_state[w])
 			pio->io_children[cio->io_child_type][w]++;
 	}
 
 	list_insert_head(&pio->io_child_list, link);
 
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Undo a parent/child relationship: unlink zl from pio's child list and
  * from cio's parent list while holding both io_locks (parent first), then
  * free the link.
  */
 static void
 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
 	ASSERT(zl->zl_parent == pio);
 	ASSERT(zl->zl_child == cio);
 
 	mutex_enter(&pio->io_lock);
 	mutex_enter(&cio->io_lock);
 	list_remove(&pio->io_child_list, zl);
 	list_remove(&cio->io_parent_list, zl);
 	mutex_exit(&cio->io_lock);
 	mutex_exit(&pio->io_lock);
 
 	/* The link is no longer reachable from either zio. */
 	kmem_cache_free(zio_link_cache, zl);
 }
 
 /*
  * Check, under zio->io_lock, whether any child type selected by childbits
  * still has children that have not reached the given wait state.  If so,
  * step io_stage back by one, record the first non-zero counter in
  * io_stall, and return B_TRUE.  Otherwise return B_FALSE.
  */
 static boolean_t
 zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
 {
 	boolean_t waiting = B_FALSE;
 
 	mutex_enter(&zio->io_lock);
 	ASSERT(zio->io_stall == NULL);
 	for (int c = 0; !waiting && c < ZIO_CHILD_TYPES; c++) {
 		uint64_t *pending = &zio->io_children[c][wait];
 
 		if (ZIO_CHILD_BIT_IS_SET(childbits, c) && *pending != 0) {
 			zio->io_stage >>= 1;
 			ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 			zio->io_stall = pending;
 			waiting = B_TRUE;
 		}
 	}
 	mutex_exit(&zio->io_lock);
 
 	return (waiting);
 }
 
 /*
  * Tell parent pio that child zio has reached the given wait state.
  *
  * Under pio->io_lock this folds the child's error into the parent's
  * per-child-type error (unless ZIO_FLAG_DONT_PROPAGATE is set on the child),
  * merges io_reexecute, propagates ZIO_FLAG_DIO_CHKSUM_ERR, and decrements
  * the parent's counter for this child type and wait state.  If the counter
  * drops to zero and the parent is stalled on exactly that counter, the
  * parent is resumed: either handed back to the caller through
  * *next_to_executep or dispatched to a taskq.
  */
 __attribute__((always_inline))
 static inline void
 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
     zio_t **next_to_executep)
 {
 	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 	int *errorp = &pio->io_child_error[zio->io_child_type];
 
 	mutex_enter(&pio->io_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
 
 	/*
 	 * Propagate the Direct I/O checksum verify failure to the parent.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
 		pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 
 	(*countp)--;
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		/* Pick the taskq type from the parent's stage while locked. */
 		zio_taskq_type_t type =
 		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
 		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
 
 		/*
 		 * If we can tell the caller to execute this parent next, do
 		 * so. We do this if the parent's zio type matches the child's
 		 * type, or if it's a zio_null() with no done callback, and so
 		 * has no actual work to do. Otherwise dispatch the parent zio
 		 * in its own taskq.
 		 *
 		 * Having the caller execute the parent when possible reduces
 		 * locking on the zio taskq's, reduces context switch
 		 * overhead, and has no recursion penalty.  Note that one
 		 * read from disk typically causes at least 3 zio's: a
 		 * zio_null(), the logical zio_read(), and then a physical
 		 * zio.  When the physical ZIO completes, we are able to call
 		 * zio_done() on all 3 of these zio's from one invocation of
 		 * zio_execute() by returning the parent back to
 		 * zio_execute().  Since the parent isn't executed until this
 		 * thread returns back to zio_execute(), the caller should do
 		 * so promptly.
 		 *
 		 * In other cases, dispatching the parent prevents
 		 * overflowing the stack when we have deeply nested
 		 * parent-child relationships, as we do with the "mega zio"
 		 * of writes for spa_sync(), and the chain of ZIL blocks.
 		 */
 		if (next_to_executep != NULL && *next_to_executep == NULL &&
 		    (pio->io_type == zio->io_type ||
 		    (pio->io_type == ZIO_TYPE_NULL && !pio->io_done))) {
 			*next_to_executep = pio;
 		} else {
 			zio_taskq_dispatch(pio, type, B_FALSE);
 		}
 	} else {
 		/* Other children are pending, or the parent is not stalled. */
 		mutex_exit(&pio->io_lock);
 	}
 }
 
 /*
  * If zio has no error of its own yet, adopt the error recorded for its
  * children of type c (if there is one).
  */
 static void
 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 {
 	if (zio->io_error != 0)
 		return;
 
 	if (zio->io_child_error[c] != 0)
 		zio->io_error = zio->io_child_error[c];
 }
 
 /*
  * Three-way comparison of two zios, returning -1, 0 or 1.  The keys are,
  * in order, the bookmark's objset, object, level and blkid; zios with
  * identical bookmarks are ordered by their addresses, so only a zio
  * compared with itself yields 0.
  */
 int
 zio_bookmark_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
 	if (z1->io_bookmark.zb_objset != z2->io_bookmark.zb_objset) {
 		return (z1->io_bookmark.zb_objset <
 		    z2->io_bookmark.zb_objset ? -1 : 1);
 	}
 
 	if (z1->io_bookmark.zb_object != z2->io_bookmark.zb_object) {
 		return (z1->io_bookmark.zb_object <
 		    z2->io_bookmark.zb_object ? -1 : 1);
 	}
 
 	if (z1->io_bookmark.zb_level != z2->io_bookmark.zb_level) {
 		return (z1->io_bookmark.zb_level <
 		    z2->io_bookmark.zb_level ? -1 : 1);
 	}
 
 	if (z1->io_bookmark.zb_blkid != z2->io_bookmark.zb_blkid) {
 		return (z1->io_bookmark.zb_blkid <
 		    z2->io_bookmark.zb_blkid ? -1 : 1);
 	}
 
 	/* Same bookmark: fall back to the addresses as a tie-breaker. */
 	if (z1 != z2)
 		return (z1 < z2 ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
  * ==========================================================================
  */
 /*
  * Common constructor behind the zio_*() creation routines.  Allocates a
  * zeroed zio from zio_cache, initializes its lock, cv and lists, derives
  * the child type from vd and flags, records the block pointer (if any) and
  * all the caller-supplied parameters, and, when pio is not NULL, attaches
  * the new zio to that parent.  Returns the new zio.
  */
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
     void *private, zio_type_t type, zio_priority_t priority,
     zio_flag_t flags, vdev_t *vd, uint64_t offset,
     const zbookmark_phys_t *zb, enum zio_stage stage,
     enum zio_stage pipeline)
 {
 	zio_t *zio;
 
 	IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
 	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 	ASSERT(vd || stage == ZIO_STAGE_OPEN);
 
 	IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
 
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	memset(zio, 0, sizeof (zio_t));
 
 	mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
 	list_create(&zio->io_parent_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
 	metaslab_trace_init(&zio->io_alloc_list);
 
 	/* A vdev takes precedence over the GANG/DDT child flags. */
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
 		zio->io_child_type = ZIO_CHILD_GANG;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		zio->io_child_type = ZIO_CHILD_DDT;
 	else
 		zio->io_child_type = ZIO_CHILD_LOGICAL;
 
 	if (bp != NULL) {
 		/*
 		 * Non-write zios and DDT children work on a private copy of
 		 * the bp; other writes operate on the caller's bp directly.
 		 */
 		if (type != ZIO_TYPE_WRITE ||
 		    zio->io_child_type == ZIO_CHILD_DDT) {
 			zio->io_bp_copy = *bp;
 			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
 		} else {
 			zio->io_bp = (blkptr_t *)bp;
 		}
 		/* Snapshot of the bp as passed in, taken before any change. */
 		zio->io_bp_orig = *bp;
 		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio->io_logical = zio;
 		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 			pipeline |= ZIO_GANG_STAGES;
+		if (flags & ZIO_FLAG_PREALLOCATED) {
+			BP_ZERO_DVAS(zio->io_bp);
+			BP_SET_BIRTH(zio->io_bp, 0, 0);
+		}
 	}
 
 	zio->io_spa = spa;
 	zio->io_txg = txg;
 	zio->io_done = done;
 	zio->io_private = private;
 	zio->io_type = type;
 	zio->io_priority = priority;
 	zio->io_vd = vd;
 	zio->io_offset = offset;
 	zio->io_orig_abd = zio->io_abd = data;
 	zio->io_orig_size = zio->io_size = psize;
 	zio->io_lsize = lsize;
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 	zio->io_allocator = ZIO_ALLOCATOR_NONE;
 
 	/*
 	 * A zio counts as READY already if it starts at or past that stage,
 	 * or if its pipeline has no READY stage at all.
 	 */
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY) ||
 	    (pipeline & ZIO_STAGE_READY) == 0;
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
 		/* Inherit from the parent what was not set above. */
 		zio->io_metaslab_class = pio->io_metaslab_class;
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		if (zio->io_child_type == ZIO_CHILD_GANG)
 			zio->io_gang_leader = pio->io_gang_leader;
 		zio_add_child_first(pio, zio);
 	}
 
 	taskq_init_ent(&zio->io_tqent);
 
 	return (zio);
 }
 
 /*
  * Release everything owned by a zio: its allocation trace, its
  * parent and child lists, its lock and condition variable, and finally
  * the zio itself, which goes back to zio_cache.
  */
 void
 zio_destroy(zio_t *zio)
 {
 	/* Allocation trace and relationship lists. */
 	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_child_list);
 	list_destroy(&zio->io_parent_list);
 
 	/* Synchronization primitives. */
 	cv_destroy(&zio->io_cv);
 	mutex_destroy(&zio->io_lock);
 
 	kmem_cache_free(zio_cache, zio);
 }
 
 /*
  * Create a null zio, meant to sit between other zios.  It provides
  * synchronization at the READY and DONE pipeline stages and calls the
  * respective callbacks.
  */
 zio_t *
 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
     void *private, zio_flag_t flags)
 {
 	return (zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE));
 }
 
 /*
  * Create a zio meant to be the root of a tree.  In contrast to a null zio
  * it has no READY pipeline stage (it is ready on creation), so it should
  * not be used as a child of any zio that may need to wait for the READY
  * stage of its grandchildren (any other zio type).
  */
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	return (zio_create(NULL, spa, 0, NULL, NULL, 0, 0, done, private,
 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_ROOT_PIPELINE));
 }
 
 /*
  * Report one problem found in a block pointer.  The raw contents of bp
  * are always written to the debug log; the formatted message (fmt, ...)
  * is then passed to zfs_panic_recover() for BLK_VERIFY_HALT, written to
  * the debug log for BLK_VERIFY_LOG, or dropped for BLK_VERIFY_ONLY.
  * Always returns 1 so that callers can add the result to an error count.
  */
 static int
 zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
     enum blk_verify_flag blk_verify, const char *fmt, ...)
 {
 	char buf[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
 	va_end(ap);
 
 	zfs_dbgmsg("bad blkptr at %px: "
 	    "DVA[0]=%#llx/%#llx "
 	    "DVA[1]=%#llx/%#llx "
 	    "DVA[2]=%#llx/%#llx "
 	    "prop=%#llx "
 	    "pad=%#llx,%#llx "
 	    "phys_birth=%#llx "
 	    "birth=%#llx "
 	    "fill=%#llx "
 	    "cksum=%#llx/%#llx/%#llx/%#llx",
 	    bp,
 	    (long long)bp->blk_dva[0].dva_word[0],
 	    (long long)bp->blk_dva[0].dva_word[1],
 	    (long long)bp->blk_dva[1].dva_word[0],
 	    (long long)bp->blk_dva[1].dva_word[1],
 	    (long long)bp->blk_dva[2].dva_word[0],
 	    (long long)bp->blk_dva[2].dva_word[1],
 	    (long long)bp->blk_prop,
 	    (long long)bp->blk_pad[0],
 	    (long long)bp->blk_pad[1],
 	    (long long)BP_GET_PHYSICAL_BIRTH(bp),
 	    (long long)BP_GET_LOGICAL_BIRTH(bp),
 	    (long long)bp->blk_fill,
 	    (long long)bp->blk_cksum.zc_word[0],
 	    (long long)bp->blk_cksum.zc_word[1],
 	    (long long)bp->blk_cksum.zc_word[2],
 	    (long long)bp->blk_cksum.zc_word[3]);
 
 	/* BLK_VERIFY_ONLY (and anything else) needs no further action. */
 	if (blk_verify == BLK_VERIFY_HALT)
 		zfs_panic_recover("%s: %s", spa_name(spa), buf);
 	else if (blk_verify == BLK_VERIFY_LOG)
 		zfs_dbgmsg("%s: %s", spa_name(spa), buf);
 
 	return (1);
 }
 
 /*
  * Verify the block pointer fields contain reasonable values.  This means
  * it only contains known object types, checksum/compression identifiers,
  * block sizes within the maximum allowed limits, valid DVAs, etc.
  *
  * Returns 0 if everything checks out, ECKSUM if at least one invalid
  * field was detected, or EBUSY if BLK_CONFIG_NEEDED_TRY was requested and
  * the SCL_VDEV lock could not be taken.  The blk_verify argument controls
  * the behavior when an invalid field is detected.
  *
  * Values for blk_verify_flag:
  *   BLK_VERIFY_ONLY: evaluate the block
  *   BLK_VERIFY_LOG: evaluate the block and log problems
  *   BLK_VERIFY_HALT: call zfs_panic_recover on error
  *
  * Values for blk_config_flag:
  *   BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
  *   BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
  *   obtained for reader
  *   BLK_CONFIG_NEEDED_TRY: like BLK_CONFIG_NEEDED, but only try to obtain
  *   SCL_VDEV and return EBUSY if that fails
  *   BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
  *   performance
  */
 int
 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
     enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
 {
 	int errors = 0;
 
 	if (unlikely(!DMU_OT_IS_VALID(BP_GET_TYPE(bp)))) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid TYPE %llu",
 		    bp, (longlong_t)BP_GET_TYPE(bp));
 	}
 	if (unlikely(BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid COMPRESS %llu",
 		    bp, (longlong_t)BP_GET_COMPRESS(bp));
 	}
 	if (unlikely(BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid LSIZE %llu",
 		    bp, (longlong_t)BP_GET_LSIZE(bp));
 	}
 	if (BP_IS_EMBEDDED(bp)) {
 		if (unlikely(BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px has invalid ETYPE %llu",
 			    bp, (longlong_t)BPE_GET_ETYPE(bp));
 		}
 		if (unlikely(BPE_GET_PSIZE(bp) > BPE_PAYLOAD_SIZE)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px has invalid PSIZE %llu",
 			    bp, (longlong_t)BPE_GET_PSIZE(bp));
 		}
 		/* Embedded bps have no DVAs; nothing more to check. */
 		return (errors ? ECKSUM : 0);
 	} else if (BP_IS_HOLE(bp)) {
 		/*
 		 * Holes are allowed (expected, even) to have no DVAs, no
 		 * checksum, and no psize.
 		 */
 		return (errors ? ECKSUM : 0);
 	} else if (unlikely(!DVA_IS_VALID(&bp->blk_dva[0]))) {
 		/* Non-hole, non-embedded BPs _must_ have at least one DVA */
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has no valid DVAs", bp);
 	}
 	if (unlikely(BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid CHECKSUM %llu",
 		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
 	}
 	if (unlikely(BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE)) {
 		errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 		    "blkptr at %px has invalid PSIZE %llu",
 		    bp, (longlong_t)BP_GET_PSIZE(bp));
 	}
 
 	/*
 	 * Do not verify individual DVAs if the config is not trusted. This
 	 * will be done once the zio is executed in vdev_mirror_map_alloc.
 	 */
 	if (unlikely(!spa->spa_trust_config))
 		return (errors ? ECKSUM : 0);
 
 	switch (blk_config) {
 	case BLK_CONFIG_HELD:
 		ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
 		break;
 	case BLK_CONFIG_NEEDED:
 		spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
 		break;
 	case BLK_CONFIG_NEEDED_TRY:
 		if (!spa_config_tryenter(spa, SCL_VDEV, bp, RW_READER))
 			return (EBUSY);
 		break;
 	case BLK_CONFIG_SKIP:
 		return (errors ? ECKSUM : 0);
 	default:
 		panic("invalid blk_config %u", blk_config);
 	}
 
 	/*
 	 * Pool-specific checks.
 	 *
 	 * Note: it would be nice to verify that the logical birth
 	 * and physical birth are not too large.  However,
 	 * spa_freeze() allows the birth time of log blocks (and
 	 * dmu_sync()-ed blocks that are in the log) to be arbitrarily
 	 * large.
 	 */
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		const dva_t *dva = &bp->blk_dva[i];
 		uint64_t vdevid = DVA_GET_VDEV(dva);
 
 		if (unlikely(vdevid >= spa->spa_root_vdev->vdev_children)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 		if (unlikely(vd == NULL)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (unlikely(vd->vdev_ops == &vdev_hole_ops)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has hole VDEV %llu",
 			    bp, i, (longlong_t)vdevid);
 			continue;
 		}
 		if (vd->vdev_ops == &vdev_missing_ops) {
 			/*
 			 * "missing" vdevs are valid during import, but we
 			 * don't have their detailed info (e.g. asize), so
 			 * we can't perform any more checks on them.
 			 */
 			continue;
 		}
 		uint64_t offset = DVA_GET_OFFSET(dva);
 		uint64_t asize = DVA_GET_ASIZE(dva);
 		if (DVA_GET_GANG(dva))
 			asize = vdev_gang_header_asize(vd);
 		/*
 		 * Range check written so that it cannot wrap: a corrupted
 		 * DVA may carry an offset so large that offset + asize
 		 * overflows 64 bits and would then appear to be in bounds.
 		 */
 		if (unlikely(asize > vd->vdev_asize ||
 		    offset > vd->vdev_asize - asize)) {
 			errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
 			    "blkptr at %px DVA %u has invalid OFFSET %llu",
 			    bp, i, (longlong_t)offset);
 		}
 	}
 	/* Drop SCL_VDEV only if it was taken above. */
 	if (blk_config == BLK_CONFIG_NEEDED || blk_config ==
 	    BLK_CONFIG_NEEDED_TRY)
 		spa_config_exit(spa, SCL_VDEV, bp);
 
 	return (errors ? ECKSUM : 0);
 }
 
 /*
  * Check a single DVA against the pool's vdev tree.  Returns B_TRUE only
  * if the DVA names an existing top-level vdev that is neither a hole nor
  * a missing vdev, and the allocated range [offset, offset + asize) lies
  * within that vdev's asize.  For gang DVAs the gang header size is used
  * as asize.  The bp argument is not used.
  */
 boolean_t
 zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
 {
 	(void) bp;
 	uint64_t vdevid = DVA_GET_VDEV(dva);
 
 	if (vdevid >= spa->spa_root_vdev->vdev_children)
 		return (B_FALSE);
 
 	vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
 	if (vd == NULL)
 		return (B_FALSE);
 
 	if (vd->vdev_ops == &vdev_hole_ops)
 		return (B_FALSE);
 
 	if (vd->vdev_ops == &vdev_missing_ops) {
 		return (B_FALSE);
 	}
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t asize = DVA_GET_ASIZE(dva);
 
 	if (DVA_GET_GANG(dva))
 		asize = vdev_gang_header_asize(vd);
 
 	/*
 	 * Range check written so that it cannot wrap: with a corrupted DVA,
 	 * offset + asize could overflow 64 bits and appear to be in bounds.
 	 */
 	if (asize > vd->vdev_asize || offset > vd->vdev_asize - asize)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Create a read zio for the block described by bp.  The zio's txg is the
  * bp's birth txg, and lsize and psize are both set to size.  Zios created
  * with ZIO_FLAG_DDT_CHILD get the DDT child read pipeline; all others get
  * the regular read pipeline.
  */
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	return (zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
 	    data, size, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE));
 }
 
 /*
  * Create a write zio.  The pipeline is the direct write pipeline when
  * zp->zp_direct_write is set, else the DDT child write pipeline for
  * ZIO_FLAG_DDT_CHILD, else the regular write pipeline.  The ready and
  * children_ready callbacks and a copy of the write properties are stored
  * in the new zio.
  */
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
     zio_done_func_t *ready, zio_done_func_t *children_ready,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, const zbookmark_phys_t *zb)
 {
 	enum zio_stage pipeline;
 	zio_t *zio;
 
 	if (zp->zp_direct_write == B_TRUE)
 		pipeline = ZIO_DIRECT_WRITE_PIPELINE;
 	else if (flags & ZIO_FLAG_DDT_CHILD)
 		pipeline = ZIO_DDT_CHILD_WRITE_PIPELINE;
 	else
 		pipeline = ZIO_WRITE_PIPELINE;
 
 	zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, pipeline);
 
 	zio->io_prop = *zp;
 	zio->io_ready = ready;
 	zio->io_children_ready = children_ready;
 
 	/*
 	 * Data can be NULL if we are going to call zio_write_override() to
 	 * provide the already-allocated BP.  But we may need the data to
 	 * verify a dedup hit (if requested).  In this case, don't try to
 	 * dedup (just take the already-allocated BP verbatim). Encrypted
 	 * dedup blocks need data as well so we also disable dedup in this
 	 * case.
 	 */
 	if (data == NULL &&
 	    (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
 		zio->io_prop.zp_dedup_verify = B_FALSE;
 		zio->io_prop.zp_dedup = B_FALSE;
 	}
 
 	return (zio);
 }
 
 /*
  * Create a write zio that runs the rewrite pipeline; ZIO_FLAG_IO_REWRITE
  * is always added to the caller's flags.
  */
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
     uint64_t size, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb)
 {
 	zio_flag_t rewrite_flags = flags | ZIO_FLAG_IO_REWRITE;
 
 	return (zio_create(pio, spa, txg, bp, data, size, size, done, private,
 	    ZIO_TYPE_WRITE, priority, rewrite_flags, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE));
 }
 
 /*
  * Attach an override BP to a not-yet-started logical write zio and set the
  * write properties that go with it.  nopwrite and brtwrite may not both be
  * set.
  */
 void
 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
     boolean_t nopwrite, boolean_t brtwrite)
 {
 	zio_prop_t *zp = &zio->io_prop;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 	ASSERT(!brtwrite || !nopwrite);
 
 	/*
 	 * The properties have to be put back to what they were when
 	 * dmu_sync() first wrote the bp.  nopwrite and dedup are mutually
 	 * exclusive, so a nopwrite override clears dedup; otherwise the
 	 * existing dedup setting is kept.
 	 */
 	if (nopwrite)
 		zp->zp_dedup = B_FALSE;
 	zp->zp_nopwrite = nopwrite;
 	zp->zp_brtwrite = brtwrite;
 	zp->zp_copies = copies;
 	zp->zp_gang_copies = gang_copies;
 
 	zio->io_bp_override = bp;
 }
 
 /*
  * Free the block described by bp in the given txg, either immediately or
  * by queueing it on the per-txg free bplist for later processing.
  */
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 	boolean_t queue_for_later;
 
 	(void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	/*
 	 * Embedded BPs are dropped here purely as an optimization: it saves
 	 * putting them on the list only to have zio_free_sync() ignore them.
 	 */
 	if (BP_IS_EMBEDDED(bp))
 		return;
 
 	/*
 	 * A free can be done right away only if it belongs to the
 	 * currently-syncing txg, is not going to be deferred, and cannot
 	 * require a read (i.e. not GANG or DEDUP).  Anything else goes on
 	 * the in-memory list for later processing.
 	 *
 	 * Note that we only defer frees after zfs_sync_pass_deferred_free
 	 * when the log space map feature is disabled. [see relevant comment
 	 * in spa_sync_iterate_to_convergence()]
 	 */
 	queue_for_later = (BP_IS_GANG(bp) ||
 	    BP_GET_DEDUP(bp) ||
 	    txg != spa->spa_syncing_txg ||
 	    (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
 	    !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
 	    brt_maybe_exists(spa, bp));
 
 	if (queue_for_later) {
 		metaslab_check_free(spa, bp);
 		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 		return;
 	}
 
 	VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
 }
 
 /*
  * Free a block in the syncing txg.  When the free can be completed on the
  * spot this returns NULL instead of a zio, which saves creating a zio and
  * linking it to the parent.
  */
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_flag_t flags)
 {
 	uint64_t psize;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (NULL);
 
 	metaslab_check_free(spa, bp);
 	arc_freed(spa, bp);
 	dsl_scan_freed(spa, bp);
 
 	/* No gang header, DDT or BRT involved: free it right here. */
 	if (!BP_IS_GANG(bp) &&
 	    !BP_GET_DEDUP(bp) &&
 	    !brt_maybe_exists(spa, bp)) {
 		metaslab_free(spa, bp, txg, B_FALSE);
 		return (NULL);
 	}
 
 	/*
 	 * GANG, DEDUP and BRT blocks can induce a read (for the gang
 	 * block header, the DDT or the BRT), so they are issued
 	 * asynchronously to keep this thread from being tied up.
 	 */
 	psize = BP_GET_PSIZE(bp);
 	return (zio_create(pio, spa, txg, bp, NULL, psize, psize, NULL, NULL,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC));
 }
 
 /*
  * Create a claim zio for bp.  Embedded BPs have nothing to claim and get a
  * null zio instead.
  */
 zio_t *
 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     zio_done_func_t *done, void *private, zio_flag_t flags)
 {
 	uint64_t psize;
 	zio_t *zio;
 
 	(void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
 	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
 
 	/*
 	 * Claiming means allocating one specific block.  The intent log
 	 * needs this for immediate writes: their data is committed, but the
 	 * txg they were written in was *not*.  When a pool is opened after
 	 * an unclean shutdown, the intent log claims every block holding
 	 * immediate write data so the SPA knows those blocks are in use.
 	 *
 	 * Every claim *must* be resolved in the first txg -- before the SPA
 	 * begins allocating -- so that no block is allocated twice.
 	 * A txg of 0 only verifies that the block is claimable.
 	 */
 	ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
 	    spa_min_claim_txg(spa));
 	ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
 	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(8) */
 
 	psize = BP_GET_PSIZE(bp);
 	zio = zio_create(pio, spa, txg, bp, NULL, psize, psize, done, private,
 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
 
 /*
  * Create a physical TRIM zio for the given range of vd.  The asserts
  * require a vdev without children, a non-zero size, and an offset and size
  * that are multiples of 1 << vdev_ashift.
  */
 zio_t *
 zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     zio_done_func_t *done, void *private, zio_priority_t priority,
     zio_flag_t flags, enum trim_flag trim_flags)
 {
 	ASSERT0(vd->vdev_children);
 	ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	ASSERT3U(size, !=, 0);
 
 	zio_t *trim_zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL,
 	    size, size, done, private, ZIO_TYPE_TRIM, priority,
 	    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL,
 	    ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
 
 	trim_zio->io_trim_flags = trim_flags;
 	return (trim_zio);
 }
 
 /*
  * Create a physical read zio against a childless vdev.  When 'labels' is
  * set, the range is asserted to lie within the leading or trailing label
  * area.  The requested checksum is stored in the zio's properties.
  */
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio_t *rzio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, ZIO_TYPE_READ, priority,
 	    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL,
 	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 
 	rzio->io_prop.zp_checksum = checksum;
 	return (rzio);
 }
 
 /*
  * Create a physical write zio against a childless vdev.  When 'labels' is
  * set, the range is asserted to lie within the leading or trailing label
  * area.  For checksums flagged ZCHECKSUM_FLAG_EMBEDDED the data is copied
  * into a private buffer that is pushed as a transform.
  */
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     abd_t *data, int checksum, zio_done_func_t *done, void *private,
     zio_priority_t priority, zio_flag_t flags, boolean_t labels)
 {
 	ASSERT(vd->vdev_children == 0);
 	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio_t *wzio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, ZIO_TYPE_WRITE, priority,
 	    flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL,
 	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 
 	wzio->io_prop.zp_checksum = checksum;
 
 	if (!(zio_checksum_table[checksum].ci_flags &
 	    ZCHECKSUM_FLAG_EMBEDDED))
 		return (wzio);
 
 	/*
 	 * zec checksums are necessarily destructive: the verifier/checksum
 	 * is written into the end of the write buffer.  Work on a private
 	 * copy in case the same data is being written to several places in
 	 * parallel.
 	 */
 	abd_t *copy = abd_alloc_sametype(data, size);
 	abd_copy(copy, data, size);
 	zio_push_transform(wzio, copy, size, size, NULL);
 
 	return (wzio);
 }
 
 /*
  * Create a vdev child I/O of pio that does part of pio's work on vd.
  * The child inherits pio's spa, txg and bookmark, and starts just before
  * the VDEV_IO_START stage.
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
     abd_t *data, uint64_t size, int type, zio_priority_t priority,
     zio_flag_t flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage child_pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *cio;
 
 	/*
 	 * Errors from vdev child I/Os are not propagated to the parent, so
 	 * for correct operation the caller *must* check for and handle the
 	 * error in the child's done callback.  The only i/os exempt from
 	 * this are the ones we don't care about (OPTIONAL or REPAIR).
 	 */
 	ASSERT((flags & (ZIO_FLAG_OPTIONAL | ZIO_FLAG_IO_REPAIR)) != 0 ||
 	    done != NULL);
 
 	if (type == ZIO_TYPE_READ && bp != NULL) {
 		/*
 		 * With the bp in hand the child verifies the checksum and
 		 * the parent no longer has to.  That moves error detection
 		 * as close to the leaves as possible and avoids redundant
 		 * checksums in the interior nodes.
 		 */
 		child_pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 		/*
 		 * After the first Direct I/O checksum verify failure the
 		 * mirror VDEV is never allowed to try reading any further
 		 * data copies, so that bad data is not written out through
 		 * the mirror during self healing.  See comment in
 		 * vdev_mirror_io_done() for more details.
 		 */
 		ASSERT0(pio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 	} else if (type == ZIO_TYPE_WRITE &&
 	    pio->io_prop.zp_direct_write == B_TRUE) {
 		/*
 		 * By default checksums of Direct I/O writes are verified on
 		 * Linux only.  FreeBSD is able to place user pages under
 		 * write protection before issuing them to the ZIO pipeline.
 		 *
 		 * Checksum validation errors are only reported through the
 		 * top-level VDEV, which is set by this child ZIO.
 		 */
 		ASSERT3P(bp, !=, NULL);
 		ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 		child_pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY;
 	}
 
 	/* Leaf vdev offsets are shifted by VDEV_LABEL_START_SIZE. */
 	if (vd->vdev_ops->vdev_op_leaf) {
 		ASSERT0(vd->vdev_children);
 		offset += VDEV_LABEL_START_SIZE;
 	}
 
 	flags |= ZIO_VDEV_CHILD_FLAGS(pio);
 
 	/*
 	 * Once we have decided to repair, the write is not speculative,
 	 * even if the original read was.
 	 */
 	if ((flags & ZIO_FLAG_IO_REPAIR) != 0)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
 	/*
 	 * A child I/O that is not associated with a top-level vdev is not
 	 * an allocating I/O.  A retried I/O is ignored as well, because the
 	 * original allocating I/O has already been processed.
 	 */
 	boolean_t not_allocating = (vd != vd->vdev_top ||
 	    (flags & ZIO_FLAG_IO_RETRY) != 0);
 
 	if ((flags & ZIO_FLAG_IO_ALLOCATING) != 0 && not_allocating) {
 		ASSERT(pio->io_metaslab_class != NULL);
 		ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
 		ASSERT(type == ZIO_TYPE_WRITE);
 		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
 		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
 		    pio->io_child_type == ZIO_CHILD_GANG);
 
 		flags &= ~ZIO_FLAG_IO_ALLOCATING;
 	}
 
 	cio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
 	    done, private, type, priority, flags, vd, offset,
 	    &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, child_pipeline);
 	ASSERT3U(cio->io_child_type, ==, ZIO_CHILD_VDEV);
 
 	return (cio);
 }
 
 /*
  * Create a parentless zio against leaf vdev vd that uses the vdev child
  * pipeline.  CANFAIL, DONT_RETRY and DELEGATED are always added to the
  * caller's flags.
  */
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
     zio_type_t type, zio_priority_t priority, zio_flag_t flags,
     zio_done_func_t *done, void *private)
 {
 	zio_flag_t delegated_flags = flags | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	return (zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size,
 	    done, private, type, priority, delegated_flags, vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE));
 }
 
 
 /*
  * Issue flush zios for vd, recursing through its children down to the
  * vdevs that have none.  In contrast to most zio creation functions the
  * flush zios are issued right away (zio_nowait()); wait on pio to block
  * until they complete.  A vdev with vdev_nowritecache set is skipped
  * together with everything below it.
  */
 void
 zio_flush(zio_t *pio, vdev_t *vd)
 {
 	const zio_flag_t flush_flags = ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY;
 
 	if (vd->vdev_nowritecache)
 		return;
 
 	if (vd->vdev_children != 0) {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
 		return;
 	}
 
 	zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
 	    NULL, NULL, ZIO_TYPE_FLUSH, ZIO_PRIORITY_NOW, flush_flags, vd, 0,
 	    NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
 }
 
 /*
  * Reduce the size of a zio that has not started executing to 'size'
  * bytes.  Nothing is changed for RAIDZ block pointers.
  */
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
 	ASSERT3P(zio->io_executor, ==, NULL);
 	ASSERT3U(zio->io_orig_size, ==, zio->io_size);
 	ASSERT3U(size, <=, zio->io_size);
 
 	/*
 	 * raidz is left alone because reconstruction has problems when
 	 * less than the block size is read back.
 	 * Note, BP_IS_RAIDZ() assumes no compression.
 	 */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	if (BP_IS_RAIDZ(zio->io_bp))
 		return;
 
 	/* we are not doing a raw write */
 	ASSERT3U(zio->io_size, ==, zio->io_lsize);
 	zio->io_lsize = size;
 	zio->io_size = size;
 	zio->io_orig_size = size;
 }
 
 /*
  * Round an allocation size up to one that at least some vdev(s) in the
  * pool can allocate with minimal or no extra padding, and without extra
  * space usage on the others.  Sizes up to spa_min_alloc become
  * spa_min_alloc; larger sizes are rounded up to a multiple of
  * spa_gcd_alloc.
  */
 static uint64_t
 zio_roundup_alloc_size(spa_t *spa, uint64_t size)
 {
 	if (size <= spa->spa_min_alloc)
 		return (spa->spa_min_alloc);
 
 	return (roundup(size, spa->spa_gcd_alloc));
 }
 
 /*
  * Return the largest compressed size that is still accepted for a source
  * buffer of s_len bytes.  The limit is s_len minus one eighth; except for
  * ZLE it is then rounded down to a multiple of gcd_alloc, and if that
  * falls below min_alloc, BPE_PAYLOAD_SIZE is returned instead.
  */
 size_t
 zio_get_compression_max_size(enum zio_compress compress, uint64_t gcd_alloc,
     uint64_t min_alloc, size_t s_len)
 {
 	/* minimum 12.5% must be saved (legacy value, may be changed later) */
 	size_t limit = s_len - (s_len >> 3);
 
 	/* ZLE can't use exactly that many bytes, it needs more; skip it */
 	if (compress == ZIO_COMPRESS_ZLE)
 		return (limit);
 
 	limit -= limit % gcd_alloc;
 
 	return (limit < min_alloc ? BPE_PAYLOAD_SIZE : limit);
 }
 
 /*
  * ==========================================================================
  * Prepare to read and write logical blocks
  * ==========================================================================
  */
 
 /*
  * Read pipeline stage that prepares a read according to its block pointer:
  *  - pushes a decompression transform for compressed blocks (logical zios
  *    only, unless ZIO_FLAG_RAW_COMPRESS is set);
  *  - pushes a decryption transform for protected blocks (unless
  *    ZIO_FLAG_RAW_ENCRYPT is set) or blocks with an indirect MAC checksum
  *    (logical zios only);
  *  - decodes embedded data BPs straight into the zio's buffer and
  *    switches the zio to the interlock pipeline;
  *  - switches logical reads of dedup blocks to the DDT read pipeline.
  * Always returns the zio.
  */
 static zio_t *
 zio_read_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	/* Embedded BPs keep their physical size in a different field. */
 	uint64_t psize =
 	    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decompress);
 	}
 
 	if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
 	    BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
 		    psize, psize, zio_decrypt);
 	}
 
 	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
 		/*
 		 * psize already holds BPE_GET_PSIZE(bp) for an embedded BP,
 		 * so no second (shadowing) local is needed here.
 		 */
 		void *data = abd_borrow_buf(zio->io_abd, psize);
 
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		decode_embedded_bp_compressed(bp, data);
 		abd_return_buf_copy(zio->io_abd, data, psize);
 	} else {
 		ASSERT(!BP_IS_EMBEDDED(bp));
 	}
 
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Write pipeline stage that handles an override BP, if one was supplied
  * for an allocating write.  The override BP is copied into the zio's bp
  * and the pipeline is cut down to the interlock stages; only if the
  * override cannot be used as-is for a dedup write are the original bp and
  * pipeline restored.  Always returns the zio.
  */
 static zio_t *
 zio_write_bp_init(zio_t *zio)
 {
 	blkptr_t *bp;
 	zio_prop_t *zp;
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (!zio->io_bp_override)
 		return (zio);
 
 	bp = zio->io_bp;
 	zp = &zio->io_prop;
 
 	ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
 
 	*bp = *zio->io_bp_override;
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	if (zp->zp_brtwrite)
 		return (zio);
 
 	ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
 
 	if (BP_IS_EMBEDDED(bp))
 		return (zio);
 
 	/*
 	 * An overridden write with nopwrite set means the nopwrite has
 	 * already happened; record that in the flags.
 	 */
 	if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
 		ASSERT(!zp->zp_dedup);
 		ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
 		zio->io_flags |= ZIO_FLAG_NOPWRITE;
 		return (zio);
 	}
 
 	ASSERT(!zp->zp_nopwrite);
 
 	if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 		return (zio);
 
 	ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
 	    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 	if (BP_GET_CHECKSUM(bp) != zp->zp_checksum || zp->zp_encrypt) {
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 		return (zio);
 	}
 
 	BP_SET_DEDUP(bp, 1);
 	zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 
 	return (zio);
 }
 
 /*
  * Write pipeline stage that settles the physical size of the block and
  * fills in the block pointer accordingly.
  *
  * Returns NULL when the zio still has to wait for its logical/gang
  * children to become ready (the stage is repeated later); otherwise
  * returns the zio.  Non-allocating zios pass through unchanged.
  *
  * For allocating zios the stage, in order:
  *  - runs the io_children_ready callback, if any;
  *  - compresses the data (or detects an all-zero buffer, or embeds a small
  *    compressed payload directly in the bp and finishes via the interlock
  *    pipeline);
  *  - chooses between the rewrite pipeline and a fresh allocation;
  *  - sets the bp's size, type, level, compression, checksum, dedup and
  *    byteorder fields, and adjusts the pipeline for dedup and nopwrite.
  */
 static zio_t *
 zio_write_compress(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zio_prop_t *zp = &zio->io_prop;
 	enum zio_compress compress = zp->zp_compress;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t lsize = zio->io_lsize;
 	uint64_t psize = zio->io_size;
 	/* Stays 1 unless this is a rewrite within the syncing txg (below). */
 	uint32_t pass = 1;
 
 	/*
 	 * If our children haven't all reached the ready stage,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	if (!IO_IS_ALLOCATING(zio))
 		return (zio);
 
 	if (zio->io_children_ready != NULL) {
 		/*
 		 * Now that all our children are ready, run the callback
 		 * associated with this zio in case it wants to modify the
 		 * data to be written.
 		 */
 		ASSERT3U(zp->zp_level, >, 0);
 		zio->io_children_ready(zio);
 	}
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 	ASSERT(zio->io_bp_override == NULL);
 
 	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
 		 * converge, it must eventually be the case that we don't
 		 * have to allocate new blocks.  But compression changes
 		 * the blocksize, which forces a reallocate, and makes
 		 * convergence take longer.  Therefore, after the first
 		 * few passes, stop compressing to ensure convergence.
 		 */
 		pass = spa_sync_pass(spa);
 
 		ASSERT(zio->io_txg == spa_syncing_txg(spa));
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
 		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
 		ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) ||
 		    MIN(zp->zp_copies, spa_max_replication(spa))
 		    == BP_GET_NDVAS(bp));
 	}
 
 	/* If it's a compressed write that is not raw, compress the buffer. */
 	if (compress != ZIO_COMPRESS_OFF &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
 		abd_t *cabd = NULL;
 		/*
 		 * psize == 0 marks an all-zero buffer; ZIO_COMPRESS_EMPTY
 		 * performs no compression beyond that check.
 		 */
 		if (abd_cmp_zero(zio->io_abd, lsize) == 0)
 			psize = 0;
 		else if (compress == ZIO_COMPRESS_EMPTY)
 			psize = lsize;
 		else
 			psize = zio_compress_data(compress, zio->io_abd, &cabd,
 			    lsize,
 			    zio_get_compression_max_size(compress,
 			    spa->spa_gcd_alloc, spa->spa_min_alloc, lsize),
 			    zp->zp_complevel);
 		if (psize == 0) {
 			compress = ZIO_COMPRESS_OFF;
 		} else if (psize >= lsize) {
 			/* No gain: write uncompressed, drop any output. */
 			compress = ZIO_COMPRESS_OFF;
 			if (cabd != NULL)
 				abd_free(cabd);
 		} else if (psize <= BPE_PAYLOAD_SIZE && !zp->zp_encrypt &&
 		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
 		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
 			/*
 			 * The compressed payload fits inside the block
 			 * pointer: encode it there and finish through the
 			 * interlock pipeline.
 			 */
 			void *cbuf = abd_borrow_buf_copy(cabd, lsize);
 			encode_embedded_bp_compressed(bp,
 			    cbuf, compress, lsize, psize);
 			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
 			BP_SET_TYPE(bp, zio->io_prop.zp_type);
 			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
 			abd_return_buf(cabd, cbuf, lsize);
 			abd_free(cabd);
 			BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
 			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 			ASSERT(spa_feature_is_active(spa,
 			    SPA_FEATURE_EMBEDDED_DATA));
 			return (zio);
 		} else {
 			/*
 			 * Round compressed size up to the minimum allocation
 			 * size of the smallest-ashift device, and zero the
 			 * tail. This ensures that the compressed size of the
 			 * BP (and thus compressratio property) are correct,
 			 * in that we charge for the padding used to fill out
 			 * the last sector.
 			 */
 			size_t rounded = (size_t)zio_roundup_alloc_size(spa,
 			    psize);
 			if (rounded >= lsize) {
 				compress = ZIO_COMPRESS_OFF;
 				abd_free(cabd);
 				psize = lsize;
 			} else {
 				abd_zero_off(cabd, psize, rounded - psize);
 				psize = rounded;
 				zio_push_transform(zio, cabd,
 				    psize, lsize, NULL);
 			}
 		}
 
 		/*
 		 * We were unable to handle this as an override bp, treat
 		 * it as a regular write I/O.
 		 *
 		 * NOTE(review): this wording matches the fallback at the end
 		 * of zio_write_bp_init(), and io_bp_override has already been
 		 * asserted NULL above -- confirm that resetting the bp and
 		 * pipeline here is still intended.
 		 */
 		zio->io_bp_override = NULL;
 		*bp = zio->io_bp_orig;
 		zio->io_pipeline = zio->io_orig_pipeline;
 
 	} else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
 	    zp->zp_type == DMU_OT_DNODE) {
 		/*
 		 * The DMU actually relies on the zio layer's compression
 		 * to free metadnode blocks that have had all contained
 		 * dnodes freed. As a result, even when doing a raw
 		 * receive, we must check whether the block can be compressed
 		 * to a hole.
 		 */
 		if (abd_cmp_zero(zio->io_abd, lsize) == 0) {
 			psize = 0;
 			compress = ZIO_COMPRESS_OFF;
 		} else {
 			psize = lsize;
 		}
 	} else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS &&
 	    !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) {
 		/*
 		 * If we are raw receiving an encrypted dataset we should not
 		 * take this codepath because it will change the on-disk block
 		 * and decryption will fail.
 		 */
 		size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
 		    lsize);
 
 		if (rounded != psize) {
 			/* Copy into a zero-padded buffer of the rounded size. */
 			abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
 			abd_zero_off(cdata, psize, rounded - psize);
 			abd_copy_off(cdata, zio->io_abd, 0, 0, psize);
 			psize = rounded;
 			zio_push_transform(zio, cdata,
 			    psize, rounded, NULL);
 		}
 	} else {
 		ASSERT3U(psize, !=, 0);
 	}
 
 	/*
 	 * The final pass of spa_sync() must be all rewrites, but the first
 	 * few passes offer a trade-off: allocating blocks defers convergence,
 	 * but newly allocated blocks are sequential, so they can be written
 	 * to disk faster.  Therefore, we allow the first few passes of
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
 	if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
 	    BP_GET_PSIZE(bp) == psize &&
 	    pass >= zfs_sync_pass_rewrite) {
 		VERIFY3U(psize, !=, 0);
 		/* Keep whatever gang stages the current pipeline has. */
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
 		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
 	} else {
 		BP_ZERO(bp);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 	}
 
 	if (psize == 0) {
 		/*
 		 * Nothing to write.  If the original bp had a logical birth
 		 * txg and hole_birth is active, record size, type, level and
 		 * birth on the bp; either way only the interlock stages run.
 		 */
 		if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
 		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_TYPE(bp, zp->zp_type);
 			BP_SET_LEVEL(bp, zp->zp_level);
 			BP_SET_BIRTH(bp, zio->io_txg, 0);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_TYPE(bp, zp->zp_type);
 		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			ASSERT(!zp->zp_encrypt ||
 			    DMU_OT_IS_ENCRYPTED(zp->zp_type));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
 		if (zp->zp_nopwrite) {
 			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
 		}
 	}
 	return (zio);
 }
 
 /*
  * Free pipeline stage: logical frees of dedup blocks are switched to the
  * DDT free pipeline.  Always returns the zio.
  */
 static zio_t *
 zio_free_bp_init(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL && BP_GET_DEDUP(bp))
 		zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
 
 	ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Execute the I/O pipeline
  * ==========================================================================
  */
 
 /*
  * Hand the zio to one of the spa's zio taskqs so that zio_execute() runs
  * on it there.  'q' selects the taskq kind and 'cutinline' is passed
  * through to spa_taskq_dispatch(); both may be adjusted below.
  */
 static void
 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
 
 	/*
 	 * For a config writer or a probe, the normal issue and interrupt
 	 * threads may all be blocked waiting for the config lock, so the
 	 * otherwise-unused taskq for ZIO_TYPE_NULL is selected instead.
 	 *
 	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
 	 */
 	if ((zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) ||
 	    (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux))
 		t = ZIO_TYPE_NULL;
 
 	/*
 	 * High priority I/O goes to the high priority taskq when there is
 	 * one; when there is not, it cuts the line instead.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) {
 		if (spa->spa_zio_taskq[t][q + 1].stqs_count == 0)
 			cutinline = B_TRUE;
 		else
 			q++;
 	}
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
 	spa_taskq_dispatch(spa, t, q, zio_execute, zio, cutinline);
 }
 
 /*
  * Return B_TRUE if the taskq reported by taskq_of_curthread() is one of
  * the spa's zio taskqs of kind 'q', for any zio type.
  */
 static boolean_t
 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	spa_t *spa = zio->io_spa;
 	taskq_t *self = taskq_of_curthread();
 
 	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
 		spa_taskqs_t *group = &spa->spa_zio_taskq[t][q];
 
 		for (uint_t n = 0; n < group->stqs_count; n++) {
 			if (group->stqs_taskq[n] == self)
 				return (B_TRUE);
 		}
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Pipeline stage that hands the zio to an issue taskq.  Returns NULL, so
  * the caller does not continue with this zio itself.
  */
 static zio_t *
 zio_issue_async(zio_t *zio)
 {
 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || ZIO_HAS_ALLOCATOR(zio));
 
 	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 
 	return (NULL);
 }
 
 void
 zio_interrupt(void *zio)
 {
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
 /*
  * Like zio_interrupt(), but in kernel builds honors io_target_timestamp:
  * if that time has not been reached yet, the interrupt is scheduled for
  * later instead of being dispatched right away.
  */
 void
 zio_delay_interrupt(zio_t *zio)
 {
 	/*
 	 * timeout_generic() isn't defined in userspace; rather than trying
 	 * to implement it there, the zio delay functionality is simply
 	 * disabled for userspace builds.
 	 */
 
 #ifdef _KERNEL
 	/*
 	 * A zero io_target_timestamp means no delay was registered for this
 	 * IO, so this whole section is skipped and the IO goes directly to
 	 * the zio layer at the end of the function.
 	 */
 	if (zio->io_target_timestamp != 0) {
 		hrtime_t now = gethrtime();
 
 		if (now >= zio->io_target_timestamp) {
 			/*
 			 * The IO has already taken longer than the target
 			 * delay, so it is not delayed any further; the
 			 * delay is "missed" and the IO is issued directly
 			 * to the zio layer. This is likely due to the target
 			 * latency being set to a value less than the
 			 * underlying hardware can satisfy (e.g. delay set
 			 * to 1ms, but the disks take 10ms to complete an
 			 * IO request).
 			 */
 
 			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
 			    hrtime_t, now);
 
 			zio_interrupt(zio);
 			return;
 		}
 
 		hrtime_t diff = zio->io_target_timestamp - now;
 		int ticks = MAX(1, NSEC_TO_TICK(diff));
 		clock_t expire_at_tick = ddi_get_lbolt() + ticks;
 
 		DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
 		    hrtime_t, now, hrtime_t, diff);
 
 		/*
 		 * If no task could be allocated, just finish the zio
 		 * without a delay.
 		 */
 		if (taskq_dispatch_delay(system_taskq, zio_interrupt,
 		    zio, TQ_NOSLEEP, expire_at_tick) == TASKQID_INVALID)
 			zio_interrupt(zio);
 
 		return;
 	}
 #endif
 	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
 	zio_interrupt(zio);
 }
 
 /*
  * Log one zio of a hung I/O tree and then recurse into its children.
  *
  * A zio is logged (zfs_dbgmsg() plus a deadman ereport) when
  * zio_deadman_log_all is set or when the zio targets a leaf vdev.  For a
  * logged zio, if the deadman failmode is "continue" and the zio's taskq
  * entry is empty, the zio is re-dispatched through zio_interrupt().
  *
  * pio      - zio to report; its children are walked under pio->io_lock
  * ziodepth - depth of pio below the zio passed to zio_deadman(); only
  *            used in the log message
  */
 static void
 zio_deadman_impl(zio_t *pio, int ziodepth)
 {
 	zio_t *cio, *cio_next;
 	zio_link_t *zl = NULL;
 	vdev_t *vd = pio->io_vd;
 
 	if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
 		vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
 		zbookmark_phys_t *zb = &pio->io_bookmark;
 		uint64_t delta = gethrtime() - pio->io_timestamp;
 		uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
 
 		/*
 		 * Every value printed with %llu is cast to u_longlong_t so
 		 * that the variadic argument matches the conversion
 		 * regardless of the field's declared type.
 		 */
 		zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
 		    "delta=%llu queued=%llu io=%llu "
 		    "path=%s "
 		    "last=%llu type=%d "
 		    "priority=%d flags=0x%llx stage=0x%x "
 		    "pipeline=0x%x pipeline-trace=0x%x "
 		    "objset=%llu object=%llu "
 		    "level=%llu blkid=%llu "
 		    "offset=%llu size=%llu "
 		    "error=%d",
 		    ziodepth, pio, (u_longlong_t)pio->io_timestamp,
 		    (u_longlong_t)delta, (u_longlong_t)pio->io_delta,
 		    (u_longlong_t)pio->io_delay,
 		    vd ? vd->vdev_path : "NULL",
 		    (u_longlong_t)(vq ? vq->vq_io_complete_ts : 0),
 		    pio->io_type,
 		    pio->io_priority, (u_longlong_t)pio->io_flags,
 		    pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
 		    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
 		    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid,
 		    (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size,
 		    pio->io_error);
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
 		    pio->io_spa, vd, zb, pio, 0);
 
 		if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
 		    taskq_empty_ent(&pio->io_tqent)) {
 			zio_interrupt(pio);
 		}
 	}
 
 	mutex_enter(&pio->io_lock);
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		/* Fetch the next child before descending into this one. */
 		cio_next = zio_walk_children(pio, &zl);
 		zio_deadman_impl(cio, ziodepth + 1);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * Report a hung zio: log the critical information describing it and all
  * of its children through zfs_dbgmsg(), post a deadman event for the ZED,
  * and then act according to the pool's deadman failmode.  Does nothing if
  * the deadman is disabled or the pool is suspended.
  */
 void
 zio_deadman(zio_t *pio, const char *tag)
 {
 	spa_t *spa = pio->io_spa;
 	char *pool = spa_name(spa);
 
 	if (!zfs_deadman_enabled)
 		return;
 	if (spa_suspended(spa))
 		return;
 
 	zio_deadman_impl(pio, 0);
 
 	uint64_t failmode = spa_get_deadman_failmode(spa);
 
 	if (failmode == ZIO_FAILURE_MODE_WAIT) {
 		zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, pool);
 	} else if (failmode == ZIO_FAILURE_MODE_CONTINUE) {
 		zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, pool);
 	} else if (failmode == ZIO_FAILURE_MODE_PANIC) {
 		fm_panic("%s determined I/O to pool '%s' is hung.", tag, pool);
 	}
 }
 
 /*
  * Execute the I/O pipeline until one of the following occurs:
  * (1) the I/O completes; (2) the pipeline stalls waiting for
  * dependent child I/Os; (3) the I/O issues, so we're waiting
  * for an I/O completion interrupt; (4) the I/O is delegated by
  * vdev-level caching or aggregation; (5) the I/O is deferred
  * due to vdev-level queueing; (6) the I/O is handed off to
  * another thread.  In all cases, the pipeline stops whenever
  * there's no CPU work; it never burns a thread in cv_wait_io().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
  */
 static zio_pipe_stage_t *zio_pipeline[];
 
 /*
  * Externally visible entry point of the pipeline.  It only wraps the
  * static __zio_execute() so that the latter can be forced inline, which
  * reduces stack overhead; that matters because __zio_execute() is called
  * recursively in several zio code paths.  zio_execute() itself cannot be
  * inlined since it is visible outside this file.  The call is bracketed
  * by spl_fstrans_mark()/spl_fstrans_unmark().
  */
 void
 zio_execute(void *zio)
 {
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	__zio_execute(zio);
 
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Decide whether the current context lacks the stack needed for
  * zio_execute() to be called recursively; a minimum stack size of 16K is
  * required to avoid re-dispatching the zio.  Returns B_TRUE when the
  * caller should re-dispatch the zio and B_FALSE otherwise.  With
  * HAVE_LARGE_STACKS defined the answer is always B_FALSE.
  */
 static boolean_t
 zio_execute_stack_check(zio_t *zio)
 {
 #if !defined(HAVE_LARGE_STACKS)
 	dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
 
 	if (dp == NULL)
 		return (B_FALSE);
 
 	/* Executing in txg_sync_thread() context. */
 	if (curthread == dp->dp_tx.tx_sync_thread)
 		return (B_TRUE);
 
 	/* Pool initialization outside of zio_taskq context. */
 	if (spa_is_initializing(dp->dp_spa) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
 	    !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
 		return (B_TRUE);
 
 	return (B_FALSE);
 #else
 	(void) zio;
 
 	return (B_FALSE);
 #endif /* HAVE_LARGE_STACKS */
 }
 
 __attribute__((always_inline))
 static inline void
 __zio_execute(zio_t *zio)
 {
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
 
 		zio->io_executor = curthread;
 
 		ASSERT(!MUTEX_HELD(&zio->io_lock));
 		ASSERT(ISP2(stage));
 		ASSERT(zio->io_stall == NULL);
 
 		do {
 			stage <<= 1;
 		} while ((stage & pipeline) == 0);
 
 		ASSERT(stage <= ZIO_STAGE_DONE);
 
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,
 		 * or may wait for an I/O that needs an interrupt thread
 		 * to complete, issue async to avoid deadlock.
 		 *
 		 * For VDEV_IO_START, we cut in line so that the io will
 		 * be sent to disk promptly.
 		 */
 		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
 		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		/*
 		 * If the current context doesn't have large enough stacks
 		 * the zio must be issued asynchronously to prevent overflow.
 		 */
 		if (zio_execute_stack_check(zio)) {
 			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 			    zio_requeue_io_start_cut_in_line : B_FALSE;
 			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
 			return;
 		}
 
 		zio->io_stage = stage;
 		zio->io_pipeline_trace |= zio->io_stage;
 
 		/*
 		 * The zio pipeline stage returns the next zio to execute
 		 * (typically the same as this one), or NULL if we should
 		 * stop.
 		 */
 		zio = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (zio == NULL)
 			return;
 	}
 }
 
 
 /*
  * ==========================================================================
  * Initiate I/O, either sync or async
  * ==========================================================================
  */
 int
 zio_wait(zio_t *zio)
 {
 	/*
 	 * Some routines, like zio_free_sync(), may return a NULL zio
 	 * to avoid the performance overhead of creating and then destroying
 	 * an unneeded zio.  For the callers' simplicity, we accept a NULL
 	 * zio and ignore it.
 	 */
 	if (zio == NULL)
 		return (0);
 
 	long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
 	int error;
 
 	ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	zio->io_waiter = curthread;
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		spa_select_allocator(zio);
 	}
 	__zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_executor != NULL) {
 		error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
 		    ddi_get_lbolt() + timeout);
 
 		if (zfs_deadman_enabled && error == -1 &&
 		    gethrtime() - zio->io_queued_timestamp >
 		    spa_deadman_ziotime(zio->io_spa)) {
 			mutex_exit(&zio->io_lock);
 			timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
 			zio_deadman(zio, FTAG);
 			mutex_enter(&zio->io_lock);
 		}
 	}
 	mutex_exit(&zio->io_lock);
 
 	error = zio->io_error;
 	zio_destroy(zio);
 
 	return (error);
 }
 
 void
 zio_nowait(zio_t *zio)
 {
 	/*
 	 * See comment in zio_wait().
 	 */
 	if (zio == NULL)
 		return;
 
 	ASSERT3P(zio->io_executor, ==, NULL);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    list_is_empty(&zio->io_parent_list)) {
 		zio_t *pio;
 
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
 		 * We add it to the spa_async_root_zio "Godfather" I/O which
 		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
 		pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
 
 		zio_add_child(pio, zio);
 	}
 
 	ASSERT0(zio->io_queued_timestamp);
 	zio->io_queued_timestamp = gethrtime();
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		spa_select_allocator(zio);
 	}
 	__zio_execute(zio);
 }
 
 /*
  * ==========================================================================
  * Reexecute, cancel, or suspend/resume failed I/O
  * ==========================================================================
  */
 
 static void
 zio_reexecute(void *arg)
 {
 	zio_t *pio = arg;
 	zio_t *cio, *cio_next, *gio;
 
 	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 	ASSERT(pio->io_gang_leader == NULL);
 	ASSERT(pio->io_gang_tree == NULL);
 
 	mutex_enter(&pio->io_lock);
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	pio->io_state[ZIO_WAIT_READY] = (pio->io_stage >= ZIO_STAGE_READY) ||
 	    (pio->io_pipeline & ZIO_STAGE_READY) == 0;
 	pio->io_state[ZIO_WAIT_DONE] = (pio->io_stage >= ZIO_STAGE_DONE);
 
 	/*
 	 * It's possible for a failed ZIO to be a descendant of more than one
 	 * ZIO tree. When reexecuting it, we have to be sure to add its wait
 	 * states to all parent wait counts.
 	 *
 	 * Those parents, in turn, may have other children that are currently
 	 * active, usually because they've already been reexecuted after
 	 * resuming. Those children may be executing and may call
 	 * zio_notify_parent() at the same time as we're updating our parent's
 	 * counts. To avoid races while updating the counts, we take
 	 * gio->io_lock before each update.
 	 */
 	zio_link_t *zl = NULL;
 	while ((gio = zio_walk_parents(pio, &zl)) != NULL) {
 		mutex_enter(&gio->io_lock);
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++) {
 			gio->io_children[pio->io_child_type][w] +=
 			    !pio->io_state[w];
 		}
 		mutex_exit(&gio->io_lock);
 	}
 
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
 	if (IO_IS_ALLOCATING(pio))
 		BP_ZERO(pio->io_bp);
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
 	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
 	zl = NULL;
 	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
 		cio_next = zio_walk_children(pio, &zl);
 		mutex_exit(&pio->io_lock);
 		zio_reexecute(cio);
 		mutex_enter(&pio->io_lock);
 	}
 	mutex_exit(&pio->io_lock);
 
 	/*
 	 * Now that all children have been reexecuted, execute the parent.
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on it.
 	 */
 	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
 		pio->io_queued_timestamp = gethrtime();
 		__zio_execute(pio);
 	}
 }
 
 void
 zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
 {
 	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
 		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
 		    "failure and the failure mode property for this pool "
 		    "is set to panic.", spa_name(spa));
 
 	if (reason != ZIO_SUSPEND_MMP) {
 		cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
 		    "I/O failure and has been suspended.", spa_name(spa));
 	}
 
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
 	    NULL, NULL, 0);
 
 	mutex_enter(&spa->spa_suspend_lock);
 
 	if (spa->spa_suspend_zio_root == NULL)
 		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 
 	spa->spa_suspended = reason;
 
 	if (zio != NULL) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 		ASSERT(zio != spa->spa_suspend_zio_root);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(zio_unique_parent(zio) == NULL);
 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 		zio_add_child(spa->spa_suspend_zio_root, zio);
 	}
 
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 int
 zio_resume(spa_t *spa)
 {
 	zio_t *pio;
 
 	/*
 	 * Reexecute all previously suspended i/o.
 	 */
 	mutex_enter(&spa->spa_suspend_lock);
 	if (spa->spa_suspended != ZIO_SUSPEND_NONE)
 		cmn_err(CE_WARN, "Pool '%s' was suspended and is being "
 		    "resumed. Failed I/O will be retried.",
 		    spa_name(spa));
 	spa->spa_suspended = ZIO_SUSPEND_NONE;
 	cv_broadcast(&spa->spa_suspend_cv);
 	pio = spa->spa_suspend_zio_root;
 	spa->spa_suspend_zio_root = NULL;
 	mutex_exit(&spa->spa_suspend_lock);
 
 	if (pio == NULL)
 		return (0);
 
 	zio_reexecute(pio);
 	return (zio_wait(pio));
 }
 
 void
 zio_resume_wait(spa_t *spa)
 {
 	mutex_enter(&spa->spa_suspend_lock);
 	while (spa_suspended(spa))
 		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
 	mutex_exit(&spa->spa_suspend_lock);
 }
 
 /*
  * ==========================================================================
  * Gang blocks.
  *
  * A gang block is a collection of small blocks that looks to the DMU
  * like one large block.  When zio_dva_allocate() cannot find a block
  * of the requested size, due to either severe fragmentation or the pool
  * being nearly full, it calls zio_write_gang_block() to construct the
  * block from smaller fragments.
  *
  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
  * an indirect block: it's an array of block pointers.  It consumes
  * only one sector and hence is allocatable regardless of fragmentation.
  * The gang header's bps point to its gang members, which hold the data.
  *
  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
  * as the verifier to ensure uniqueness of the SHA256 checksum.
  * Critically, the gang block bp's blk_cksum is the checksum of the data,
  * not the gang header.  This ensures that data block signatures (needed for
  * deduplication) are independent of how the block is physically stored.
  *
  * Gang blocks can be nested: a gang member may itself be a gang block.
  * Thus every gang block is a tree in which root and all interior nodes are
  * gang headers, and the leaves are normal blocks that contain user data.
  * The root of the gang tree is called the gang leader.
  *
  * To perform any operation (read, rewrite, free, claim) on a gang block,
  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
  * in the io_gang_tree field of the original logical i/o by recursively
  * reading the gang leader and all gang headers below it.  This yields
  * an in-core tree containing the contents of every gang header and the
  * bps for every constituent of the gang block.
  *
  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
  * of the gang header plus zio_checksum_compute() of the data to update the
  * gang header's blk_cksum as described above.
  *
  * The two-phase assemble/issue model solves the problem of partial failure --
  * what if you'd freed part of a gang block but then couldn't read the
  * gang header for another part?  Assembling the entire gang tree first
  * ensures that all the necessary gang header I/O has succeeded before
  * starting the actual work of free, claim, or write.  Once the gang tree
  * is assembled, free and claim are in-memory operations that cannot fail.
  *
  * In the event that a gang write fails, zio_dva_unallocate() walks the
  * gang tree to immediately free (i.e. insert back into the space map)
  * everything we've allocated.  This ensures that we don't get ENOSPC
  * errors during repeated suspend/resume cycles due to a flaky device.
  *
  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
  * the gang tree, we won't modify the block, so we can safely defer the free
  * (knowing that the block is still intact).  If we *can* assemble the gang
  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
  * each constituent bp and we can allocate a new block on the next sync pass.
  *
  * In all cases, the gang tree allows complete recovery from partial failure.
  * ==========================================================================
  */
 
 static void
 zio_gang_issue_func_done(zio_t *zio)
 {
 	abd_free(zio->io_abd);
 }
 
 static zio_t *
 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	if (gn != NULL)
 		return (pio);
 
 	return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
 	    BP_GET_PSIZE(bp), zio_gang_issue_func_done,
 	    NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 	    &pio->io_bookmark));
 }
 
 static zio_t *
 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	zio_t *zio;
 
 	if (gn != NULL) {
 		abd_t *gbh_abd =
 		    abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
 		    &pio->io_bookmark);
 		/*
 		 * As we rewrite each gang header, the pipeline will compute
 		 * a new gang block header checksum for it; but no one will
 		 * compute a new data checksum, so we do that here.  The one
 		 * exception is the gang leader: the pipeline already computed
 		 * its data checksum because that stage precedes gang assembly.
 		 * (Presently, nothing actually uses interior data checksums;
 		 * this is just good hygiene.)
 		 */
 		if (gn != pio->io_gang_leader->io_gang_tree) {
 			abd_t *buf = abd_get_offset(data, offset);
 
 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
 			    buf, BP_GET_PSIZE(bp));
 
 			abd_free(buf);
 		}
 		/*
 		 * If we are here to damage data for testing purposes,
 		 * leave the GBH alone so that we can detect the damage.
 		 */
 		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	} else {
 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
 		    abd_get_offset(data, offset), BP_GET_PSIZE(bp),
 		    zio_gang_issue_func_done, NULL, pio->io_priority,
 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 	}
 
 	return (zio);
 }
 
 static zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn, (void) data, (void) offset;
 
 	zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
 	    ZIO_GANG_CHILD_FLAGS(pio));
 	if (zio == NULL) {
 		zio = zio_null(pio, pio->io_spa,
 		    NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio));
 	}
 	return (zio);
 }
 
 static zio_t *
 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
     uint64_t offset)
 {
 	(void) gn, (void) data, (void) offset;
 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
 	NULL,
 	zio_read_gang,
 	zio_rewrite_gang,
 	zio_free_gang,
 	zio_claim_gang,
 	NULL
 };
 
 static void zio_gang_tree_assemble_done(zio_t *zio);
 
 static zio_gang_node_t *
 zio_gang_node_alloc(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn;
 
 	ASSERT(*gnpp == NULL);
 
 	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
 	*gnpp = gn;
 
 	return (gn);
 }
 
 static void
 zio_gang_node_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
 		ASSERT(gn->gn_child[g] == NULL);
 
 	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 	kmem_free(gn, sizeof (*gn));
 	*gnpp = NULL;
 }
 
 static void
 zio_gang_tree_free(zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = *gnpp;
 
 	if (gn == NULL)
 		return;
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
 		zio_gang_tree_free(&gn->gn_child[g]);
 
 	zio_gang_node_free(gnpp);
 }
 
 static void
 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
 {
 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
 	abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
 
 	ASSERT(gio->io_gang_leader == gio);
 	ASSERT(BP_IS_GANG(bp));
 
 	zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_gang_tree_assemble_done, gn, gio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
 }
 
 static void
 zio_gang_tree_assemble_done(zio_t *zio)
 {
 	zio_t *gio = zio->io_gang_leader;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(gio == zio_unique_parent(zio));
 	ASSERT(list_is_empty(&zio->io_child_list));
 
 	if (zio->io_error)
 		return;
 
 	/* this ABD was created from a linear buf in zio_gang_tree_assemble */
 	if (BP_SHOULD_BYTESWAP(bp))
 		byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
 
 	ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
 	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 	abd_free(zio->io_abd);
 
 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 		if (!BP_IS_GANG(gbp))
 			continue;
 		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
 	}
 }
 
 static void
 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
     uint64_t offset)
 {
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 
 	ASSERT(BP_IS_GANG(bp) == !!gn);
 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
 
 	/*
 	 * If you're a gang header, your data is in gn->gn_gbh.
 	 * If you're a gang member, your data is in 'data' and gn == NULL.
 	 */
 	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
 
 	if (gn != NULL) {
 		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
 
 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
 			if (BP_IS_HOLE(gbp))
 				continue;
 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
 			    offset);
 			offset += BP_GET_PSIZE(gbp);
 		}
 	}
 
 	if (gn == gio->io_gang_tree)
 		ASSERT3U(gio->io_size, ==, offset);
 
 	if (zio != pio)
 		zio_nowait(zio);
 }
 
 static zio_t *
 zio_gang_assemble(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	zio->io_gang_leader = zio;
 
 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
 
 	return (zio);
 }
 
 static zio_t *
 zio_gang_issue(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 
 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
 		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
 		    0);
 	else
 		zio_gang_tree_free(&zio->io_gang_tree);
 
 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 static void
 zio_gang_inherit_allocator(zio_t *pio, zio_t *cio)
 {
 	cio->io_allocator = pio->io_allocator;
 }
 
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
 	zio_t *gio __maybe_unused = zio->io_gang_leader;
 
 	if (BP_IS_HOLE(zio->io_bp))
 		return;
 
-	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+	/*
+	 * Gang children that zio_write_gang_block() preallocated
+	 * (ZIO_FLAG_PREALLOCATED) may arrive with io_bp_orig already set.
+	 */
+	ASSERT(BP_IS_HOLE(&zio->io_bp_orig) ||
+	    zio->io_flags & ZIO_FLAG_PREALLOCATED);
 
 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
 	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
 	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
 	VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
 
 	mutex_enter(&pio->io_lock);
 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
 		ASSERT(DVA_GET_GANG(&pdva[d]));
 		asize = DVA_GET_ASIZE(&pdva[d]);
 		asize += DVA_GET_ASIZE(&cdva[d]);
 		DVA_SET_ASIZE(&pdva[d], asize);
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 static void
 zio_write_gang_done(zio_t *zio)
 {
 	/*
 	 * The io_abd field will be NULL for a zio with no data.  The io_flags
 	 * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
 	 * check for it here as it is cleared in zio_ready.
 	 */
 	if (zio->io_abd != NULL)
 		abd_free(zio->io_abd);
 }
 
 static zio_t *
 zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 {
 	spa_t *spa = pio->io_spa;
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
 	zio_gang_node_t *gn, **gnpp;
 	zio_gbh_phys_t *gbh;
 	abd_t *gbh_abd;
 	uint64_t txg = pio->io_txg;
 	uint64_t resid = pio->io_size;
-	uint64_t psize;
 	zio_prop_t zp;
 	int error;
 	boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
 
 	/*
 	 * Store multiple copies of the GBH, so that we can still traverse
 	 * all the data (e.g. to free or scrub) even if a block is damaged.
 	 * This value respects the redundant_metadata property.
 	 */
 	int gbh_copies = gio->io_prop.zp_gang_copies;
 	if (gbh_copies == 0) {
 		/*
 		 * This should only happen in the case where we're filling in
 		 * DDT entries for a parent that wants more copies than the DDT
 		 * has.  In that case, we cannot gang without creating a mixed
 		 * blkptr, which is illegal.
 		 */
 		ASSERT3U(gio->io_child_type, ==, ZIO_CHILD_DDT);
 		pio->io_error = EAGAIN;
 		return (pio);
 	}
 	ASSERT3S(gbh_copies, >, 0);
 	ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
 
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	int flags = METASLAB_GANG_HEADER;
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 		ASSERT(has_data);
 
 		flags |= METASLAB_ASYNC_ALLOC;
 	}
 
 	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
 	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
 	    &pio->io_alloc_list, pio->io_allocator, pio);
 	if (error) {
 		pio->io_error = error;
 		return (pio);
 	}
 
 	if (pio == gio) {
 		gnpp = &gio->io_gang_tree;
 	} else {
 		gnpp = pio->io_private;
 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
 	}
 
 	gn = zio_gang_node_alloc(gnpp);
 	gbh = gn->gn_gbh;
 	memset(gbh, 0, SPA_GANGBLOCKSIZE);
 	gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
 
 	/*
 	 * Create the gang header.
 	 */
 	zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
 	    zio_write_gang_done, NULL, pio->io_priority,
 	    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
 
 	zio_gang_inherit_allocator(pio, zio);
 	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 		boolean_t more;
 		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies,
 		    zio, B_TRUE, &more));
 	}
 
 	/*
-	 * Create and nowait the gang children.
+	 * Create and nowait the gang children.  For each child we first try
+	 * an opportunistic range allocation; if that fails, we fall back to
+	 * a normal zio_write(), which may itself become a nested gang block.
 	 */
-	for (int g = 0; resid != 0; resid -= psize, g++) {
-		psize = zio_roundup_alloc_size(spa,
-		    resid / (SPA_GBH_NBLKPTRS - g));
-		psize = MIN(resid, psize);
-		ASSERT3U(psize, >=, SPA_MINBLOCKSIZE);
-
+	for (int g = 0; resid != 0; g++) {
+		flags &= METASLAB_ASYNC_ALLOC;
+		flags |= METASLAB_GANG_CHILD;
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
 		zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
 		zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
 		zp.zp_dedup = B_FALSE;
 		zp.zp_dedup_verify = B_FALSE;
 		zp.zp_nopwrite = B_FALSE;
 		zp.zp_encrypt = gio->io_prop.zp_encrypt;
 		zp.zp_byteorder = gio->io_prop.zp_byteorder;
 		zp.zp_direct_write = B_FALSE;
 		memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN);
 		memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN);
 		memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN);
 
-		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
-		    has_data ? abd_get_offset(pio->io_abd, pio->io_size -
-		    resid) : NULL, psize, psize, &zp,
-		    zio_write_gang_member_ready, NULL,
+		uint64_t min_size = zio_roundup_alloc_size(spa,
+		    resid / (SPA_GBH_NBLKPTRS - g));
+		min_size = MIN(min_size, resid);
+		bp = &gbh->zg_blkptr[g];
+
+		zio_alloc_list_t cio_list;
+		metaslab_trace_init(&cio_list);
+		uint64_t allocated_size = UINT64_MAX;
+		error = metaslab_alloc_range(spa, mc, min_size, resid,
+		    bp, gio->io_prop.zp_copies, txg, NULL,
+		    flags, &cio_list, zio->io_allocator, NULL, &allocated_size);
+
+		boolean_t allocated = error == 0;
+
+		uint64_t psize = allocated ? MIN(resid, allocated_size) :
+		    min_size;
+
+		zio_t *cio = zio_write(zio, spa, txg, bp, has_data ?
+		    abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL,
+		    psize, psize, &zp, zio_write_gang_member_ready, NULL,
 		    zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
-		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+		    ZIO_GANG_CHILD_FLAGS(pio) |
+		    (allocated ? ZIO_FLAG_PREALLOCATED : 0), &pio->io_bookmark);
 
+		resid -= psize;
 		zio_gang_inherit_allocator(zio, cio);
+		if (allocated) {
+			metaslab_trace_move(&cio_list, &cio->io_alloc_list);
+			metaslab_group_alloc_increment_all(spa,
+			    &cio->io_bp_orig, zio->io_allocator, flags, psize,
+			    cio);
+		}
 		/*
 		 * We do not reserve for the child writes, since we already
 		 * reserved for the parent.  Unreserve though will be called
 		 * for individual children.  We can do this since sum of all
 		 * child's physical sizes is equal to parent's physical size.
 		 * It would not work for potentially bigger allocation sizes.
 		 */
 
 		zio_nowait(cio);
 	}
 
 	/*
 	 * Set pio's pipeline to just wait for zio to finish.
 	 */
 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	zio_nowait(zio);
 
 	return (pio);
 }
 
 /*
  * The zio_nop_write stage in the pipeline determines if allocating a
  * new bp is necessary.  The nopwrite feature can handle writes in
  * either syncing or open context (i.e. zil writes) and as a result is
  * mutually exclusive with dedup.
  *
  * By leveraging a cryptographically secure checksum, such as SHA256, we
  * can compare the checksums of the new data and the old to determine if
  * allocating a new block is required.  Note that our requirements for
  * cryptographic strength are fairly weak: there can't be any accidental
  * hash collisions, but we don't need to be secure against intentional
  * (malicious) collisions.  To trigger a nopwrite, you have to be able
  * to write the file to begin with, and triggering an incorrect (hash
  * collision) nopwrite is no worse than simply writing to the file.
  * That said, there are no known attacks against the checksum algorithms
  * used for nopwrite, assuming that the salt and the checksums
  * themselves remain secret.
  */
 static zio_t *
 zio_nop_write(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	zio_prop_t *zp = &zio->io_prop;
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(BP_GET_LEVEL(bp) == 0);
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 	ASSERT(zp->zp_nopwrite);
 	ASSERT(!zp->zp_dedup);
 	ASSERT(zio->io_bp_override == NULL);
 	ASSERT(IO_IS_ALLOCATING(zio));
 
 	/*
 	 * Check to see if the original bp and the new bp have matching
 	 * characteristics (i.e. same checksum, compression algorithms, etc).
 	 * If they don't then just continue with the pipeline which will
 	 * allocate a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
 	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
 	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
 	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
 		return (zio);
 
 	/*
 	 * If the checksums match then reset the pipeline so that we
 	 * avoid allocating a new bp and issuing any I/O.
 	 */
 	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
 		ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE);
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
 		ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop);
 
 		/*
 		 * If we're overwriting a block that is currently on an
 		 * indirect vdev, then ignore the nopwrite request and
 		 * allow a new block to be allocated on a concrete vdev.
 		 */
 		spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
 		for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) {
 			vdev_t *tvd = vdev_lookup_top(zio->io_spa,
 			    DVA_GET_VDEV(&bp_orig->blk_dva[d]));
 			if (tvd->vdev_ops == &vdev_indirect_ops) {
 				spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
 				return (zio);
 			}
 		}
 		spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
 
 		*bp = *bp_orig;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		zio->io_flags |= ZIO_FLAG_NOPWRITE;
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Block Reference Table
  * ==========================================================================
  */
 static zio_t *
 zio_brt_free(zio_t *zio)
 {
 	blkptr_t *bp;
 
 	bp = zio->io_bp;
 
 	if (BP_GET_LEVEL(bp) > 0 ||
 	    BP_IS_METADATA(bp) ||
 	    !brt_maybe_exists(zio->io_spa, bp)) {
 		return (zio);
 	}
 
 	if (!brt_entry_decref(zio->io_spa, bp)) {
 		/*
 		 * This isn't the last reference, so we cannot free
 		 * the data yet.
 		 */
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Dedup
  * ==========================================================================
  */
 static void
 zio_ddt_child_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt;
 	ddt_entry_t *dde = zio->io_private;
 	zio_t *pio = zio_unique_parent(zio);
 
 	mutex_enter(&pio->io_lock);
 	ddt = ddt_select(zio->io_spa, bp);
 
 	if (zio->io_error == 0) {
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 		/* this phys variant doesn't need repair */
 		ddt_phys_clear(dde->dde_phys, v);
 	}
 
 	if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
 		dde->dde_io->dde_repair_abd = zio->io_abd;
 	else
 		abd_free(zio->io_abd);
 	mutex_exit(&pio->io_lock);
 }
 
 static zio_t *
 zio_ddt_read_start(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	/*
 	 * Common path: no DDT child error is recorded, so read the block
 	 * itself into our own buffer through a single DDT child.
 	 */
 	if (!zio->io_child_error[ZIO_CHILD_DDT]) {
 		zio_nowait(zio_read(zio, zio->io_spa, bp,
 		    zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
 		return (zio);
 	}
 
 	/*
 	 * Repair path: a DDT child has already failed.  Start a repair on
 	 * the entry and stash it in io_vsd for zio_ddt_read_done().
 	 */
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde = ddt_repair_start(ddt, bp);
 	ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp);
 	ddt_univ_phys_t *ddp = dde->dde_phys;
 	blkptr_t blk;
 
 	ASSERT(zio->io_vsd == NULL);
 	zio->io_vsd = dde;
 
 	/* Our bp matches no variant of the entry: nothing else to read. */
 	if (v_self == DDT_PHYS_NONE)
 		return (zio);
 
 	/* issue I/O for the other copies */
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 		/* Only variants that were born and are not our own. */
 		if (ddt_phys_birth(ddp, v) != 0 && v != v_self) {
 			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key,
 			    ddp, v, &blk);
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    abd_alloc_for_io(zio->io_size, B_TRUE),
 			    zio->io_size, zio_ddt_child_read_done, dde,
 			    zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
 			    ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
 		}
 	}
 
 	return (zio);
 }
 
 static zio_t *
 zio_ddt_read_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 
 	/* Come back later if DDT children are still outstanding. */
 	if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE))
 		return (NULL);
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	/* The read succeeded; no repair state may be attached. */
 	if (!zio->io_child_error[ZIO_CHILD_DDT]) {
 		ASSERT(zio->io_vsd == NULL);
 		return (zio);
 	}
 
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde = zio->io_vsd;
 
 	/* Without a DDT (only expected during pool load) keep the error. */
 	if (ddt == NULL) {
 		ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
 		return (zio);
 	}
 
 	/*
 	 * No repair has been started yet: rewind so that the next stage
 	 * run is DDT_READ_START and redispatch to the issue taskq.
 	 */
 	if (dde == NULL) {
 		zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
 		return (NULL);
 	}
 
 	/* A good alternate copy was read: use it and clear the error. */
 	if (dde->dde_io->dde_repair_abd != NULL) {
 		abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
 		    zio->io_size);
 		zio->io_child_error[ZIO_CHILD_DDT] = 0;
 	}
 
 	ddt_repair_done(ddt, dde);
 	zio->io_vsd = NULL;
 
 	return (zio);
 }
 
 /*
  * Check whether the data carried by a dedup write differs from the data the
  * DDT entry already refers to.
  *
  * The DDT lock is dropped while existing data is read back for comparison
  * and retaken before returning, so the caller is expected to hold it.
  *
  * Returns B_TRUE if a mismatch was found or the existing copy could not be
  * read, B_FALSE if the data matched or there was nothing to compare against.
  */
 static boolean_t
 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
 {
 	spa_t *spa = zio->io_spa;
 	boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
 
 	ASSERT(!(zio->io_bp_override && do_raw));
 
 	/*
 	 * Note: we compare the original data, not the transformed data,
 	 * because when zio->io_bp is an override bp, we will not have
 	 * pushed the I/O transforms.  That's an important optimization
 	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
 	 * However, we should never get a raw, override zio so in these
 	 * cases we can compare the io_abd directly. This is useful because
 	 * it allows us to do dedup verification even if we don't have access
 	 * to the original data (for instance, if the encryption keys aren't
 	 * loaded).
 	 */
 
 	/*
 	 * First choice: compare against a write still in flight for this
 	 * entry.  The first non-ditto slot with a lead zio decides the
 	 * result; no I/O is needed.
 	 */
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		if (DDT_PHYS_IS_DITTO(ddt, p))
 			continue;
 
 		if (dde->dde_io == NULL)
 			continue;
 
 		zio_t *lio = dde->dde_io->dde_lead_zio[p];
 		if (lio == NULL)
 			continue;
 
 		if (do_raw)
 			return (lio->io_size != zio->io_size ||
 			    abd_cmp(zio->io_abd, lio->io_abd) != 0);
 
 		return (lio->io_orig_size != zio->io_orig_size ||
 		    abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
 	}
 
 	/*
 	 * Otherwise compare against the first phys variant with a non-zero
 	 * birth, reading it back from the pool.
 	 */
 	for (int p = 0; p < DDT_NPHYS(ddt); p++) {
 		ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 		uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v);
 
 		if (phys_birth != 0 && do_raw) {
 			/* Raw write: compare the on-disk (physical) bytes. */
 			blkptr_t blk = *zio->io_bp;
 			uint64_t psize;
 			abd_t *tmpabd;
 			int error;
 
 			ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
 			psize = BP_GET_PSIZE(&blk);
 
 			/* Different physical size: cannot be the same data. */
 			if (psize != zio->io_size)
 				return (B_TRUE);
 
 			/* Don't hold the DDT lock across the blocking read. */
 			ddt_exit(ddt);
 
 			tmpabd = abd_alloc_for_io(psize, B_TRUE);
 
 			error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
 			    psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 			    ZIO_FLAG_RAW, &zio->io_bookmark));
 
 			/* A mismatch is folded into the error result. */
 			if (error == 0) {
 				if (abd_cmp(tmpabd, zio->io_abd) != 0)
 					error = SET_ERROR(ENOENT);
 			}
 
 			abd_free(tmpabd);
 			ddt_enter(ddt);
 			return (error != 0);
 		} else if (phys_birth != 0) {
 			/* Non-raw write: compare the logical data via the ARC. */
 			arc_buf_t *abuf = NULL;
 			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
 			ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth);
 
 			/* Different logical size: cannot be the same data. */
 			if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
 				return (B_TRUE);
 
 			/* Don't hold the DDT lock across the blocking read. */
 			ddt_exit(ddt);
 
 			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
 
 			if (error == 0) {
 				if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
 				    zio->io_orig_size) != 0)
 					error = SET_ERROR(ENOENT);
 				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
 			return (error != 0);
 		}
 	}
 
 	/* Nothing in flight and nothing on disk to compare against. */
 	return (B_FALSE);
 }
 
 /*
  * Done callback for the physical child write created by zio_ddt_write().
  *
  * On failure the entry's phys variant is reverted to the saved state.  On
  * success a reference is added for every waiting parent that is not itself
  * a DDT child, and the saved state is either cleared or refreshed depending
  * on whether another lead write is still outstanding.
  */
 static void
 zio_ddt_child_write_done(zio_t *zio)
 {
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 
 	/* There must be at least one parent waiting on this write. */
 	zio_link_t *zl = NULL;
 	ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
 
 	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 	ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	ddt_enter(ddt);
 
 	/* we're the lead, so once we're done there's no one else outstanding */
 	if (dde->dde_io->dde_lead_zio[p] == zio)
 		dde->dde_io->dde_lead_zio[p] = NULL;
 
 	/* Saved copy of the variant, taken when the write chain started. */
 	ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys;
 
 	if (zio->io_error != 0) {
 		/*
 		 * The write failed, so we're about to abort the entire IO
 		 * chain. We need to revert the entry back to what it was at
 		 * the last time it was successfully extended.
 		 */
 		ddt_phys_unextend(ddp, orig, v);
 		ddt_phys_clear(orig, v);
 
 		ddt_exit(ddt);
 		return;
 	}
 
 	/*
 	 * Add references for all dedup writes that were waiting on the
 	 * physical one, skipping any other physical writes that are waiting.
 	 */
 	zio_t *pio;
 	zl = NULL;
 	while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 		if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
 			ddt_phys_addref(ddp, v);
 	}
 
 	/*
 	 * We've successfully added new DVAs to the entry. Clear the saved
 	 * state or, if there's still outstanding IO, remember it so we can
 	 * revert to a known good state if that IO fails.
 	 */
 	if (dde->dde_io->dde_lead_zio[p] == NULL)
 		ddt_phys_clear(orig, v);
 	else
 		ddt_phys_copy(orig, ddp, v);
 
 	ddt_exit(ddt);
 }
 
 /*
  * Ready callback for the physical child write created by zio_ddt_write().
  *
  * Extends the DDT entry with the DVAs of the child's bp and fills the bp of
  * every waiting parent that is not itself a DDT child.  Does nothing when
  * the child has an error.
  */
 static void
 zio_ddt_child_write_ready(zio_t *zio)
 {
 	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
 	ddt_entry_t *dde = zio->io_private;
 
 	/* There must be at least one parent waiting on this write. */
 	zio_link_t *zl = NULL;
 	ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL);
 
 	int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 
 	/*
 	 * If the entry's variant is flagged as a gang block, call
 	 * metaslab_group_alloc_decrement() for each DVA in our bp and fail
 	 * this write with EAGAIN.
 	 * NOTE(review): presumably a ganged entry cannot be extended with
 	 * these DVAs -- confirm against ddt_phys_extend().
 	 */
 	if (ddt_phys_is_gang(dde->dde_phys, v)) {
 		for (int i = 0; i < BP_GET_NDVAS(zio->io_bp); i++) {
 			dva_t *d = &zio->io_bp->blk_dva[i];
 			metaslab_group_alloc_decrement(zio->io_spa,
 			    DVA_GET_VDEV(d), zio->io_allocator,
 			    METASLAB_ASYNC_ALLOC, zio->io_size, zio);
 		}
 		zio->io_error = EAGAIN;
 	}
 
 	/* On any error leave the entry and the parents untouched. */
 	if (zio->io_error != 0)
 		return;
 
 	ddt_enter(ddt);
 
 	/* Record the newly written DVAs in the entry. */
 	ddt_phys_extend(dde->dde_phys, v, zio->io_bp);
 
 	/* Hand the entry's contents to every waiting logical parent. */
 	zio_t *pio;
 	zl = NULL;
 	while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 		if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD))
 			ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg);
 	}
 
 	ddt_exit(ddt);
 }
 
 /*
  * Pipeline stage for dedup writes.
  *
  * Looks the block up in the DDT and then does one of the following:
  * fills the bp from DVAs already in the entry, chains this zio behind a
  * write that is already in flight for the entry, issues a child write for
  * the copies that are still missing, or turns the zio into an ordinary
  * (non-dedup) write.  Always returns zio.
  */
 static zio_t *
 zio_ddt_write(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t txg = zio->io_txg;
 	zio_prop_t *zp = &zio->io_prop;
 	ddt_t *ddt = ddt_select(spa, bp);
 	ddt_entry_t *dde;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
 	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
 	ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
 	/*
 	 * Deduplication will not take place for Direct I/O writes. The
 	 * ddt_tree will be emptied in syncing context. Direct I/O writes take
 	 * place in the open-context. Direct I/O write can not attempt to
 	 * modify the ddt_tree while issuing out a write.
 	 */
 	ASSERT3B(zio->io_prop.zp_direct_write, ==, B_FALSE);
 
 	ddt_enter(ddt);
 	/*
 	 * Search DDT for matching entry.  Skip DVAs verification here, since
 	 * they can go only from override, and once we get here the override
 	 * pointer can't have "D" flag to be confused with pruned DDT entries.
 	 */
 	IMPLY(zio->io_bp_override, !BP_GET_DEDUP(zio->io_bp_override));
 	dde = ddt_lookup(ddt, bp, B_FALSE);
 	if (dde == NULL) {
 		/* DDT size is over its quota so no new entries */
 		zp->zp_dedup = B_FALSE;
 		BP_SET_DEDUP(bp, B_FALSE);
 		if (zio->io_bp_override == NULL)
 			zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
 		/*
 		 * If we're using a weak checksum, upgrade to a strong checksum
 		 * and try again.  If we're already using a strong checksum,
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
 		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
 		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
 			zp->zp_dedup = B_FALSE;
 			BP_SET_DEDUP(bp, B_FALSE);
 		}
 		ASSERT(!BP_GET_DEDUP(bp));
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	/* Slot index and phys variant that serve this copies= setting. */
 	int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies);
 	ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);
 	ddt_univ_phys_t *ddp = dde->dde_phys;
 
 	/*
 	 * In the common cases, at this point we have a regular BP with no
 	 * allocated DVAs, and the corresponding DDT entry for its checksum.
 	 * Our goal is to fill the BP with enough DVAs to satisfy its copies=
 	 * requirement.
 	 *
 	 * One of three things needs to happen to fulfill this:
 	 *
 	 * - if the DDT entry has enough DVAs to satisfy the BP, we just copy
 	 *   them out of the entry and return;
 	 *
 	 * - if the DDT entry has no DVAs (i.e. it's brand new), then we have
 	 *   to issue the write as normal so that DVAs can be allocated and
 	 *   the data land on disk. We then copy the DVAs into the DDT entry
 	 *   on return.
 	 *
 	 * - if the DDT entry has some DVAs, but too few, we have to issue the
 	 *   write, adjusted to allocate fewer copies. When it returns, we
 	 *   add the new DVAs to the DDT entry, and update the BP to have the
 	 *   full amount it originally requested.
 	 *
 	 * In all cases, if there's already a writing IO in flight, we need to
 	 * defer the action until after the write is done. If our action is to
 	 * write, we need to adjust our request for additional DVAs to match
 	 * what will be in the DDT entry after it completes. In this way every
 	 * IO can be guaranteed to receive enough DVAs simply by joining the
 	 * end of the chain and letting the sequence play out.
 	 */
 
 	/*
 	 * Number of DVAs in the DDT entry. If the BP is encrypted we ignore
 	 * the third one as normal.
 	 */
 	int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp));
 	IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0);
 	boolean_t is_ganged = ddt_phys_is_gang(ddp, v);
 
 	/* Number of DVAs requested by the IO. */
 	uint8_t need_dvas = zp->zp_copies;
 	/* Number of DVAs in outstanding writes for this dde. */
 	uint8_t parent_dvas = 0;
 
 	/*
 	 * What we do next depends on whether or not there's IO outstanding that
 	 * will update this entry.
 	 */
 	if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) {
 		/*
 		 * No IO outstanding, so we only need to worry about ourselves.
 		 */
 
 		/*
 		 * Override BPs bring their own DVAs and their own problems.
 		 */
 		if (zio->io_bp_override) {
 			/*
 			 * For a brand-new entry, all the work has been done
 			 * for us, and we can just fill it out from the provided
 			 * block and leave.
 			 */
 			if (have_dvas == 0) {
 				ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
 				ASSERT(BP_EQUAL(bp, zio->io_bp_override));
 				ddt_phys_extend(ddp, v, bp);
 				ddt_phys_addref(ddp, v);
 				ddt_exit(ddt);
 				return (zio);
 			}
 
 			/*
 			 * If we already have this entry, then we want to treat
 			 * it like a regular write. To do this we just wipe
 			 * them out and proceed like a regular write.
 			 *
 			 * Even if there are some DVAs in the entry, we still
 			 * have to clear them out. We can't use them to fill
 			 * out the dedup entry, as they are all referenced
 			 * together by a bp already on disk, and will be freed
 			 * as a group.
 			 */
 			BP_ZERO_DVAS(bp);
 			BP_SET_BIRTH(bp, 0, 0);
 		}
 
 		/*
 		 * If there are enough DVAs in the entry to service our request,
 		 * then we can just use them as-is.
 		 */
 		if (have_dvas >= need_dvas) {
 			ddt_bp_fill(ddp, v, bp, txg);
 			ddt_phys_addref(ddp, v);
 			ddt_exit(ddt);
 			return (zio);
 		}
 
 		/*
 		 * Otherwise, we have to issue IO to fill the entry up to the
 		 * amount we need.
 		 */
 		need_dvas -= have_dvas;
 	} else {
 		/*
 		 * There's a write in-flight. If there's already enough DVAs on
 		 * the entry, then either there were already enough to start
 		 * with, or the in-flight IO is between READY and DONE, and so
 		 * has extended the entry with new DVAs. Either way, we don't
 		 * need to do anything, we can just slot in behind it.
 		 */
 
 		if (zio->io_bp_override) {
 			/*
 			 * If there's a write out, then we're soon going to
 			 * have our own copies of this block, so clear out the
 			 * override block and treat it as a regular dedup
 			 * write. See comment above.
 			 */
 			BP_ZERO_DVAS(bp);
 			BP_SET_BIRTH(bp, 0, 0);
 		}
 
 		if (have_dvas >= need_dvas) {
 			/*
 			 * A minor point: there might already be enough
 			 * committed DVAs in the entry to service our request,
 			 * but we don't know which are completed and which are
 			 * allocated but not yet written. In this case, should
 			 * the IO for the new DVAs fail, we will be on the end
 			 * of the IO chain and will also receive an error, even
 			 * though our request could have been serviced.
 			 *
 			 * This is an extremely rare case, as it requires the
 			 * original block to be copied with a request for a
 			 * larger number of DVAs, then copied again requesting
 			 * the same (or already fulfilled) number of DVAs while
 			 * the first request is active, and then that first
 			 * request errors. In return, the logic required to
 			 * catch and handle it is complex. For now, I'm just
 			 * not going to bother with it.
 			 */
 
 			/*
 			 * We always fill the bp here as we may have arrived
 			 * after the in-flight write has passed READY, and so
 			 * missed out.
 			 */
 			ddt_bp_fill(ddp, v, bp, txg);
 			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
 			ddt_exit(ddt);
 			return (zio);
 		}
 
 		/*
 		 * There's not enough in the entry yet, so we need to look at
 		 * the write in-flight and see how many DVAs it will have once
 		 * it completes.
 		 *
 		 * The in-flight write has potentially had its copies request
 		 * reduced (if we're filling out an existing entry), so we need
 		 * to reach in and get the original write to find out what it is
 		 * expecting.
 		 *
 		 * Note that the parent of the lead zio will always have the
 		 * highest zp_copies of any zio in the chain, because ones that
 		 * can be serviced without additional IO are always added to
 		 * the back of the chain.
 		 */
 		zio_link_t *zl = NULL;
 		zio_t *pio =
 		    zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl);
 		ASSERT(pio);
 		parent_dvas = pio->io_prop.zp_copies;
 
 		if (parent_dvas >= need_dvas) {
 			zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
 			ddt_exit(ddt);
 			return (zio);
 		}
 
 		/*
 		 * Still not enough, so we will need to issue to get the
 		 * shortfall.
 		 */
 		need_dvas -= parent_dvas;
 	}
 
 	/*
 	 * More DVAs are needed but the entry's variant is flagged as a gang
 	 * block: give up on dedup for this zio and write it as an ordinary
 	 * block instead.
 	 */
 	if (is_ganged) {
 		zp->zp_dedup = B_FALSE;
 		BP_SET_DEDUP(bp, B_FALSE);
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
 		return (zio);
 	}
 
 	/*
 	 * We need to write. We will create a new write with the copies
 	 * property adjusted to match the number of DVAs we need to
 	 * grow the DDT entry by to satisfy the request.
 	 */
 	zio_prop_t czp = *zp;
 	if (have_dvas > 0 || parent_dvas > 0) {
 		czp.zp_copies = need_dvas;
 		czp.zp_gang_copies = 0;
 	} else {
 		ASSERT3U(czp.zp_copies, ==, need_dvas);
 	}
 
 	zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
 	    zio->io_orig_size, zio->io_orig_size, &czp,
 	    zio_ddt_child_write_ready, NULL,
 	    zio_ddt_child_write_done, dde, zio->io_priority,
 	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 	/* Hand the child our current (already transformed) buffer. */
 	zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
 
 	/*
 	 * We are the new lead zio, because our parent has the highest
 	 * zp_copies that has been requested for this entry so far.
 	 */
 	ddt_alloc_entry_io(dde);
 	if (dde->dde_io->dde_lead_zio[p] == NULL) {
 		/*
 		 * First time out, take a copy of the stable entry to revert
 		 * to if there's an error (see zio_ddt_child_write_done())
 		 */
 		ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v);
 	} else {
 		/*
 		 * Make the existing chain our child, because it cannot
 		 * complete until we have.
 		 */
 		zio_add_child(cio, dde->dde_io->dde_lead_zio[p]);
 	}
 	dde->dde_io->dde_lead_zio[p] = cio;
 
 	ddt_exit(ddt);
 
 	/* Issue the child only after the DDT lock has been released. */
 	zio_nowait(cio);
 
 	return (zio);
 }
 
 static ddt_entry_t *freedde; /* for debugging */
 
 /*
  * Pipeline stage for freeing a dedup block: drop one reference on the
  * matching DDT entry, or fall back to a direct DVA free when the entry
  * is no longer in the table.
  */
 static zio_t *
 zio_ddt_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(zio->io_spa, bp);
 	ddt_entry_t *dde;
 
 	ASSERT(BP_GET_DEDUP(bp));
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
 	dde = ddt_lookup(ddt, bp, B_TRUE);
 	freedde = dde;
 	if (dde != NULL) {
 		ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
 
 		/* Only drop a reference if the bp matches a variant. */
 		if (v != DDT_PHYS_NONE)
 			ddt_phys_decref(dde->dde_phys, v);
 	}
 	ddt_exit(ddt);
 
 	/*
 	 * When no entry was found, it must have been pruned,
 	 * so we can free it now instead of decrementing the
 	 * refcount in the DDT.
 	 */
 	if (dde == NULL) {
 		BP_SET_DEDUP(bp, 0);
 		zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Allocate and free blocks
  * ==========================================================================
  */
 
 /*
  * Pick the next queued zio that may proceed to allocation, if any.
  * Must be called with mca_lock held.  Returns NULL when the queue is
  * empty or the throttle reservation could not be placed.  *more is set
  * to B_FALSE whenever the queue is (or becomes) empty.
  */
 static zio_t *
 zio_io_to_allocate(metaslab_class_allocator_t *mca, boolean_t *more)
 {
 	ASSERT(MUTEX_HELD(&mca->mca_lock));
 
 	zio_t *head = avl_first(&mca->mca_tree);
 	if (head == NULL) {
 		*more = B_FALSE;
 		return (NULL);
 	}
 
 	ASSERT(IO_IS_ALLOCATING(head));
 	ASSERT(ZIO_HAS_ALLOCATOR(head));
 
 	/*
 	 * Try to place a reservation for this zio. If we're unable to
 	 * reserve then we throttle, leaving the zio on the queue.
 	 */
 	boolean_t reserved = metaslab_class_throttle_reserve(
 	    head->io_metaslab_class, head->io_prop.zp_copies, head,
 	    B_FALSE, more);
 	if (!reserved)
 		return (NULL);
 
 	avl_remove(&mca->mca_tree, head);
 	ASSERT3U(head->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
 
 	/* That was the last queued zio, so there is nothing more to do. */
 	if (avl_is_empty(&mca->mca_tree))
 		*more = B_FALSE;
 
 	return (head);
 }
 
 /*
  * Pipeline stage that queues an allocating write behind the allocation
  * throttle.  Returns the zio itself when throttling does not apply,
  * otherwise whichever queued zio (possibly none) may allocate next.
  */
 static zio_t *
 zio_dva_throttle(zio_t *zio)
 {
 	boolean_t more;
 
 	/*
 	 * If not already chosen, choose an appropriate allocation class.
 	 */
 	metaslab_class_t *mc = zio->io_metaslab_class;
 	if (mc == NULL)
 		mc = spa_preferred_class(zio->io_spa, zio);
 
 	/* These writes bypass the throttle and continue immediately. */
 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
 	    !mc->mc_alloc_throttle_enabled ||
 	    zio->io_child_type == ZIO_CHILD_GANG ||
 	    zio->io_flags & ZIO_FLAG_NODATA) {
 		return (zio);
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 	ASSERT3U(zio->io_queued_timestamp, >, 0);
 	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
 
 	/* The class is only recorded on the zio once it is throttled. */
 	zio->io_metaslab_class = mc;
 
 	/* Queue ourselves, then see which zio (if any) may go next. */
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator];
 	mutex_enter(&mca->mca_lock);
 	avl_add(&mca->mca_tree, zio);
 	zio_t *next = zio_io_to_allocate(mca, &more);
 	mutex_exit(&mca->mca_lock);
 
 	return (next);
 }
 
 /*
  * Dispatch queued zios of the given class/allocator to the issue taskq
  * for as long as zio_io_to_allocate() keeps handing them out and reports
  * that more may follow.
  */
 static void
 zio_allocate_dispatch(metaslab_class_t *mc, int allocator)
 {
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	for (;;) {
 		boolean_t more;
 		zio_t *zio;
 
 		/* The queue is only inspected under its lock. */
 		mutex_enter(&mca->mca_lock);
 		zio = zio_io_to_allocate(mca, &more);
 		mutex_exit(&mca->mca_lock);
 
 		if (zio == NULL)
 			return;
 
 		ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
 		ASSERT0(zio->io_error);
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
 
 		if (!more)
 			break;
 	}
 }
 
 /*
  * Pipeline stage that allocates DVAs for a write.
  *
  * Allocation is attempted in the zio's metaslab class first.  On ENOSPC
  * in a class other than the normal class the zio falls back to the normal
  * class; on ENOSPC there, blocks larger than spa_min_alloc are written as
  * gang blocks.  Any other failure is recorded in io_error.
  *
  * Returns the zio to continue the pipeline, or whatever
  * zio_write_gang_block() returns when ganging.
  */
 static zio_t *
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	metaslab_class_t *mc;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 	int flags = 0;
 
 	/* A zio that is not part of a gang tree is its own gang leader. */
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
 		zio->io_gang_leader = zio;
 	}
 	/*
 	 * Preallocated zios (asserted to be gang children) already carry
 	 * their DVAs and birth times in io_bp_orig: copy them into the live
 	 * bp and skip the allocator entirely.
 	 */
+	if (zio->io_flags & ZIO_FLAG_PREALLOCATED) {
+		ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_GANG);
+		memcpy(zio->io_bp->blk_dva, zio->io_bp_orig.blk_dva,
+		    3 * sizeof (dva_t));
+		BP_SET_BIRTH(zio->io_bp, BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig),
+		    BP_GET_PHYSICAL_BIRTH(&zio->io_bp_orig));
+		return (zio);
+	}
 
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	/* Translate zio properties into metaslab allocation flags. */
 	if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
 		flags |= METASLAB_GANG_CHILD;
 	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
 		flags |= METASLAB_ASYNC_ALLOC;
 
 	/*
 	 * If not already chosen, choose an appropriate allocation class.
 	 */
 	mc = zio->io_metaslab_class;
 	if (mc == NULL) {
 		mc = spa_preferred_class(spa, zio);
 		zio->io_metaslab_class = mc;
 	}
 	ZIOSTAT_BUMP(ziostat_total_allocations);
 
 again:
 	/*
 	 * Try allocating the block in the usual metaslab class.
 	 * If that's full, allocate it in the normal class.
 	 * If that's full, allocate as a gang block,
 	 * and if all are full, the allocation fails (which shouldn't happen).
 	 *
 	 * Note that we do not fall back on embedded slog (ZIL) space, to
 	 * preserve unfragmented slog space, which is critical for decent
 	 * sync write performance.  If a log allocation fails, we will fall
 	 * back to spa_sync() which is abysmal for performance.
 	 */
 	ASSERT(ZIO_HAS_ALLOCATOR(zio));
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
 	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
 	    &zio->io_alloc_list, zio->io_allocator, zio);
 
 	/*
 	 * Fallback to normal class when an alloc class is full
 	 */
 	if (error == ENOSPC && mc != spa_normal_class(spa)) {
 		/*
 		 * When the dedup or special class is spilling into the normal
 		 * class, there can still be significant space available due
 		 * to deferred frees that are in-flight.  We track the txg when
 		 * this occurred and back off adding new DDT entries for a few
 		 * txgs to allow the free blocks to be processed.
 		 */
 		if ((mc == spa_dedup_class(spa) || (spa_special_has_ddt(spa) &&
 		    mc == spa_special_class(spa))) &&
 		    spa->spa_dedup_class_full_txg != zio->io_txg) {
 			spa->spa_dedup_class_full_txg = zio->io_txg;
 			zfs_dbgmsg("%s[%d]: %s class spilling, req size %d, "
 			    "%llu allocated of %llu",
 			    spa_name(spa), (int)zio->io_txg,
 			    mc == spa_dedup_class(spa) ? "dedup" : "special",
 			    (int)zio->io_size,
 			    (u_longlong_t)metaslab_class_get_alloc(mc),
 			    (u_longlong_t)metaslab_class_get_space(mc));
 		}
 
 		/*
 		 * If we are holding old class reservation, drop it.
 		 * Dispatch the next ZIO(s) there if some are waiting.
 		 */
 		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			if (metaslab_class_throttle_unreserve(mc,
 			    zio->io_prop.zp_copies, zio)) {
 				zio_allocate_dispatch(zio->io_metaslab_class,
 				    zio->io_allocator);
 			}
 			zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
 		}
 
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying normal class: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		/* From here on the zio belongs to the normal class. */
 		zio->io_metaslab_class = mc = spa_normal_class(spa);
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 
 		/*
 		 * If normal class uses throttling, return to that pipeline
 		 * stage.  Otherwise just do another allocation attempt.
 		 */
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    mc->mc_alloc_throttle_enabled &&
 		    zio->io_child_type != ZIO_CHILD_GANG &&
 		    !(zio->io_flags & ZIO_FLAG_NODATA)) {
 			zio->io_stage = ZIO_STAGE_DVA_THROTTLE >> 1;
 			return (zio);
 		}
 		goto again;
 	}
 
 	/*
 	 * Out of space for a single allocation of this size: split the
 	 * block into a gang block, provided it is larger than the pool's
 	 * minimum allocation size.
 	 */
 	if (error == ENOSPC && zio->io_size > spa->spa_min_alloc) {
 		if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
 			zfs_dbgmsg("%s: metaslab allocation failure, "
 			    "trying ganging: zio %px, size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		ZIOSTAT_BUMP(ziostat_gang_writes);
 		if (flags & METASLAB_GANG_CHILD)
 			ZIOSTAT_BUMP(ziostat_gang_multilevel);
 		return (zio_write_gang_block(zio, mc));
 	}
 	/*
 	 * Any remaining failure is final.  ENOSPC is only logged when
 	 * allocation debugging is enabled; other errors are always logged.
 	 */
 	if (error != 0) {
 		if (error != ENOSPC ||
 		    (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
 			zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
 			    "size %llu, error %d",
 			    spa_name(spa), zio, (u_longlong_t)zio->io_size,
 			    error);
 		}
 		zio->io_error = error;
 	}
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that frees the DVAs of the zio's bp by handing the bp
  * and the zio's txg to metaslab_free().  Always returns zio.
  * NOTE(review): the final B_FALSE presumably selects a deferred rather
  * than immediate free -- confirm against metaslab_free().
  */
 static zio_t *
 zio_dva_free(zio_t *zio)
 {
 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
 
 	return (zio);
 }
 
 /*
  * Pipeline stage that claims the DVAs of the zio's bp through
  * metaslab_claim(), recording any failure in io_error.
  */
 static zio_t *
 zio_dva_claim(zio_t *zio)
 {
 	int err = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
 	/* Success leaves io_error untouched. */
 	if (err != 0)
 		zio->io_error = err;
 
 	return (zio);
 }
 
 /*
  * Give back a block that was just allocated.  zio_done() uses this when
  * an I/O has failed.  Plain blocks are freed directly; for gang blocks
  * (gn != NULL) every child slot of the gang header is unwound
  * recursively as well.
  */
 static void
 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
 {
 	ASSERT(BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg);
 	ASSERT(zio->io_bp_override == NULL);
 
 	/* Holes own no space, so there is nothing to return for them. */
 	if (!BP_IS_HOLE(bp)) {
 		metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
 		    B_TRUE);
 	}
 
 	/* Not a gang block: we're done. */
 	if (gn == NULL)
 		return;
 
 	for (int slot = 0; slot < SPA_GBH_NBLKPTRS; slot++) {
 		zio_dva_unallocate(zio, gn->gn_child[slot],
 		    &gn->gn_gbh->zg_blkptr[slot]);
 	}
 }
 
 /*
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  *
  * Allocation is attempted from the log class, then the embedded log class,
  * then the normal class.  *slog is set to B_TRUE only when the first
  * attempt (the log class) succeeded.  On success new_bp is filled in as an
  * uncompressed, non-dedup intent log block of the given size, including
  * encryption parameters when the objset is encrypted.
  */
 int
 zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
     uint64_t size, boolean_t *slog)
 {
 	/* Placeholder value; overwritten by the first metaslab_alloc(). */
 	int error = 1;
 	zio_alloc_list_t io_alloc_list;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
 
 	/*
 	 * Block pointer fields are useful to metaslabs for stats and debugging.
 	 * Fill in the obvious ones before calling into metaslab_alloc().
 	 */
 	BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 	BP_SET_PSIZE(new_bp, size);
 	BP_SET_LEVEL(new_bp, 0);
 
 	/*
 	 * When allocating a zil block, we don't have information about
 	 * the final destination of the block except the objset it's part
 	 * of, so we just hash the objset ID to pick the allocator to get
 	 * some parallelism.
 	 */
 	int flags = METASLAB_ZIL;
 	int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object)
 	    % spa->spa_alloc_count;
 	ZIOSTAT_BUMP(ziostat_total_allocations);
 	/* First choice: the log class. */
 	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
 	    txg, NULL, flags, &io_alloc_list, allocator, NULL);
 	*slog = (error == 0);
 	/* Second choice: the embedded log class. */
 	if (error != 0) {
 		error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
 		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
 		    NULL);
 	}
 	/* Last resort: the normal class, counted as a class fallback. */
 	if (error != 0) {
 		ZIOSTAT_BUMP(ziostat_alloc_class_fallbacks);
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, NULL, flags, &io_alloc_list, allocator,
 		    NULL);
 	}
 	metaslab_trace_fini(&io_alloc_list);
 
 	if (error == 0) {
 		/* Fill in the rest of the block pointer. */
 		BP_SET_LSIZE(new_bp, size);
 		BP_SET_PSIZE(new_bp, size);
 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
 		BP_SET_CHECKSUM(new_bp,
 		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
 		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
 		BP_SET_LEVEL(new_bp, 0);
 		BP_SET_DEDUP(new_bp, 0);
 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
 
 		/*
 		 * encrypted blocks will require an IV and salt. We generate
 		 * these now since we will not be rewriting the bp at
 		 * rewrite time.
 		 */
 		if (os->os_encrypted) {
 			uint8_t iv[ZIO_DATA_IV_LEN];
 			uint8_t salt[ZIO_DATA_SALT_LEN];
 
 			BP_SET_CRYPT(new_bp, B_TRUE);
 			VERIFY0(spa_crypt_get_salt(spa,
 			    dmu_objset_id(os), salt));
 			VERIFY0(zio_crypt_generate_iv(iv));
 
 			zio_crypt_encode_params_bp(new_bp, salt, iv);
 		}
 	} else {
 		zfs_dbgmsg("%s: zil block allocation failure: "
 		    "size %llu, error %d", spa_name(spa), (u_longlong_t)size,
 		    error);
 	}
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * Read and write to physical devices
  * ==========================================================================
  */
 
 /*
  * Issue an I/O to the underlying vdev. Typically the issue pipeline
  * stops after this stage and will resume upon I/O completion.
  * However, there are instances where the vdev layer may need to
  * continue the pipeline when an I/O was not issued. Since the I/O
  * that was sent to the vdev layer might be different than the one
  * currently active in the pipeline (see vdev_queue_io()), we explicitly
  * force the underlying vdev layers to call either zio_execute() or
  * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
  */
 static zio_t *
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
 
 	zio->io_delay = 0;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
 
 	if (vd == NULL) {
 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
 
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
 		vdev_mirror_ops.vdev_op_io_start(zio);
 		return (NULL);
 	}
 
 	ASSERT3P(zio->io_logical, !=, zio);
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		ASSERT(spa->spa_trust_config);
 
 		/*
 		 * Note: the code can handle other kinds of writes,
 		 * but we don't expect them.
 		 */
 		if (zio->io_vd->vdev_noalloc) {
 			ASSERT(zio->io_flags &
 			    (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
 			    ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
 		}
 	}
 
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
 	    P2PHASE(zio->io_size, align) != 0) {
 		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
 		abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
 		ASSERT(vd == vd->vdev_top);
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			abd_copy(abuf, zio->io_abd, zio->io_size);
 			abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
 		}
 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
 	}
 
 	/*
 	 * If this is not a physical io, make sure that it is properly aligned
 	 * before proceeding.
 	 */
 	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
 		ASSERT0(P2PHASE(zio->io_offset, align));
 		ASSERT0(P2PHASE(zio->io_size, align));
 	} else {
 		/*
 		 * For physical writes, we allow 512b aligned writes and assume
 		 * the device will perform a read-modify-write as necessary.
 		 */
 		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
 		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
 	}
 
 	VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
 	 * that is, we're just resilvering what we expect to resilver --
 	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
 	 * This prevents spurious resilvering.
 	 *
 	 * There are a few ways that we can end up creating these spurious
 	 * resilver i/os:
 	 *
 	 * 1. A resilver i/o will be issued if any DVA in the BP has a
 	 * dirty DTL.  The mirror code will issue resilver writes to
 	 * each DVA, including the one(s) that are not on vdevs with dirty
 	 * DTLs.
 	 *
 	 * 2. With nested replication, which happens when we have a
 	 * "replacing" or "spare" vdev that's a child of a mirror or raidz.
 	 * For example, given mirror(replacing(A+B), C), it's likely that
 	 * only A is out of date (it's the new device). In this case, we'll
 	 * read from C, then use the data to resilver A+B -- but we don't
 	 * actually want to resilver B, just A. The top-level mirror has no
 	 * way to know this, so instead we just discard unnecessary repairs
 	 * as we work our way down the vdev tree.
 	 *
 	 * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
 	 * The same logic applies to any form of nested replication: ditto
 	 * + mirror, RAID-Z + replacing, etc.
 	 *
 	 * However, indirect vdevs point off to other vdevs which may have
 	 * DTL's, so we never bypass them.  The child i/os on concrete vdevs
 	 * will be properly bypassed instead.
 	 *
 	 * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
 	 * a dRAID spare vdev. For example, when a dRAID spare is first
 	 * used, its spare blocks need to be written to but the leaf vdev's
 	 * of such blocks can have empty DTL_PARTIAL.
 	 *
 	 * There seemed no clean way to allow such writes while bypassing
 	 * spurious ones. At this point, just avoid all bypassing for dRAID
 	 * for correctness.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
 	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    vd->vdev_ops != &vdev_indirect_ops &&
 	    vd->vdev_top->vdev_ops != &vdev_draid_ops &&
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		zio_vdev_io_bypass(zio);
 		return (zio);
 	}
 
 	/*
 	 * Select the next best leaf I/O to process.  Distributed spares are
 	 * excluded since they dispatch the I/O directly to a leaf vdev after
 	 * applying the dRAID mapping.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops &&
 	    (zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM)) {
 
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (NULL);
 
 		if (!vdev_accessible(vd, zio)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return (NULL);
 		}
 		zio->io_delay = gethrtime();
 
 		if (zio_handle_device_injection(vd, zio, ENOSYS) != 0) {
 			/*
 			 * "no-op" injections return success, but do no actual
 			 * work. Just return it.
 			 */
 			zio_delay_interrupt(zio);
 			return (NULL);
 		}
 	}
 
 	vd->vdev_ops->vdev_op_io_start(zio);
 	return (NULL);
 }
 
 /*
  * Second half of the vdev I/O stages.  After all vdev children are done,
  * convert io_delay from a start timestamp into an elapsed time, run the
  * leaf-only bookkeeping (vdev_queue_io_done() and fault injection), and
  * invoke the vdev's io_done callback.  A failure on a leaf that
  * vdev_accessible() still considers usable is unexpected, so the vdev is
  * probed afterwards unless its removal has already been requested.
  *
  * Returns NULL while children are still outstanding, otherwise the zio.
  */
 static zio_t *
 zio_vdev_io_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_ops_t *ops = (vd != NULL) ? vd->vdev_ops : &vdev_mirror_ops;
 	boolean_t want_probe = B_FALSE;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE))
 		return (NULL);
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_FLUSH ||
 	    zio->io_type == ZIO_TYPE_TRIM);
 
 	/* A non-zero io_delay holds the start time; make it a duration. */
 	if (zio->io_delay != 0)
 		zio->io_delay = gethrtime() - zio->io_delay;
 
 	/* Leaf-only processing; distributed spares are excluded. */
 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_ops != &vdev_draid_spare_ops) {
 		if (zio->io_type != ZIO_TYPE_FLUSH)
 			vdev_queue_io_done(zio);
 
 		/* Injection may only replace a success, never an error. */
 		if (zio_injection_enabled && zio->io_error == 0) {
 			zio->io_error = zio_handle_device_injections(vd, zio,
 			    EIO, EILSEQ);
 		}
 
 		if (zio_injection_enabled && zio->io_error == 0) {
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 		}
 
 		/*
 		 * Read/write failures: report ENXIO if the device is not
 		 * accessible, otherwise remember to probe it below.
 		 */
 		if (zio->io_error != 0 && zio->io_type != ZIO_TYPE_FLUSH &&
 		    zio->io_type != ZIO_TYPE_TRIM) {
 			if (vdev_accessible(vd, zio))
 				want_probe = B_TRUE;
 			else
 				zio->io_error = SET_ERROR(ENXIO);
 		}
 	}
 
 	ops->vdev_op_io_done(zio);
 
 	/* want_probe is only ever set when vd != NULL. */
 	if (want_probe && !vd->vdev_remove_wanted)
 		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (zio);
 }
 
 /*
  * Change the priority of a zio that is already in flight, together with
  * all of its children.  The ARC uses this to upgrade a block that is
  * currently queued as a scrub or async read when a demand read for the
  * same block arrives, so that the demand read does not end up waiting
  * behind the lower priority I/O.
  */
 void
 zio_change_priority(zio_t *pio, zio_priority_t priority)
 {
 	zio_link_t *link = NULL;
 	zio_t *child, *next_child;
 
 	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/* Leaf vdev zios go through the vdev queue; others are set directly. */
 	if (pio->io_vd == NULL || !pio->io_vd->vdev_ops->vdev_op_leaf)
 		pio->io_priority = priority;
 	else
 		vdev_queue_change_io_priority(pio, priority);
 
 	/* Recurse into every child while holding the parent's io_lock. */
 	mutex_enter(&pio->io_lock);
 	child = zio_walk_children(pio, &link);
 	while (child != NULL) {
 		next_child = zio_walk_children(pio, &link);
 		zio_change_priority(child, priority);
 		child = next_child;
 	}
 	mutex_exit(&pio->io_lock);
 }
 
 /*
  * For non-raidz ZIOs, we can just copy aside the bad data read from the
  * disk, and use that to finish the checksum ereport later.
  *
  * This is the "finish" half of that scheme: it is installed as
  * zcr->zcr_finish by zio_vsd_default_cksum_report() and hands the copy
  * saved in zcr->zcr_cbdata, together with the caller's good_buf, to
  * zfs_ereport_finish_checksum().
  */
 static void
 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
     const abd_t *good_buf)
 {
 	/* no processing needed */
 	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
 }
 
 /*
  * Default checksum-report setup: snapshot the zio's current buffer so the
  * ereport can be completed later.  A same-type copy of zio->io_abd
  * (io_size bytes) is stored in the report along with its size, and
  * zio_vsd_default_cksum_finish() / zio_abd_free are registered as the
  * report's finish and free callbacks.
  */
 void
 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr)
 {
 	const uint64_t nbytes = zio->io_size;
 	abd_t *snapshot = abd_alloc_sametype(zio->io_abd, nbytes);
 
 	abd_copy(snapshot, zio->io_abd, nbytes);
 
 	zcr->zcr_finish = zio_vsd_default_cksum_finish;
 	zcr->zcr_free = zio_abd_free;
 	zcr->zcr_cbdata = snapshot;
 	zcr->zcr_cbinfo = nbytes;
 }
 
 /*
  * Pipeline stage that evaluates the outcome of the vdev I/O stages once all
  * vdev children are done.  It drops the SCL_ZIO config hold for vdev-less
  * zios (unless the zio is a config writer), frees any vdev-specific data,
  * applies fault injection, reissues a failed vdev-less zio once, and
  * otherwise records what the error means for the vdev.
  *
  * Returns NULL when the zio must not advance (still waiting on children,
  * or redispatched for a retry); returns the zio otherwise.
  */
 static zio_t *
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE))
 		return (NULL);
 
 	if (vd == NULL && (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) == 0)
 		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
 
 	if (zio->io_vsd != NULL) {
 		zio->io_vsd_ops->vsd_free(zio);
 		zio->io_vsd = NULL;
 	}
 
 	/*
 	 * A Direct I/O operation that hit a checksum verify error must not
 	 * be issued again; finish it via the interlock pipeline.
 	 */
 	if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) {
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 			ASSERT3U(zio->io_error, ==, EIO);
 		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 		return (zio);
 	}
 
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
 	/*
 	 * A failed vdev-less zio gets one retry, unless retries are
 	 * disallowed or this already was the retry.
 	 *
 	 * The retry cuts in line in the issue queue, since we don't want
 	 * compression/checksumming/etc. work to hold up our (cheap) reissue.
 	 */
 	if (zio->io_error != 0 && vd == NULL &&
 	    (zio->io_flags &
 	    (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY)) == 0) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
 		zio->io_error = 0;
 		zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
 		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
 		    zio_requeue_io_start_cut_in_line);
 		return (NULL);
 	}
 
 	if (vd != NULL) {
 		if (vd->vdev_ops->vdev_op_leaf) {
 			/*
 			 * An error on a leaf device becomes ENXIO if the
 			 * device is not accessible at all.
 			 */
 			if (zio->io_error != 0 && !vdev_accessible(vd, zio))
 				zio->io_error = SET_ERROR(ENXIO);
 		} else if (zio->io_error == ENXIO &&
 		    zio->io_type == ZIO_TYPE_WRITE) {
 			/*
 			 * We can't write to an interior vdev (mirror or
 			 * RAID-Z): set vdev_cant_write so that we stop
 			 * trying to allocate from it.
 			 */
 			vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting "
 			    "cant_write=TRUE due to write failure with ENXIO",
 			    zio);
 			vd->vdev_cant_write = B_TRUE;
 		}
 
 		/*
 		 * A cache flush that returns ENOTSUP will never succeed in
 		 * the future either.  Remember that in a persistent flag so
 		 * we don't bother again, and treat this flush as a success.
 		 */
 		if (zio->io_error == ENOTSUP &&
 		    zio->io_type == ZIO_TYPE_FLUSH) {
 			vd->vdev_nowritecache = B_TRUE;
 			zio->io_error = 0;
 		}
 	}
 
 	if (zio->io_error != 0)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 	return (zio);
 }
 
 /*
  * Step a zio that is currently in ZIO_STAGE_VDEV_IO_START back by one
  * stage bit, presumably so that the pipeline's next step is VDEV_IO_START
  * again.  The zio must not have an error recorded.
  */
 void
 zio_vdev_io_reissue(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Step a zio that is currently in ZIO_STAGE_VDEV_IO_DONE back by one
  * stage bit, presumably so that the pipeline's next step is VDEV_IO_DONE
  * again.
  */
 void
 zio_vdev_io_redone(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
 
 	zio->io_stage = zio->io_stage >> 1;
 }
 
 /*
  * Mark a zio that is in ZIO_STAGE_VDEV_IO_START as bypassed: flag it with
  * ZIO_FLAG_IO_BYPASS and position its stage one bit before
  * ZIO_STAGE_VDEV_IO_ASSESS.  The zio must not have an error recorded.
  */
 void
 zio_vdev_io_bypass(zio_t *zio)
 {
 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
 	ASSERT(zio->io_error == 0);
 
 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
 	zio->io_flags = zio->io_flags | ZIO_FLAG_IO_BYPASS;
 }
 
 /*
  * ==========================================================================
  * Encrypt and store encryption parameters
  * ==========================================================================
  */
 
 
 /*
  * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
  * managing the storage of encryption parameters and passing them to the
  * lower-level encryption functions.
  *
  * Depending on the block, this stage does one of the following and then
  * returns the zio so the pipeline can continue:
  *  - nothing, for gang children and for non-allocating writes of anything
  *    other than ZIL blocks;
  *  - clears the crypt bit when neither the write properties nor the bp
  *    ask for encryption;
  *  - raw writes (ZIO_FLAG_RAW_ENCRYPT): stores the caller-provided
  *    byteorder, MAC, salt and IV in the bp without encrypting;
  *  - indirect blocks, objset blocks and unencrypted object types: only
  *    computes MACs;
  *  - everything else: encrypts io_abd into a newly allocated buffer and
  *    pushes that buffer as a transform (unless no_crypt is reported).
  */
 static zio_t *
 zio_encrypt(zio_t *zio)
 {
 	zio_prop_t *zp = &zio->io_prop;
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	uint64_t psize = BP_GET_PSIZE(bp);
 	uint64_t dsobj = zio->io_bookmark.zb_objset;
 	dmu_object_type_t ot = BP_GET_TYPE(bp);
 	void *enc_buf = NULL;
 	abd_t *eabd = NULL;
 	uint8_t salt[ZIO_DATA_SALT_LEN];
 	uint8_t iv[ZIO_DATA_IV_LEN];
 	uint8_t mac[ZIO_DATA_MAC_LEN];
 	boolean_t no_crypt = B_FALSE;
 
 	/* the root zio already encrypted the data */
 	if (zio->io_child_type == ZIO_CHILD_GANG)
 		return (zio);
 
 	/* only ZIL blocks are re-encrypted on rewrite */
 	if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
 		return (zio);
 
 	/* neither the properties nor the bp request encryption */
 	if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
 		BP_SET_CRYPT(bp, B_FALSE);
 		return (zio);
 	}
 
 	/* if we are doing raw encryption set the provided encryption params */
 	if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
 		ASSERT0(BP_GET_LEVEL(bp));
 		BP_SET_CRYPT(bp, B_TRUE);
 		BP_SET_BYTEORDER(bp, zp->zp_byteorder);
 		if (ot != DMU_OT_OBJSET)
 			zio_crypt_encode_mac_bp(bp, zp->zp_mac);
 
 		/* dnode blocks must be written out in the provided byteorder */
 		if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
 		    ot == DMU_OT_DNODE) {
 			void *bswap_buf = zio_buf_alloc(psize);
 			abd_t *babd = abd_get_from_buf(bswap_buf, psize);
 
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
 			dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
 			    psize);
 
 			/* babd takes over bswap_buf, then becomes the data */
 			abd_take_ownership_of_buf(babd, B_TRUE);
 			zio_push_transform(zio, babd, psize, psize, NULL);
 		}
 
 		if (DMU_OT_IS_ENCRYPTED(ot))
 			zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
 		return (zio);
 	}
 
 	/* indirect blocks only maintain a cksum of the lower level MACs */
 	if (BP_GET_LEVEL(bp) > 0) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
 		    zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
 		    mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Objset blocks are a special case since they have 2 256-bit MACs
 	 * embedded within them.
 	 */
 	if (ot == DMU_OT_OBJSET) {
 		ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
 		ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
 		return (zio);
 	}
 
 	/* unencrypted object types are only authenticated with a MAC */
 	if (!DMU_OT_IS_ENCRYPTED(ot)) {
 		BP_SET_CRYPT(bp, B_TRUE);
 		VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
 		    zio->io_abd, psize, mac));
 		zio_crypt_encode_mac_bp(bp, mac);
 		return (zio);
 	}
 
 	/*
 	 * Later passes of sync-to-convergence may decide to rewrite data
 	 * in place to avoid more disk reallocations. This presents a problem
 	 * for encryption because this constitutes rewriting the new data with
 	 * the same encryption key and IV. However, this only applies to blocks
 	 * in the MOS (particularly the spacemaps) and we do not encrypt the
 	 * MOS. We assert that the zio is allocating or an intent log write
 	 * to enforce this.
 	 */
 	ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
 	ASSERT3U(psize, !=, 0);
 
 	/* destination for the ciphertext; eabd takes ownership of enc_buf */
 	enc_buf = zio_buf_alloc(psize);
 	eabd = abd_get_from_buf(enc_buf, psize);
 	abd_take_ownership_of_buf(eabd, B_TRUE);
 
 	/*
 	 * For an explanation of what encryption parameters are stored
 	 * where, see the block comment in zio_crypt.c.
 	 *
 	 * ZIL blocks take their salt and IV from the bp.  For all other
 	 * blocks salt and iv are not initialized here; they are passed to
 	 * spa_do_crypt_abd() and encoded into the bp afterwards, so
 	 * presumably that call fills them in.
 	 */
 	if (ot == DMU_OT_INTENT_LOG) {
 		zio_crypt_decode_params_bp(bp, salt, iv);
 	} else {
 		BP_SET_CRYPT(bp, B_TRUE);
 	}
 
 	/* Perform the encryption. This should not fail */
 	VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
 	    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 	    salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
 
 	/* encode encryption metadata into the bp */
 	if (ot == DMU_OT_INTENT_LOG) {
 		/*
 		 * ZIL blocks store the MAC in the embedded checksum, so the
 		 * transform must always be applied.
 		 */
 		zio_crypt_encode_mac_zil(enc_buf, mac);
 		zio_push_transform(zio, eabd, psize, psize, NULL);
 	} else {
 		/*
 		 * NOTE(review): the crypt bit was already set for non-ZIL
 		 * blocks just before the encryption call above, so this
 		 * second BP_SET_CRYPT looks redundant.
 		 */
 		BP_SET_CRYPT(bp, B_TRUE);
 		zio_crypt_encode_params_bp(bp, salt, iv);
 		zio_crypt_encode_mac_bp(bp, mac);
 
 		/*
 		 * When no_crypt is reported the encrypted copy is not used:
 		 * it is freed and io_abd stays in place.  This is only
 		 * expected for dnode blocks.
 		 */
 		if (no_crypt) {
 			ASSERT3U(ot, ==, DMU_OT_DNODE);
 			abd_free(eabd);
 		} else {
 			zio_push_transform(zio, eabd, psize, psize, NULL);
 		}
 	}
 
 	return (zio);
 }
 
 /*
  * ==========================================================================
  * Generate and verify checksums
  * ==========================================================================
  */
 /*
  * Pick the checksum algorithm that applies to this zio and compute the
  * checksum over io_abd.  With a bp, gang children of a gang block use the
  * gang header checksum and everything else uses the bp's checksum.
  * Without a bp, nothing is computed when the checksum property is off.
  */
 static zio_t *
 zio_checksum_generate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	enum zio_checksum checksum;
 
 	if (bp != NULL) {
 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
 			ASSERT(!IO_IS_ALLOCATING(zio));
 			checksum = ZIO_CHECKSUM_GANG_HEADER;
 		} else {
 			checksum = BP_GET_CHECKSUM(bp);
 		}
 	} else {
 		/*
 		 * No bp: this is zio_write_phys().
 		 * We're either generating a label checksum, or none at all.
 		 */
 		checksum = zio->io_prop.zp_checksum;
 
 		if (checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
 	}
 
 	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
 
 	return (zio);
 }
 
 /*
  * Verify the checksum of this zio's data.  I/Os without a bp are checked
  * only when a label checksum was requested.  On failure the error is
  * stored in io_error; a non-speculative ECKSUM is additionally reported,
  * either as a Direct I/O verify error or as a regular checksum error
  * counted against the vdev.
  */
 static zio_t *
 zio_checksum_verify(zio_t *zio)
 {
 	zio_bad_cksum_t info;
 	blkptr_t *bp = zio->io_bp;
 	int error;
 
 	ASSERT(zio->io_vd != NULL);
 
 	/*
 	 * No bp means this is zio_read_phys(): we're either verifying a
 	 * label checksum, or nothing at all.
 	 */
 	if (bp == NULL) {
 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
 			return (zio);
 
 		ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
 	}
 
 	ASSERT0(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 	IMPLY(zio->io_flags & ZIO_FLAG_DIO_READ,
 	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE));
 
 	error = zio_checksum_error(zio, &info);
 	if (error == 0)
 		return (zio);
 
 	zio->io_error = error;
 
 	/* Only non-speculative checksum mismatches are reported. */
 	if (error != ECKSUM || (zio->io_flags & ZIO_FLAG_SPECULATIVE))
 		return (zio);
 
 	if (zio->io_flags & ZIO_FLAG_DIO_READ) {
 		zio_t *pio;
 
 		zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 		pio = zio_unique_parent(zio);
 		/*
 		 * Any Direct I/O read that has a checksum error must be
 		 * treated as suspicious, as the contents of the buffer
 		 * could be getting manipulated while the I/O is taking
 		 * place.
 		 *
 		 * The checksum verify error will only be reported here for
 		 * disk and file VDEV's and will be reported on those that
 		 * the failure occurred on. Other types of VDEV's report the
 		 * verify failure in their own code paths.
 		 */
 		if (pio->io_child_type == ZIO_CHILD_LOGICAL)
 			zio_dio_chksum_verify_error_report(zio);
 	} else {
 		vdev_t *vd = zio->io_vd;
 
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_checksum_errors++;
 		mutex_exit(&vd->vdev_stat_lock);
 		(void) zfs_ereport_start_checksum(zio->io_spa, vd,
 		    &zio->io_bookmark, zio, zio->io_offset, zio->io_size,
 		    &info);
 	}
 
 	return (zio);
 }
 
 /*
  * Checksum verification for the vdev child of a Direct I/O write.  The
  * check runs only when zfs_vdev_direct_write_verify is non-zero and the
  * zio has no error yet.  A checksum error is stored in io_error; ECKSUM
  * additionally flags the zio with ZIO_FLAG_DIO_CHKSUM_ERR and reports it.
  */
 static zio_t *
 zio_dio_checksum_verify(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	int error;
 
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE);
 	ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL);
 
 	if (zfs_vdev_direct_write_verify != 0 && zio->io_error == 0) {
 		error = zio_checksum_error(zio, NULL);
 		if (error != 0) {
 			zio->io_error = error;
 			if (error == ECKSUM) {
 				zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR;
 				zio_dio_chksum_verify_error_report(zio);
 			}
 		}
 	}
 
 	return (zio);
 }
 
 
 /*
  * Remove ZIO_STAGE_CHECKSUM_VERIFY from the zio's pipeline.  RAID-Z calls
  * this to ensure we don't compute the checksum twice.
  */
 void
 zio_checksum_verified(zio_t *zio)
 {
 	zio->io_pipeline = zio->io_pipeline & ~ZIO_STAGE_CHECKSUM_VERIFY;
 }
 
 /*
  * Report a Direct I/O checksum verify error and create the ZED event.
  *
  * Logical zios are ignored.  For any other zio the vdev's
  * vs_dio_verify_errors counter is bumped and an ereport is posted:
  * dio_verify_wr for writes (whose error is first converted to EIO),
  * dio_verify_rd otherwise.
  */
 void
 zio_dio_chksum_verify_error_report(zio_t *zio)
 {
 	vdev_t *vd;
 
 	ASSERT(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR);
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 		return;
 
 	vd = zio->io_vd;
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_dio_verify_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (zio->io_type != ZIO_TYPE_WRITE) {
 		/* Report dio_verify_rd ZED event. */
 		(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_RD,
 		    zio->io_spa, vd, &zio->io_bookmark, zio, 0);
 		return;
 	}
 
 	/* Convert checksum error for writes into EIO. */
 	zio->io_error = SET_ERROR(EIO);
 
 	/* Report dio_verify_wr ZED event. */
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_DIO_VERIFY_WR,
 	    zio->io_spa, vd, &zio->io_bookmark, zio, 0);
 }
 
 /*
  * ==========================================================================
  * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
  * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
  *
  * zio_worst_error() returns whichever of e1 and e2 ranks higher; when both
  * have the same rank, e2 is returned.
  * ==========================================================================
  */
 int
 zio_worst_error(int e1, int e2)
 {
 	static const int ranked[] = { 0, ENXIO, ECKSUM, EIO };
 	const int nranked = (int)(sizeof (ranked) / sizeof (ranked[0]));
 	/* Errors that are not in the table rank past its end. */
 	int rank1 = nranked;
 	int rank2 = nranked;
 
 	/* Walk the table backwards so the lowest matching index wins. */
 	for (int i = nranked - 1; i >= 0; i--) {
 		if (ranked[i] == e1)
 			rank1 = i;
 		if (ranked[i] == e2)
 			rank2 = i;
 	}
 
 	return (rank1 > rank2 ? e1 : e2);
 }
 
 /*
  * ==========================================================================
  * I/O completion
  * ==========================================================================
  */
 /*
  * Pipeline stage that marks the zio as "ready" and tells its parents.
  *
  * After all logical, gang and DDT children are ready, this invokes the
  * zio's io_ready callback (if any), switches a failed zio to the interlock
  * pipeline (releasing its allocation throttle reservation when
  * ZIO_FLAG_IO_ALLOCATING is set), sets io_state[ZIO_WAIT_READY] and
  * notifies every parent that was on the parent list at that point.
  *
  * Returns NULL while children are still outstanding, otherwise the zio.
  */
 static zio_t *
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
 	    ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
 		return (NULL);
 	}
 
 	/* Run the caller-supplied ready callback, if one was registered. */
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
 		ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
 		    BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
 	}
 
 #ifdef ZFS_DEBUG
 	/* Debug builds keep a snapshot of the bp in io_bp_copy. */
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 #endif
 
 	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
 		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
 			ASSERT(IO_IS_ALLOCATING(zio));
 			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
 			ASSERT(zio->io_metaslab_class != NULL);
 			ASSERT(ZIO_HAS_ALLOCATOR(zio));
 
 			/*
 			 * We were unable to allocate anything, unreserve and
 			 * issue the next I/O to allocate.
 			 */
 			if (metaslab_class_throttle_unreserve(
 			    zio->io_metaslab_class, zio->io_prop.zp_copies,
 			    zio)) {
 				zio_allocate_dispatch(zio->io_metaslab_class,
 				    zio->io_allocator);
 			}
 		}
 	}
 
 	/*
 	 * Publish the ready state and fetch the first parent in the same
 	 * io_lock critical section.
 	 */
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
 	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * As we notify zio's parents, new parents could be added.
 	 * New parents go to the head of zio's io_parent_list, however,
 	 * so we will (correctly) not notify them.  The remainder of zio's
 	 * io_parent_list, from 'pio_next' onward, cannot change because
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
 	}
 
 	/*
 	 * NODATA zios: a gang bp drops the NODATA flag, anything else has
 	 * the vdev I/O stages removed from its pipeline.
 	 */
 	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		if (bp != NULL && BP_IS_GANG(bp)) {
 			zio->io_flags &= ~ZIO_FLAG_NODATA;
 		} else {
 			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 		}
 	}
 
 	/* Injection hook; only for zios of the currently syncing txg. */
 	if (zio_injection_enabled &&
 	    zio->io_spa->spa_syncing_txg == zio->io_txg)
 		zio_handle_ignored_writes(zio);
 
 	return (zio);
 }
 
 /*
  * Update the allocation throttle accounting for a completed vdev child of
  * an allocating async write: drop the metaslab group's allocation count
  * for the child's top-level vdev, release one reservation from the
  * metaslab class, and dispatch the next allocation if that release says
  * so.
  */
 static void
 zio_dva_throttle_done(zio_t *zio)
 {
 	zio_t *pio = zio_unique_parent(zio);
 	vdev_t *vd = zio->io_vd;
 	const void *tag;
 
 	ASSERT3P(zio->io_bp, !=, NULL);
 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
 	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
 	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
 	ASSERT(vd != NULL);
 	ASSERT3P(vd, ==, vd->vdev_top);
 	ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
 	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
 	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
 
 	/*
 	 * Parents of gang children come in two flavors: ones that allocated
 	 * the gang header (they have ZIO_FLAG_IO_REWRITE set) and ones that
 	 * allocated the constituent blocks.  The former use their own
 	 * parent as the tag, the latter use themselves.
 	 */
 	if (pio->io_child_type == ZIO_CHILD_GANG &&
 	    (pio->io_flags & ZIO_FLAG_IO_REWRITE) != 0)
 		tag = zio_unique_parent(pio);
 	else
 		tag = pio;
 
 	ASSERT(IO_IS_ALLOCATING(pio) || (pio->io_child_type == ZIO_CHILD_GANG &&
 	    (pio->io_flags & ZIO_FLAG_IO_REWRITE)));
 	ASSERT(ZIO_HAS_ALLOCATOR(pio));
 	ASSERT3P(zio, !=, zio->io_logical);
 	ASSERT(zio->io_logical != NULL);
 	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
 	ASSERT(zio->io_metaslab_class != NULL);
 	ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
 
 	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id,
 	    pio->io_allocator, METASLAB_ASYNC_ALLOC, pio->io_size, tag);
 
 	if (metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
 	    pio)) {
 		zio_allocate_dispatch(zio->io_metaslab_class,
 		    pio->io_allocator);
 	}
 }
 
 static zio_t *
 zio_done(zio_t *zio)
 {
 	/*
 	 * Always attempt to keep stack usage minimal here since
 	 * we can be called recursively up to 19 levels deep.
 	 */
 	const uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
 	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
 	 * wait for them and then repeat this pipeline stage.
 	 */
 	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
 		return (NULL);
 	}
 
 	/*
 	 * If the allocation throttle is enabled, then update the accounting.
 	 * We only track child I/Os that are part of an allocating async
 	 * write. We must do this since the allocation is performed
 	 * by the logical I/O but the actual write is done by child I/Os.
 	 */
 	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
 	    zio->io_child_type == ZIO_CHILD_VDEV)
 		zio_dva_throttle_done(zio);
 
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
 	if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
 		ASSERT(zio->io_bp->blk_pad[0] == 0);
 		ASSERT(zio->io_bp->blk_pad[1] == 0);
 		ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy,
 		    sizeof (blkptr_t)) == 0 ||
 		    (zio->io_bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
 		    zio->io_bp_override == NULL &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT3U(zio->io_prop.zp_copies, <=,
 			    BP_GET_NDVAS(zio->io_bp));
 			ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
 			    (BP_COUNT_GANG(zio->io_bp) ==
 			    BP_GET_NDVAS(zio->io_bp)));
 		}
 		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
 			VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
 	}
 
 	/*
 	 * If there were child vdev/gang/ddt errors, they apply to us now.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
 	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
 	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
 
 	/*
 	 * If the I/O on the transformed data was successful, generate any
 	 * checksum reports now while we still have the transformed data.
 	 */
 	if (zio->io_error == 0) {
 		while (zio->io_cksum_report != NULL) {
 			zio_cksum_report_t *zcr = zio->io_cksum_report;
 			uint64_t align = zcr->zcr_align;
 			uint64_t asize = P2ROUNDUP(psize, align);
 			abd_t *adata = zio->io_abd;
 
 			if (adata != NULL && asize != psize) {
 				adata = abd_alloc(asize, B_TRUE);
 				abd_copy(adata, zio->io_abd, psize);
 				abd_zero_off(adata, psize, asize - psize);
 			}
 
 			zio->io_cksum_report = zcr->zcr_next;
 			zcr->zcr_next = NULL;
 			zcr->zcr_finish(zcr, adata);
 			zfs_ereport_free_checksum(zcr);
 
 			if (adata != NULL && asize != psize)
 				abd_free(adata);
 		}
 	}
 
 	zio_pop_transforms(zio);	/* note: may set zio->io_error */
 
 	vdev_stat_update(zio, psize);
 
 	/*
 	 * If this I/O is attached to a particular vdev and is slow, exceeding
 	 * 30 seconds to complete, post an error describing the I/O delay.
 	 * We ignore these errors if the device is currently unavailable.
 	 */
 	if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
 		if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
 			/*
 			 * We want to only increment our slow IO counters if
 			 * the IO is valid (i.e. not if the drive is removed).
 			 *
 			 * zfs_ereport_post() will also do these checks, but
 			 * it can also ratelimit and have other failures, so we
 			 * need to increment the slow_io counters independent
 			 * of it.
 			 */
 			if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
 			    zio->io_spa, zio->io_vd, zio)) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				zio->io_vd->vdev_stat.vs_slow_ios++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 
 				(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
 				    zio->io_spa, zio->io_vd, &zio->io_bookmark,
 				    zio, 0);
 			}
 		}
 	}
 
 	if (zio->io_error) {
 		/*
 		 * If this I/O is attached to a particular vdev,
 		 * generate an error message describing the I/O failure
 		 * at the block level.  We ignore these errors if the
 		 * device is currently unavailable.
 		 */
 		if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
 		    !vdev_is_dead(zio->io_vd) &&
 		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
 			int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
 			    zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
 			if (ret != EALREADY) {
 				mutex_enter(&zio->io_vd->vdev_stat_lock);
 				if (zio->io_type == ZIO_TYPE_READ)
 					zio->io_vd->vdev_stat.vs_read_errors++;
 				else if (zio->io_type == ZIO_TYPE_WRITE)
 					zio->io_vd->vdev_stat.vs_write_errors++;
 				mutex_exit(&zio->io_vd->vdev_stat_lock);
 			}
 		}
 
 		if ((zio->io_error == EIO || !(zio->io_flags &
 		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
 		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) &&
 		    zio == zio->io_logical) {
 			/*
 			 * For logical I/O requests, tell the SPA to log the
 			 * error and generate a logical data ereport.
 			 */
 			spa_log_error(zio->io_spa, &zio->io_bookmark,
 			    BP_GET_LOGICAL_BIRTH(zio->io_bp));
 			(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
 			    zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
 		}
 	}
 
 	if (zio->io_error && zio == zio->io_logical) {
 
 		/*
 		 * A DDT child tried to create a mixed gang/non-gang BP. We're
 		 * going to have to just retry as a non-dedup IO.
 		 */
 		if (zio->io_error == EAGAIN && IO_IS_ALLOCATING(zio) &&
 		    zio->io_prop.zp_dedup) {
 			zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			zio->io_prop.zp_dedup = B_FALSE;
 		}
 		/*
 		 * Determine whether zio should be reexecuted.  This will
 		 * propagate all the way to the root via zio_notify_parent().
 		 */
 		ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		if (IO_IS_ALLOCATING(zio) &&
 		    !(zio->io_flags & ZIO_FLAG_CANFAIL) &&
 		    !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) {
 			if (zio->io_error != ENOSPC)
 				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
 			else
 				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 		}
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
 		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
 			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
 
 		/*
 		 * Here is a possibly good place to attempt to do
 		 * either combinatorial reconstruction or error correction
 		 * based on checksums.  It also might be a good place
 		 * to send out preliminary ereports before we suspend
 		 * processing.
 		 */
 	}
 
 	/*
 	 * If there were logical child errors, they apply to us now.
 	 * We defer this until now to avoid conflating logical child
 	 * errors with errors that happened to the zio itself when
 	 * updating vdev stats and reporting FMA events above.
 	 */
 	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
 	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
 	/*
 	 * Godfather I/Os should never suspend.
 	 */
 	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
 	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
 		zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
 
 	if (zio->io_reexecute) {
 		/*
 		 * A Direct I/O operation that has a checksum verify error
 		 * should not attempt to reexecute. Instead, the error should
 		 * just be propagated back.
 		 */
 		ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR));
 
 		/*
 		 * This is a logical I/O that wants to reexecute.
 		 *
 		 * Reexecute is top-down.  When an i/o fails, if it's not
 		 * the root, it simply notifies its parent and sticks around.
 		 * The parent, seeing that it still has children in zio_done(),
 		 * does the same.  This percolates all the way up to the root.
 		 * The root i/o will reexecute or suspend the entire tree.
 		 *
 		 * This approach ensures that zio_reexecute() honors
 		 * all the original i/o dependency relationships, e.g.
 		 * parents not executing until children are ready.
 		 */
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 		zio->io_gang_leader = NULL;
 
 		mutex_enter(&zio->io_lock);
 		zio->io_state[ZIO_WAIT_DONE] = 1;
 		mutex_exit(&zio->io_lock);
 
 		/*
 		 * "The Godfather" I/O monitors its children but is
 		 * not a true parent to them. It will track them through
 		 * the pipeline but severs its ties whenever they get into
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
 		zl = NULL;
 		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
 		    pio = pio_next) {
 			zio_link_t *remove_zl = zl;
 			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
 				zio_remove_child(pio, zio, remove_zl);
 				/*
 				 * This is a rare code path, so we don't
 				 * bother with "next_to_execute".
 				 */
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
 				    NULL);
 			}
 		}
 
 		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
 			 * upward since we haven't permanently failed yet.
 			 */
 			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
 			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
 			/*
 			 * This is a rare code path, so we don't bother with
 			 * "next_to_execute".
 			 */
 			zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
 		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
 			/*
 			 * We'd fail again if we reexecuted now, so suspend
 			 * until conditions improve (e.g. device comes online).
 			 */
 			zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
 		} else {
 			/*
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
 			spa_taskq_dispatch(zio->io_spa,
 			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
 			    zio_reexecute, zio, B_FALSE);
 		}
 		return (NULL);
 	}
 
 	ASSERT(list_is_empty(&zio->io_child_list));
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
 	/*
 	 * Report any checksum errors, since the I/O is complete.
 	 */
 	while (zio->io_cksum_report != NULL) {
 		zio_cksum_report_t *zcr = zio->io_cksum_report;
 		zio->io_cksum_report = zcr->zcr_next;
 		zcr->zcr_next = NULL;
 		zcr->zcr_finish(zcr, NULL);
 		zfs_ereport_free_checksum(zcr);
 	}
 
 	/*
 	 * It is the responsibility of the done callback to ensure that this
 	 * particular zio is no longer discoverable for adoption, and as
 	 * such, cannot acquire any new parents.
 	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
 	/*
 	 * We are done executing this zio.  We may want to execute a parent
 	 * next.  See the comment in zio_notify_parent().
 	 */
 	zio_t *next_to_execute = NULL;
 	zl = NULL;
 	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
 		zio_link_t *remove_zl = zl;
 		pio_next = zio_walk_parents(zio, &zl);
 		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
 	}
 
 	if (zio->io_waiter != NULL) {
 		mutex_enter(&zio->io_lock);
 		zio->io_executor = NULL;
 		cv_broadcast(&zio->io_cv);
 		mutex_exit(&zio->io_lock);
 	} else {
 		zio_destroy(zio);
 	}
 
 	return (next_to_execute);
 }
 
 /*
  * ==========================================================================
  * I/O pipeline definition
  * ==========================================================================
  */
 /*
  * Table of pipeline stage handlers, one function pointer per slot.
  * Slot 0 holds NULL; the remaining slots run from zio_read_bp_init
  * through zio_done.
  *
  * NOTE(review): presumably this table is indexed by pipeline stage number,
  * in which case the entry order must stay in step with the stage
  * definitions -- confirm against the stage declarations before reordering.
  */
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,	/* slot 0: no handler */
 	zio_read_bp_init,
 	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
 	zio_write_compress,
 	zio_encrypt,
 	zio_checksum_generate,
 	zio_nop_write,
 	zio_brt_free,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
 	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
 	zio_ready,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
 	zio_checksum_verify,
 	zio_dio_checksum_verify,
 	zio_done	/* last slot */
 };
 
 
 
 
 /*
  * Compare two zbookmark_phys_t's to see which we would reach first in a
  * pre-order traversal of the object tree.
  *
  * This is simple in every case aside from the meta-dnode object. For all other
  * objects, we traverse them in order (object 1 before object 2, and so on).
  * However, all of these objects are traversed while traversing object 0, since
  * the data it points to is the list of objects.  Thus, we need to convert to a
  * canonical representation so we can compare meta-dnode bookmarks to
  * non-meta-dnode bookmarks.
  *
  * We do this by calculating "equivalents" for each field of the zbookmark.
  * zbookmarks outside of the meta-dnode use their own object and level, and
  * calculate the level 0 equivalent (the first L0 blkid that is contained in the
  * blocks this bookmark refers to) by multiplying their blkid by their span
  * (the number of L0 blocks contained within one block at their level).
  * zbookmarks inside the meta-dnode calculate their object equivalent
  * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
  * level + 1<<31 (any value larger than a level could ever be) for their level.
  * This causes them to always compare before a bookmark in their object
  * equivalent, compare appropriately to bookmarks in other objects, and to
  * compare appropriately to other bookmarks in the meta-dnode.
  */
 int
 zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
     const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
 {
 	/* Bookmarks that match field-for-field need no canonicalization. */
 	if (zb1->zb_object == zb2->zb_object &&
 	    zb1->zb_level == zb2->zb_level &&
 	    zb1->zb_blkid == zb2->zb_blkid)
 		return (0);
 
 	IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
 	IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
 
 	/*
 	 * Level-0 equivalent of each bookmark: its blkid scaled by the span
 	 * (in blocks) that BP_SPANB reports for the bookmark's level.
 	 */
 	uint64_t l0_a = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
 	uint64_t l0_b = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
 
 	/*
 	 * "Equivalent" object and level for the first bookmark.  Outside the
 	 * meta-dnode the bookmark's own object and level are used as-is.
 	 * Inside the meta-dnode the object is derived from the L0 equivalent,
 	 * the L0 equivalent itself becomes 0, and the level is offset by
 	 * COMPARE_META_LEVEL.
 	 */
 	uint64_t obj_a, lvl_a;
 	if (zb1->zb_object != DMU_META_DNODE_OBJECT) {
 		obj_a = zb1->zb_object;
 		lvl_a = zb1->zb_level;
 	} else {
 		obj_a = l0_a * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		l0_a = 0;
 		lvl_a = zb1->zb_level + COMPARE_META_LEVEL;
 	}
 
 	/* Same conversion for the second bookmark. */
 	uint64_t obj_b, lvl_b;
 	if (zb2->zb_object != DMU_META_DNODE_OBJECT) {
 		obj_b = zb2->zb_object;
 		lvl_b = zb2->zb_level;
 	} else {
 		obj_b = l0_b * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
 		l0_b = 0;
 		lvl_b = zb2->zb_level + COMPARE_META_LEVEL;
 	}
 
 	/*
 	 * Order by equivalent object, then by L0 equivalent, then by level.
 	 * For the level, the larger value sorts first.
 	 */
 	if (obj_a != obj_b)
 		return (obj_a > obj_b ? 1 : -1);
 	if (l0_a != l0_b)
 		return (l0_a > l0_b ? 1 : -1);
 	if (lvl_a != lvl_b)
 		return (lvl_a < lvl_b ? 1 : -1);
 
 	/*
 	 * Reaching this point is (theoretically) possible when the bookmarks
 	 * share an object and level but have different blkids, which requires
 	 * differing block sizes.  There is presently no way to change the
 	 * indirect block sizes.
 	 */
 	return (0);
 }
 
 /*
  *  This function checks the following: given that last_block is the place that
  *  our traversal stopped last time, does that guarantee that we've visited
  *  every node under subtree_root?  Therefore, we can't just use the raw output
  *  of zbookmark_compare.  We have to pass in a modified version of
  *  subtree_root; by incrementing the block id, and then checking whether
  *  last_block is before or equal to that, we can tell whether or not having
  *  visited last_block implies that all of subtree_root's children have been
  *  visited.
  */
 boolean_t
 zbookmark_subtree_completed(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	/* Work on a copy of subtree_root advanced by one block id. */
 	zbookmark_phys_t next_root = *subtree_root;
 	next_root.zb_blkid += 1;
 	ASSERT0(last_block->zb_level);
 
 	/* Without a dnode (the objset_phys_t) nothing counts as completed. */
 	if (dnp == NULL)
 		return (B_FALSE);
 
 	/*
 	 * Arguments for the last_block side of the comparison:
 	 *
 	 * - Data block size in sectors is always given as
 	 *   1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT).  It only matters
 	 *   when the bookmark lies in the meta-dnode; we cannot tell that
 	 *   without examining it, and the value is harmless otherwise.
 	 *
 	 * - Indirect block shift is given as 0.  last_block must be level 0,
 	 *   so its span is always 1 and the shift does not affect the math.
 	 *
 	 * Anyone changing zbookmark_compare() must re-check that these
 	 * assumptions still hold.
 	 */
 	int order = zbookmark_compare(dnp->dn_datablkszsec,
 	    dnp->dn_indblkshift,
 	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &next_root,
 	    last_block);
 
 	/* Completed when the advanced root is at or before last_block. */
 	return (order <= 0);
 }
 
 /*
  * This function is similar to zbookmark_subtree_completed(), but returns true
  * if subtree_root is equal or ahead of last_block, i.e. still to be done.
  */
 boolean_t
 zbookmark_subtree_tbd(const dnode_phys_t *dnp,
     const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
 {
 	ASSERT0(last_block->zb_level);
 
 	if (dnp != NULL) {
 		/*
 		 * Same argument choices as zbookmark_subtree_completed(),
 		 * but subtree_root is compared unmodified.
 		 */
 		int order = zbookmark_compare(dnp->dn_datablkszsec,
 		    dnp->dn_indblkshift,
 		    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0,
 		    subtree_root, last_block);
 
 		/* Still to be done when subtree_root is at or after it. */
 		return (order >= 0);
 	}
 
 	/* No dnode: report the subtree as not pending. */
 	return (B_FALSE);
 }
 
 /*
  * Symbols made available through EXPORT_SYMBOL.
  * NOTE(review): presumably these are consumed by code outside this file --
  * confirm the users before removing any entry.
  */
 EXPORT_SYMBOL(zio_type_name);
 EXPORT_SYMBOL(zio_buf_alloc);
 EXPORT_SYMBOL(zio_data_buf_alloc);
 EXPORT_SYMBOL(zio_buf_free);
 EXPORT_SYMBOL(zio_data_buf_free);
 
 /*
  * Module parameter declarations.  Each entry names a scope and prefix, the
  * parameter, its type (INT or UINT), an access mode, and a one-line
  * description string.
  * NOTE(review): ZMOD_RW presumably marks a parameter as writable at
  * runtime -- confirm against the ZFS_MODULE_PARAM definition.
  */
 ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
 	"Max I/O completion time (milliseconds) before marking it as slow");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
 	"Prioritize requeued I/O");
 
 /* The sync_pass_* parameters use the "zfs" scope and "zfs_" prefix. */
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free,  UINT, ZMOD_RW,
 	"Defer frees starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW,
 	"Don't compress starting in this pass");
 
 ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW,
 	"Rewrite new bps starting in this pass");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
 	"Throttle block allocations in the ZIO pipeline");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
 	"Log all slow ZIOs, not just those with vdevs");
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 9373b39a184a..d7f3c75c7948 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -1,1090 +1,1091 @@
 # SPDX-License-Identifier: CDDL-1.0
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # This run file contains all of the common functional tests.  When
 # adding a new test consider also adding it to the sanity.run file
 # if the new test runs to completion in only a few seconds.
 #
 # Approximate run time: 4-5 hours
 #
 
 [DEFAULT]
 pre = setup
 quiet = False
 pre_user = root
 user = root
 timeout = 600
 post_user = root
 post = cleanup
 failsafe_user = root
 failsafe = callbacks/zfs_failsafe
 tags = ['functional']
 
 [tests/functional/acl/off]
 tests = ['dosmode', 'posixmode']
 tags = ['functional', 'acl']
 
 [tests/functional/alloc_class]
 tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
     'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
     'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
     'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
     'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/append]
 tests = ['file_append', 'threadsappend_001_pos']
 tags = ['functional', 'append']
 
 [tests/functional/arc]
 tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos',
     'arcstats_runtime_tuning']
 tags = ['functional', 'arc']
 
 [tests/functional/atime]
 tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
 tags = ['functional', 'atime']
 
 [tests/functional/bclone]
 tests = ['bclone_crossfs_corner_cases_limited',
     'bclone_crossfs_data',
     'bclone_crossfs_embedded',
     'bclone_crossfs_hole',
     'bclone_diffprops_all',
     'bclone_diffprops_checksum',
     'bclone_diffprops_compress',
     'bclone_diffprops_copies',
     'bclone_diffprops_recordsize',
     'bclone_prop_sync',
     'bclone_samefs_corner_cases_limited',
     'bclone_samefs_data',
     'bclone_samefs_embedded',
     'bclone_samefs_hole']
 tags = ['functional', 'bclone']
 timeout = 7200
 
 [tests/functional/block_cloning]
 tests = ['block_cloning_clone_mmap_cached',
     'block_cloning_copyfilerange',
     'block_cloning_copyfilerange_partial',
     'block_cloning_copyfilerange_fallback',
     'block_cloning_disabled_copyfilerange',
     'block_cloning_copyfilerange_cross_dataset',
     'block_cloning_cross_enc_dataset',
     'block_cloning_copyfilerange_fallback_same_txg',
     'block_cloning_replay', 'block_cloning_replay_encrypted',
     'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
     'block_cloning_rlimit_fsize', 'block_cloning_large_offset']
 tags = ['functional', 'block_cloning']
 
 [tests/functional/bootfs]
 tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos',
     'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos',
     'bootfs_008_pos']
 tags = ['functional', 'bootfs']
 
 [tests/functional/btree]
 tests = ['btree_positive', 'btree_negative']
 tags = ['functional', 'btree']
 pre =
 post =
 
 [tests/functional/cache]
 tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg',
     'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg',
     'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos']
 tags = ['functional', 'cache']
 
 [tests/functional/cachefile]
 tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos',
     'cachefile_004_pos']
 tags = ['functional', 'cachefile']
 
 [tests/functional/casenorm]
 tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure',
     'sensitive_none_lookup', 'sensitive_none_delete',
     'sensitive_formd_lookup', 'sensitive_formd_delete',
     'insensitive_none_lookup', 'insensitive_none_delete',
     'insensitive_formd_lookup', 'insensitive_formd_delete',
     'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete',
     'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete']
 tags = ['functional', 'casenorm']
 
 [tests/functional/channel_program/lua_core]
 tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists',
     'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg',
     'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries',
     'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua',
     'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large',
     'tst.return_nvlist_neg', 'tst.return_nvlist_pos',
     'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout']
 tags = ['functional', 'channel_program', 'lua_core']
 
 [tests/functional/channel_program/synctask_core]
 tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit',
     'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg',
     'tst.get_number_props', 'tst.get_string_props', 'tst.get_type',
     'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks',
     'tst.list_children', 'tst.list_clones', 'tst.list_holds',
     'tst.list_snapshots', 'tst.list_system_props',
     'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict',
     'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult',
     'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg',
     'tst.snapshot_recursive', 'tst.snapshot_rename', 'tst.snapshot_simple',
     'tst.bookmark.create', 'tst.bookmark.copy',
     'tst.terminate_by_signal'
     ]
 tags = ['functional', 'channel_program', 'synctask_core']
 
 [tests/functional/checksum]
 tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test',
     'filetest_001_pos', 'filetest_002_pos']
 tags = ['functional', 'checksum']
 
 [tests/functional/clean_mirror]
 tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos',
     'clean_mirror_003_pos', 'clean_mirror_004_pos']
 tags = ['functional', 'clean_mirror']
 
 [tests/functional/cli_root/json]
 tests = ['json_sanity']
 tags = ['functional', 'cli_root', 'json']
 
 [tests/functional/cli_root/zinject]
 tests = ['zinject_args', 'zinject_counts', 'zinject_probe']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zinject']
 
 [tests/functional/cli_root/zdb]
 tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
     'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
     'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
     'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
     'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
 timeout = 1200
 
 [tests/functional/cli_root/zfs]
 tests = ['zfs_001_neg', 'zfs_002_pos']
 tags = ['functional', 'cli_root', 'zfs']
 
 [tests/functional/cli_root/zfs_bookmark]
 tests = ['zfs_bookmark_cliargs']
 tags = ['functional', 'cli_root', 'zfs_bookmark']
 
 [tests/functional/cli_root/zfs_change-key]
 tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format',
     'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location',
     'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones']
 tags = ['functional', 'cli_root', 'zfs_change-key']
 
 [tests/functional/cli_root/zfs_clone]
 tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos',
     'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos',
     'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg',
     'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested',
     'zfs_clone_rm_nested']
 tags = ['functional', 'cli_root', 'zfs_clone']
 
 [tests/functional/cli_root/zfs_copies]
 tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos',
     'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos']
 tags = ['functional', 'cli_root', 'zfs_copies']
 
 [tests/functional/cli_root/zfs_create]
 tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
     'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos',
     'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg',
     'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos',
     'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted',
     'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount',
     'zfs_create_verbose']
 tags = ['functional', 'cli_root', 'zfs_create']
 
 [tests/functional/cli_root/zpool_prefetch]
 tests = ['zpool_prefetch_001_pos']
 tags = ['functional', 'cli_root', 'zpool_prefetch']
 
 [tests/functional/cli_root/zfs_destroy]
 tests = ['zfs_clone_livelist_condense_and_disable',
     'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
     'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
     'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
     'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
     'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
     'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos',
     'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist',
     'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense']
 tags = ['functional', 'cli_root', 'zfs_destroy']
 
 [tests/functional/cli_root/zfs_diff]
 tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp',
     'zfs_diff_types', 'zfs_diff_encrypted', 'zfs_diff_mangle']
 tags = ['functional', 'cli_root', 'zfs_diff']
 
 [tests/functional/cli_root/zfs_get]
 tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos',
     'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg',
     'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg']
 tags = ['functional', 'cli_root', 'zfs_get']
 
 [tests/functional/cli_root/zfs_ids_to_path]
 tests = ['zfs_ids_to_path_001_pos']
 tags = ['functional', 'cli_root', 'zfs_ids_to_path']
 
 [tests/functional/cli_root/zfs_inherit]
 tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos',
     'zfs_inherit_mountpoint']
 tags = ['functional', 'cli_root', 'zfs_inherit']
 
 [tests/functional/cli_root/zfs_load-key]
 tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file',
     'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop',
     'zfs_load-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_load-key']
 
 [tests/functional/cli_root/zfs_mount]
 tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted',
     'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
     'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
 tests = ['zfs_program_json']
 tags = ['functional', 'cli_root', 'zfs_program']
 
 [tests/functional/cli_root/zfs_promote]
 tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos',
     'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg',
     'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot']
 tags = ['functional', 'cli_root', 'zfs_promote']
 
 [tests/functional/cli_root/zfs_property]
 tests = ['zfs_written_property_001_pos']
 tags = ['functional', 'cli_root', 'zfs_property']
 
 [tests/functional/cli_root/zfs_receive]
 tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
     'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos',
     'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
     'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos',
     'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos',
     'zfs_receive_016_pos', 'receive-o-x_props_override',
     'receive-o-x_props_aliases',
     'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted',
     'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e',
     'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props',
     'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective',
     'zfs_receive_compressed_corrective', 'zfs_receive_large_block_corrective']
 tags = ['functional', 'cli_root', 'zfs_receive']
 
 [tests/functional/cli_root/zfs_rename]
 tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos',
     'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos',
     'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg',
     'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg',
     'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child',
     'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount']
 tags = ['functional', 'cli_root', 'zfs_rename']
 
 [tests/functional/cli_root/zfs_reservation]
 tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
 tags = ['functional', 'cli_root', 'zfs_reservation']
 
 [tests/functional/cli_root/zfs_rollback]
 tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos',
     'zfs_rollback_003_neg', 'zfs_rollback_004_neg']
 tags = ['functional', 'cli_root', 'zfs_rollback']
 
 [tests/functional/cli_root/zfs_send]
 tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
     'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos',
     'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_encrypted_unloaded',
     'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing']
 tags = ['functional', 'cli_root', 'zfs_send']
 
 [tests/functional/cli_root/zfs_set]
 tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
     'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos',
     'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos',
     'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos',
     'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos',
     'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos',
     'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg',
     'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos',
     'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation',
     'zfs_set_feature_activation', 'zfs_set_nomount']
 tags = ['functional', 'cli_root', 'zfs_set']
 
 [tests/functional/cli_root/zfs_share]
 tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos',
     'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg',
     'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares',
     'zfs_share_after_mount']
 tags = ['functional', 'cli_root', 'zfs_share']
 
 [tests/functional/cli_root/zfs_snapshot]
 tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg',
     'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg',
     'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg',
     'zfs_snapshot_009_pos']
 tags = ['functional', 'cli_root', 'zfs_snapshot']
 
 [tests/functional/cli_root/zfs_unload-key]
 tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_unload-key']
 
 [tests/functional/cli_root/zfs_unmount]
 tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos',
     'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos',
     'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos',
     'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys']
 tags = ['functional', 'cli_root', 'zfs_unmount']
 
 [tests/functional/cli_root/zfs_unshare]
 tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos',
     'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos',
     'zfs_unshare_007_pos']
 tags = ['functional', 'cli_root', 'zfs_unshare']
 
 [tests/functional/cli_root/zfs_upgrade]
 tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos',
     'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg',
     'zfs_upgrade_007_neg']
 tags = ['functional', 'cli_root', 'zfs_upgrade']
 
 [tests/functional/cli_root/zfs_wait]
 tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt']
 tags = ['functional', 'cli_root', 'zfs_wait']
 
 [tests/functional/cli_root/zhack]
 tests = ['zhack_label_repair_001', 'zhack_label_repair_002',
     'zhack_label_repair_003', 'zhack_label_repair_004']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zhack']
 
 [tests/functional/cli_root/zpool]
 tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors']
 tags = ['functional', 'cli_root', 'zpool']
 
 [tests/functional/cli_root/zpool_add]
 tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos',
     'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
     'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos',
     'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_add']
 
 [tests/functional/cli_root/zpool_attach]
 tests = ['zpool_attach_001_neg', 'attach-o_ashift']
 tags = ['functional', 'cli_root', 'zpool_attach']
 
 [tests/functional/cli_root/zpool_clear]
 tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg',
     'zpool_clear_readonly']
 tags = ['functional', 'cli_root', 'zpool_clear']
 
 [tests/functional/cli_root/zpool_create]
 tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
     'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos',
     'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos',
     'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg',
     'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg',
     'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos',
     'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos',
     'zpool_create_023_neg', 'zpool_create_024_pos',
     'zpool_create_encrypted', 'zpool_create_crypt_combos',
     'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos',
     'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos',
     'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
     'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
     'zpool_create_features_005_pos', 'zpool_create_features_006_pos',
     'zpool_create_features_007_pos', 'zpool_create_features_008_pos',
     'zpool_create_features_009_pos', 'create-o_ashift',
     'zpool_create_tempname', 'zpool_create_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_create']
 
 [tests/functional/cli_root/zpool_destroy]
 tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos',
     'zpool_destroy_003_neg']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_destroy']
 
 [tests/functional/cli_root/zpool_detach]
 tests = ['zpool_detach_001_neg']
 tags = ['functional', 'cli_root', 'zpool_detach']
 
 [tests/functional/cli_root/zpool_events]
 tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow',
     'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates',
     'zpool_events_clear_retained']
 tags = ['functional', 'cli_root', 'zpool_events']
 
 [tests/functional/cli_root/zpool_export]
 tests = ['zpool_export_001_pos', 'zpool_export_002_pos',
     'zpool_export_003_neg', 'zpool_export_004_pos',
     'zpool_export_parallel_pos', 'zpool_export_parallel_admin']
 tags = ['functional', 'cli_root', 'zpool_export']
 
 [tests/functional/cli_root/zpool_get]
 tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos',
     'zpool_get_004_neg', 'zpool_get_005_pos', 'vdev_get_001_pos',
     'vdev_get_all']
 tags = ['functional', 'cli_root', 'zpool_get']
 
 [tests/functional/cli_root/zpool_history]
 tests = ['zpool_history_001_neg', 'zpool_history_002_pos']
 tags = ['functional', 'cli_root', 'zpool_history']
 
 [tests/functional/cli_root/zpool_import]
 tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
     'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos',
     'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos',
     'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg',
     'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos',
     'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos',
     'zpool_import_features_001_pos', 'zpool_import_features_002_neg',
     'zpool_import_features_003_pos', 'zpool_import_missing_001_pos',
     'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos',
     'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
     'zpool_import_encrypted', 'zpool_import_encrypted_load',
     'zpool_import_errata3', 'zpool_import_errata4',
     'import_cachefile_device_added',
     'import_cachefile_device_removed',
     'import_cachefile_device_replaced',
     'import_cachefile_mirror_attached',
     'import_cachefile_mirror_detached',
     'import_cachefile_paths_changed',
     'import_cachefile_shared_device',
     'import_devices_missing', 'import_log_missing',
     'import_paths_changed',
     'import_rewind_config_changed',
     'import_rewind_device_replaced',
     'zpool_import_status', 'zpool_import_parallel_pos',
     'zpool_import_parallel_neg', 'zpool_import_parallel_admin']
 tags = ['functional', 'cli_root', 'zpool_import']
 timeout = 1200
 
 [tests/functional/cli_root/zpool_labelclear]
 tests = ['zpool_labelclear_active', 'zpool_labelclear_exported',
     'zpool_labelclear_removed', 'zpool_labelclear_valid']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_labelclear']
 
 [tests/functional/cli_root/zpool_initialize]
 tests = ['zpool_initialize_attach_detach_add_remove',
     'zpool_initialize_fault_export_import_online',
     'zpool_initialize_import_export',
     'zpool_initialize_offline_export_import_online',
     'zpool_initialize_online_offline',
     'zpool_initialize_split',
     'zpool_initialize_start_and_cancel_neg',
     'zpool_initialize_start_and_cancel_pos',
     'zpool_initialize_suspend_resume',
     'zpool_initialize_uninit',
     'zpool_initialize_unsupported_vdevs',
     'zpool_initialize_verify_checksums',
     'zpool_initialize_verify_initialized']
 pre =
 tags = ['functional', 'cli_root', 'zpool_initialize']
 
 [tests/functional/cli_root/zpool_offline]
 tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg',
     'zpool_offline_003_pos']
 tags = ['functional', 'cli_root', 'zpool_offline']
 
 [tests/functional/cli_root/zpool_online]
 tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
 tags = ['functional', 'cli_root', 'zpool_online']
 
 [tests/functional/cli_root/zpool_reguid]
 tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg']
 tags = ['functional', 'cli_root', 'zpool_reguid']
 
 [tests/functional/cli_root/zpool_remove]
 tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
     'zpool_remove_003_pos']
 tags = ['functional', 'cli_root', 'zpool_remove']
 
 [tests/functional/cli_root/zpool_replace]
 tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
 tags = ['functional', 'cli_root', 'zpool_replace']
 
 [tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart',
     'zpool_resilver_concurrent']
 tags = ['functional', 'cli_root', 'zpool_resilver']
 
 [tests/functional/cli_root/zpool_scrub]
 tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
     'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
     'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
     'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
     'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
     'user_property_001_pos', 'user_property_002_neg',
     'zpool_set_clear_userprop']
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
 tests = ['zpool_split_cliargs', 'zpool_split_devices',
     'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs',
     'zpool_split_resilver', 'zpool_split_indirect',
     'zpool_split_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_split']
 
 [tests/functional/cli_root/zpool_status]
 tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
     'zpool_status_003_pos', 'zpool_status_004_pos',
     'zpool_status_005_pos', 'zpool_status_006_pos',
     'zpool_status_007_pos', 'zpool_status_008_pos',
     'zpool_status_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_status']
 
 [tests/functional/cli_root/zpool_sync]
 tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg']
 tags = ['functional', 'cli_root', 'zpool_sync']
 
 [tests/functional/cli_root/zpool_trim]
 tests = ['zpool_trim_attach_detach_add_remove',
     'zpool_trim_fault_export_import_online',
     'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg',
     'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline',
     'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg',
     'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg',
     'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume',
     'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums',
     'zpool_trim_verify_trimmed']
 tags = ['functional', 'zpool_trim']
 
 [tests/functional/cli_root/zpool_upgrade]
 tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos',
     'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos',
     'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg',
     'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos',
     'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_upgrade']
 
 [tests/functional/cli_root/zpool_wait]
 tests = ['zpool_wait_discard', 'zpool_wait_freeing',
     'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel',
     'zpool_wait_initialize_flag', 'zpool_wait_multiple',
     'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel',
     'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag',
     'zpool_wait_usage']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_root/zpool_wait/scan]
 tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild',
     'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
     'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_user/misc]
 tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg',
     'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg',
     'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg',
     'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg',
     'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg',
     'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg',
     'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg',
     'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg',
     'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg',
     'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg',
     'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg',
     'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg',
     'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg',
     'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos',
     'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege',
     'zilstat_001_pos']
 user =
 tags = ['functional', 'cli_user', 'misc']
 
 [tests/functional/cli_user/zfs_list]
 tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos',
     'zfs_list_004_neg', 'zfs_list_005_neg', 'zfs_list_007_pos',
     'zfs_list_008_neg']
 user =
 tags = ['functional', 'cli_user', 'zfs_list']
 
 [tests/functional/cli_user/zpool_iostat]
 tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
     'zpool_iostat_003_neg', 'zpool_iostat_004_pos',
     'zpool_iostat_005_pos', 'zpool_iostat_-c_disable',
     'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_iostat']
 
 [tests/functional/cli_user/zpool_list]
 tests = ['zpool_list_001_pos', 'zpool_list_002_neg']
 user =
 tags = ['functional', 'cli_user', 'zpool_list']
 
 [tests/functional/cli_user/zpool_status]
 tests = ['zpool_status_003_pos', 'zpool_status_-c_disable',
     'zpool_status_-c_homedir', 'zpool_status_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_status']
 
 [tests/functional/compression]
 tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
     'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled',
     'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc']
 tags = ['functional', 'compression']
 
 [tests/functional/cp_files]
 tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
 tags = ['functional', 'cp_files']
 
 [tests/functional/zap_shrink]
 tests = ['zap_shrink_001_pos']
 tags = ['functional', 'zap_shrink']
 
 [tests/functional/crtime]
 tests = ['crtime_001_pos' ]
 tags = ['functional', 'crtime']
 
 [tests/functional/crypto]
 tests = ['icp_aes_ccm', 'icp_aes_gcm']
 pre =
 post =
 tags = ['functional', 'crypto']
 
 [tests/functional/ctime]
 tests = ['ctime_001_pos' ]
 tags = ['functional', 'ctime']
 
 [tests/functional/deadman]
 tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio']
 pre =
 post =
 tags = ['functional', 'deadman']
 
 [tests/functional/dedup]
 tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing',
     'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade',
     'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_zap_shrink']
 pre =
 post =
 tags = ['functional', 'dedup']
 
 [tests/functional/delegate]
 tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
     'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',
     'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg',
     'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg',
     'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos',
     'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos',
     'zfs_unallow_007_neg', 'zfs_unallow_008_neg']
 tags = ['functional', 'delegate']
 
 [tests/functional/direct]
 tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
     'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block',
     'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites',
     'dio_property', 'dio_random', 'dio_read_verify', 'dio_recordsize',
     'dio_unaligned_block', 'dio_unaligned_filesize']
 tags = ['functional', 'direct']
 
 [tests/functional/exec]
 tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
 
 [tests/functional/fallocate]
 tests = ['fallocate_punch-hole']
 tags = ['functional', 'fallocate']
 
 [tests/functional/features/async_destroy]
 tests = ['async_destroy_001_pos']
 tags = ['functional', 'features', 'async_destroy']
 
 [tests/functional/features/large_dnode]
 tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
     'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos']
 tags = ['functional', 'features', 'large_dnode']
 
 [tests/functional/gang_blocks]
-tests = ['gang_blocks_redundant', 'gang_blocks_ddt_copies']
+tests = ['gang_blocks_001_pos', 'gang_blocks_redundant',
+    'gang_blocks_ddt_copies']
 tags = ['functional', 'gang_blocks']
 
 [tests/functional/grow]
 pre =
 post =
 tests = ['grow_pool_001_pos', 'grow_replicas_001_pos']
 tags = ['functional', 'grow']
 
 [tests/functional/history]
 tests = ['history_001_pos', 'history_002_pos', 'history_003_pos',
     'history_004_pos', 'history_005_neg', 'history_006_neg',
     'history_007_pos', 'history_008_pos', 'history_009_pos',
     'history_010_pos']
 tags = ['functional', 'history']
 
 [tests/functional/hkdf]
 pre =
 post =
 tests = ['hkdf_test']
 tags = ['functional', 'hkdf']
 
 [tests/functional/inheritance]
 tests = ['inherit_001_pos']
 pre =
 tags = ['functional', 'inheritance']
 
 [tests/functional/io]
 tests = ['mmap', 'posixaio', 'psync', 'sync']
 tags = ['functional', 'io']
 
 [tests/functional/inuse]
 tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos']
 post =
 tags = ['functional', 'inuse']
 
 [tests/functional/large_files]
 tests = ['large_files_001_pos', 'large_files_002_pos']
 tags = ['functional', 'large_files']
 
 [tests/functional/limits]
 tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count',
     'snapshot_limit']
 tags = ['functional', 'limits']
 
 [tests/functional/link_count]
 tests = ['link_count_001', 'link_count_root_inode']
 tags = ['functional', 'link_count']
 
 [tests/functional/migration]
 tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
     'migration_004_pos', 'migration_005_pos', 'migration_006_pos',
     'migration_007_pos', 'migration_008_pos', 'migration_009_pos',
     'migration_010_pos', 'migration_011_pos', 'migration_012_pos']
 tags = ['functional', 'migration']
 
 [tests/functional/mmap]
 tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
     'mmap_sync_001_pos', 'mmap_write_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]
 tests = ['umount_001', 'umountall_001']
 tags = ['functional', 'mount']
 
 [tests/functional/mv_files]
 tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation']
 tags = ['functional', 'mv_files']
 
 [tests/functional/nestedfs]
 tests = ['nestedfs_001_pos']
 tags = ['functional', 'nestedfs']
 
 [tests/functional/no_space]
 tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos',
     'enospc_df', 'enospc_ganging', 'enospc_rm']
 tags = ['functional', 'no_space']
 
 [tests/functional/nopwrite]
 tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative',
     'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync',
     'nopwrite_varying_compression', 'nopwrite_volume']
 tags = ['functional', 'nopwrite']
 
 [tests/functional/online_offline]
 tests = ['online_offline_001_pos', 'online_offline_002_neg',
     'online_offline_003_neg']
 tags = ['functional', 'online_offline']
 
 [tests/functional/pool_checkpoint]
 tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind',
     'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard',
     'checkpoint_discard_busy', 'checkpoint_discard_many',
     'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz',
     'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind',
     'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice',
     'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat']
 tags = ['functional', 'pool_checkpoint']
 timeout = 1800
 
 [tests/functional/pool_names]
 tests = ['pool_names_001_pos', 'pool_names_002_neg']
 pre =
 post =
 tags = ['functional', 'pool_names']
 
 [tests/functional/poolversion]
 tests = ['poolversion_001_pos', 'poolversion_002_pos']
 tags = ['functional', 'poolversion']
 
 [tests/functional/pyzfs]
 tests = ['pyzfs_unittest']
 pre =
 post =
 tags = ['functional', 'pyzfs']
 
 [tests/functional/quota]
 tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos',
          'quota_004_pos', 'quota_005_pos', 'quota_006_neg']
 tags = ['functional', 'quota']
 
 [tests/functional/redacted_send]
 tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted',
     'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes',
     'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones',
     'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative',
     'redacted_origin', 'redacted_panic', 'redacted_props', 'redacted_resume',
     'redacted_size', 'redacted_volume']
 tags = ['functional', 'redacted_send']
 
 [tests/functional/raidz]
 tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_expand_001_pos',
     'raidz_expand_002_pos', 'raidz_expand_003_neg', 'raidz_expand_003_pos',
     'raidz_expand_004_pos', 'raidz_expand_005_pos', 'raidz_expand_006_neg',
     'raidz_expand_007_neg']
 tags = ['functional', 'raidz']
 timeout = 1200
 
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
     'redundancy_draid3', 'redundancy_draid_damaged1',
     'redundancy_draid_damaged2', 'redundancy_draid_spare1',
     'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
     'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
     'redundancy_raidz3', 'redundancy_stripe']
 tags = ['functional', 'redundancy']
 timeout = 1200
 
 [tests/functional/refquota]
 tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
     'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg',
     'refquota_007_neg', 'refquota_008_neg']
 tags = ['functional', 'refquota']
 
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
     'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
     'refreserv_raidz']
 tags = ['functional', 'refreserv']
 
 [tests/functional/removal]
 pre =
 tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space',
     'removal_condense_export', 'removal_multiple_indirection',
     'removal_nopwrite', 'removal_remap_deadlists',
     'removal_resume_export', 'removal_sanity', 'removal_with_add',
     'removal_with_create_fs', 'removal_with_dedup',
     'removal_with_errors', 'removal_with_export', 'removal_with_indirect',
     'removal_with_ganging', 'removal_with_faulted',
     'removal_with_remove', 'removal_with_scrub', 'removal_with_send',
     'removal_with_send_recv', 'removal_with_snapshot',
     'removal_with_write', 'removal_with_zdb', 'remove_expanded',
     'remove_mirror', 'remove_mirror_sanity', 'remove_raidz',
     'remove_indirect', 'remove_attach_mirror', 'removal_reservation',
     'removal_with_hole']
 tags = ['functional', 'removal']
 
 [tests/functional/rename_dirs]
 tests = ['rename_dirs_001_pos']
 tags = ['functional', 'rename_dirs']
 
 [tests/functional/replacement]
 tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
     'attach_resilver', 'detach', 'rebuild_disabled_feature',
     'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
     'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
     'scrub_cancel']
 tags = ['functional', 'replacement']
 
 [tests/functional/reservation]
 tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
     'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos',
     'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos',
     'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos',
     'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos',
     'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos',
     'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg',
     'reservation_022_pos']
 tags = ['functional', 'reservation']
 
 [tests/functional/rootpool]
 tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos']
 tags = ['functional', 'rootpool']
 
 [tests/functional/rsend]
 tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
     'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos',
     'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos',
     'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos',
     'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos',
     'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'rsend_025_pos',
     'rsend_026_neg', 'rsend_027_pos', 'rsend_028_neg', 'rsend_029_neg',
     'rsend_030_pos', 'rsend_031_pos', 'send-c_verify_ratio',
     'send-c_verify_contents', 'send-c_props', 'send-c_incremental',
     'send-c_volume', 'send-c_zstream_recompress', 'send-c_zstreamdump',
     'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
     'send-c_mixed_compression', 'send-c_stream_size_estimate',
     'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
     'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_incremental',
     'send_encrypted_freeobjects', 'send_encrypted_hierarchy',
     'send_encrypted_props', 'send_encrypted_truncated_files',
     'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files',
     'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw',
     'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid',
     'send_doall', 'send_raw_spill_block', 'send_raw_ashift',
     'send_raw_large_blocks']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
 tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
     'scrub_mirror_003_pos', 'scrub_mirror_004_pos']
 tags = ['functional', 'scrub_mirror']
 
 [tests/functional/slog]
 tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
     'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
     'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
     'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001',
     'slog_replay_fs_002', 'slog_replay_volume', 'slog_016_pos']
 tags = ['functional', 'slog']
 
 [tests/functional/snapshot]
 tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
     'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos',
     'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
     'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
     'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
     'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos',
     'snapshot_017_pos', 'snapshot_018_pos']
 tags = ['functional', 'snapshot']
 
 [tests/functional/snapused]
 tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos',
     'snapused_004_pos', 'snapused_005_pos']
 tags = ['functional', 'snapused']
 
 [tests/functional/sparse]
 tests = ['sparse_001_pos']
 tags = ['functional', 'sparse']
 
 [tests/functional/stat]
 tests = ['stat_001_pos', 'statx_dioalign']
 tags = ['functional', 'stat']
 
 [tests/functional/suid]
 tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid',
     'suid_write_to_none', 'suid_write_zil_replay']
 tags = ['functional', 'suid']
 
 [tests/functional/trim]
 tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity',
     'trim_integrity', 'trim_config', 'trim_l2arc']
 tags = ['functional', 'trim']
 
 [tests/functional/truncate]
 tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps']
 tags = ['functional', 'truncate']
 
 [tests/functional/upgrade]
 tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool']
 tags = ['functional', 'upgrade']
 
 [tests/functional/userquota]
 tests = [
     'defaultuserquota_001_pos', 'defaultuserquota_002_pos',
     'defaultuserquota_003_pos', 'defaultuserquota_004_neg',
     'defaultuserquota_005_pos', 'defaultuserquota_006_pos',
     'defaultuserquota_007_pos', 'defaultuserquota_008_pos',
     'defaultuserquota_009_pos', 'defaultuserquota_010_neg',
     'defaultuserquota_011_neg', 'defaultuserquota_012_neg',
     'defaultuserquota_013_neg',
     'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos',
     'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos',
     'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos',
     'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg',
     'userspace_001_pos', 'userspace_002_pos', 'userspace_004_pos',
     'userspace_encrypted', 'userspace_send_encrypted',
     'userspace_encrypted_13709']
 tags = ['functional', 'userquota']
 
 [tests/functional/vdev_disk:Linux]
 pre =
 post =
 tests = ['page_alignment']
 tags = ['functional', 'vdev_disk']
 
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
     'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
     'vdev_zaps_007_pos']
 tags = ['functional', 'vdev_zaps']
 
 [tests/functional/write_dirs]
 tests = ['write_dirs_001_pos', 'write_dirs_002_pos']
 tags = ['functional', 'write_dirs']
 
 [tests/functional/xattr]
 tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos',
     'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg',
     'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat']
 tags = ['functional', 'xattr']
 
 [tests/functional/zvol/zvol_ENOSPC]
 tests = ['zvol_ENOSPC_001_pos']
 tags = ['functional', 'zvol', 'zvol_ENOSPC']
 
 [tests/functional/zvol/zvol_cli]
 tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg']
 tags = ['functional', 'zvol', 'zvol_cli']
 
 [tests/functional/zvol/zvol_misc]
 tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
     'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil']
 tags = ['functional', 'zvol', 'zvol_misc']
 
 [tests/functional/zvol/zvol_stress]
 tests = ['zvol_stress']
 tags = ['functional', 'zvol', 'zvol_stress']
 
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
 tags = ['functional', 'zvol', 'zvol_swap']
 
 [tests/functional/libzfs]
 tests = ['many_fds', 'libzfs_input']
 tags = ['functional', 'libzfs']
 
 [tests/functional/log_spacemap]
 tests = ['log_spacemap_import_logs']
 pre =
 post =
 tags = ['functional', 'log_spacemap']
 
 [tests/functional/l2arc]
 tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos',
     'persist_l2arc_001_pos', 'persist_l2arc_002_pos',
     'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos']
 tags = ['functional', 'l2arc']
 
 [tests/functional/zpool_influxdb]
 tests = ['zpool_influxdb']
 tags = ['functional', 'zpool_influxdb']
diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run
index ddd2d431a5b6..6362a2606260 100644
--- a/tests/runfiles/sanity.run
+++ b/tests/runfiles/sanity.run
@@ -1,640 +1,644 @@
 # SPDX-License-Identifier: CDDL-1.0
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # This run file contains a subset of functional tests which exercise
 # as much functionality as possible while still executing relatively
 # quickly.  The included tests should take no more than a few seconds
 # each to run at most.  This provides a convenient way to sanity test a
 # change before committing to a full test run which takes several hours.
 #
 # Approximate run time: 15 minutes
 #
 
 [DEFAULT]
 pre = setup
 quiet = False
 pre_user = root
 user = root
 timeout = 180
 post_user = root
 post = cleanup
 failsafe_user = root
 failsafe = callbacks/zfs_failsafe
 tags = ['functional']
 
 [tests/functional/acl/off]
 tests = ['posixmode']
 tags = ['functional', 'acl']
 
 [tests/functional/alloc_class]
 tests = ['alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos',
     'alloc_class_006_pos', 'alloc_class_008_pos', 'alloc_class_010_pos',
     'alloc_class_011_neg']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/arc]
 tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'arcstats_runtime_tuning']
 tags = ['functional', 'arc']
 
 [tests/functional/bootfs]
 tests = ['bootfs_004_neg', 'bootfs_007_pos']
 tags = ['functional', 'bootfs']
 
 [tests/functional/cache]
 tests = ['cache_004_neg', 'cache_005_neg', 'cache_007_neg', 'cache_010_pos']
 tags = ['functional', 'cache']
 
 [tests/functional/cachefile]
 tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos',
     'cachefile_004_pos']
 tags = ['functional', 'cachefile']
 
 [tests/functional/casenorm]
 tests = ['case_all_values', 'norm_all_values', 'sensitive_none_lookup',
     'sensitive_none_delete', 'insensitive_none_lookup',
     'insensitive_none_delete', 'mixed_none_lookup', 'mixed_none_delete']
 tags = ['functional', 'casenorm']
 
 [tests/functional/channel_program/lua_core]
 tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists',
     'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg',
     'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries',
     'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua',
     'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large',
     'tst.return_nvlist_neg', 'tst.return_nvlist_pos',
     'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout']
 tags = ['functional', 'channel_program', 'lua_core']
 
 [tests/functional/channel_program/synctask_core]
 tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit',
     'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg',
     'tst.get_number_props', 'tst.get_string_props', 'tst.get_type',
     'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks',
     'tst.list_children', 'tst.list_clones', 'tst.list_holds',
     'tst.list_snapshots', 'tst.list_system_props',
     'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict',
     'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult',
     'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy',
     'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple',
     'tst.bookmark.create', 'tst.bookmark.copy']
 tags = ['functional', 'channel_program', 'synctask_core']
 
 [tests/functional/cli_root/zdb]
 tests = ['zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
 
 [tests/functional/cli_root/zfs]
 tests = ['zfs_001_neg', 'zfs_002_pos']
 tags = ['functional', 'cli_root', 'zfs']
 
 [tests/functional/cli_root/zfs_bookmark]
 tests = ['zfs_bookmark_cliargs']
 tags = ['functional', 'cli_root', 'zfs_bookmark']
 
 [tests/functional/cli_root/zfs_change-key]
 tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format',
     'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location',
     'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones']
 tags = ['functional', 'cli_root', 'zfs_change-key']
 
 [tests/functional/cli_root/zfs_clone]
 tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos',
     'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos',
     'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg',
     'zfs_clone_encrypted']
 tags = ['functional', 'cli_root', 'zfs_clone']
 
 [tests/functional/cli_root/zfs_create]
 tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
     'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos',
     'zfs_create_007_pos', 'zfs_create_011_pos', 'zfs_create_012_pos',
     'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted',
     'zfs_create_dryrun', 'zfs_create_verbose']
 tags = ['functional', 'cli_root', 'zfs_create']
 
 [tests/functional/cli_root/zfs_destroy]
 tests = ['zfs_destroy_002_pos', 'zfs_destroy_003_pos',
     'zfs_destroy_004_pos', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg',
     'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos',
     'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg',
     'zfs_destroy_014_pos', 'zfs_destroy_dev_removal',
     'zfs_destroy_dev_removal_condense']
 tags = ['functional', 'cli_root', 'zfs_destroy']
 
 [tests/functional/cli_root/zfs_diff]
 tests = ['zfs_diff_cliargs', 'zfs_diff_encrypted']
 tags = ['functional', 'cli_root', 'zfs_diff']
 
 [tests/functional/cli_root/zfs_get]
 tests = ['zfs_get_003_pos', 'zfs_get_006_neg', 'zfs_get_007_neg',
     'zfs_get_010_neg']
 tags = ['functional', 'cli_root', 'zfs_get']
 
 [tests/functional/cli_root/zfs_inherit]
 tests = ['zfs_inherit_001_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint']
 tags = ['functional', 'cli_root', 'zfs_inherit']
 
 [tests/functional/cli_root/zfs_load-key]
 tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file',
     'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop',
     'zfs_load-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_load-key']
 
 [tests/functional/cli_root/zfs_mount]
 tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
     'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
     'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
 tests = ['zfs_program_json']
 tags = ['functional', 'cli_root', 'zfs_program']
 
 [tests/functional/cli_root/zfs_promote]
 tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos',
     'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg',
     'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot']
 tags = ['functional', 'cli_root', 'zfs_promote']
 
 [tests/functional/cli_root/zfs_receive]
 tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
     'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos',
     'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
     'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos',
     'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos',
     'zfs_receive_016_pos', 'zfs_receive_from_encrypted',
     'zfs_receive_to_encrypted', 'zfs_receive_raw',
     'zfs_receive_raw_incremental', 'zfs_receive_-e',
     'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props']
 tags = ['functional', 'cli_root', 'zfs_receive']
 
 [tests/functional/cli_root/zfs_rename]
 tests = ['zfs_rename_003_pos', 'zfs_rename_004_neg',
     'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos',
     'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg',
     'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos',
     'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted',
     'zfs_rename_mountpoint', 'zfs_rename_nounmount']
 tags = ['functional', 'cli_root', 'zfs_rename']
 
 [tests/functional/cli_root/zfs_reservation]
 tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
 tags = ['functional', 'cli_root', 'zfs_reservation']
 
 [tests/functional/cli_root/zfs_rollback]
 tests = ['zfs_rollback_003_neg', 'zfs_rollback_004_neg']
 tags = ['functional', 'cli_root', 'zfs_rollback']
 
 [tests/functional/cli_root/zfs_send]
 tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
     'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_encrypted',
     'zfs_send_raw']
 tags = ['functional', 'cli_root', 'zfs_send']
 
 [tests/functional/cli_root/zfs_set]
 tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
     'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos',
     'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos',
     'mountpoint_002_pos', 'user_property_002_pos',
     'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos',
     'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos',
     'user_property_004_pos', 'version_001_neg',
     'zfs_set_003_neg', 'property_alias_001_pos',
     'zfs_set_keylocation', 'zfs_set_feature_activation', 'zfs_set_nomount']
 tags = ['functional', 'cli_root', 'zfs_set']
 
 [tests/functional/cli_root/zfs_snapshot]
 tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg',
     'zfs_snapshot_003_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg']
 tags = ['functional', 'cli_root', 'zfs_snapshot']
 
 [tests/functional/cli_root/zfs_unload-key]
 tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_unload-key']
 
 [tests/functional/cli_root/zfs_unmount]
 tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos',
     'zfs_unmount_004_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg',
     'zfs_unmount_009_pos', 'zfs_unmount_unload_keys']
 tags = ['functional', 'cli_root', 'zfs_unmount']
 
 [tests/functional/cli_root/zfs_upgrade]
 tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_006_neg',
     'zfs_upgrade_007_neg']
 tags = ['functional', 'cli_root', 'zfs_upgrade']
 
 [tests/functional/cli_root/zfs_wait]
 tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt']
 tags = ['functional', 'cli_root', 'zfs_wait']
 
 [tests/functional/cli_root/zpool]
 tests = ['zpool_001_neg', 'zpool_003_pos', 'zpool_colors']
 tags = ['functional', 'cli_root', 'zpool']
 
 [tests/functional/cli_root/zpool_add]
 tests = ['zpool_add_002_pos', 'zpool_add_003_pos',
     'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
     'zpool_add_008_neg', 'zpool_add_009_neg']
 tags = ['functional', 'cli_root', 'zpool_add']
 
 [tests/functional/cli_root/zpool_attach]
 tests = ['zpool_attach_001_neg']
 tags = ['functional', 'cli_root', 'zpool_attach']
 
 [tests/functional/cli_root/zpool_clear]
 tests = ['zpool_clear_002_neg']
 tags = ['functional', 'cli_root', 'zpool_clear']
 
 [tests/functional/cli_root/zpool_create]
 tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
     'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_007_neg',
     'zpool_create_008_pos', 'zpool_create_010_neg', 'zpool_create_011_neg',
     'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg',
     'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos',
     'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos',
     'zpool_create_encrypted',
     'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
     'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
     'zpool_create_features_005_pos']
 tags = ['functional', 'cli_root', 'zpool_create']
 
 [tests/functional/cli_root/zpool_destroy]
 tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos',
     'zpool_destroy_003_neg']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_destroy']
 
 [tests/functional/cli_root/zpool_detach]
 tests = ['zpool_detach_001_neg']
 tags = ['functional', 'cli_root', 'zpool_detach']
 
 [tests/functional/cli_root/zpool_events]
 tests = ['zpool_events_clear', 'zpool_events_follow', 'zpool_events_poolname']
 tags = ['functional', 'cli_root', 'zpool_events']
 
 [tests/functional/cli_root/zpool_export]
 tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg']
 tags = ['functional', 'cli_root', 'zpool_export']
 
 [tests/functional/cli_root/zpool_get]
 tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos',
     'zpool_get_004_neg', 'zpool_get_005_pos']
 tags = ['functional', 'cli_root', 'zpool_get']
 
 [tests/functional/cli_root/zpool_history]
 tests = ['zpool_history_001_neg', 'zpool_history_002_pos']
 tags = ['functional', 'cli_root', 'zpool_history']
 
 [tests/functional/cli_root/zpool_import]
 tests = ['zpool_import_003_pos', 'zpool_import_010_pos', 'zpool_import_011_neg',
     'zpool_import_014_pos', 'zpool_import_features_001_pos',
     'zpool_import_all_001_pos', 'zpool_import_encrypted']
 tags = ['functional', 'cli_root', 'zpool_import']
 
 [tests/functional/cli_root/zpool_labelclear]
 tests = ['zpool_labelclear_active', 'zpool_labelclear_exported',
     'zpool_labelclear_removed', 'zpool_labelclear_valid']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_labelclear']
 
 [tests/functional/cli_root/zpool_initialize]
 tests = ['zpool_initialize_online_offline']
 pre =
 tags = ['functional', 'cli_root', 'zpool_initialize']
 
 [tests/functional/cli_root/zpool_offline]
 tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg']
 tags = ['functional', 'cli_root', 'zpool_offline']
 
 [tests/functional/cli_root/zpool_online]
 tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
 tags = ['functional', 'cli_root', 'zpool_online']
 
 [tests/functional/cli_root/zpool_remove]
 tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
     'zpool_remove_003_pos']
 tags = ['functional', 'cli_root', 'zpool_remove']
 
 [tests/functional/cli_root/zpool_replace]
 tests = ['zpool_replace_001_neg']
 tags = ['functional', 'cli_root', 'zpool_replace']
 
 [tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args']
 tags = ['functional', 'cli_root', 'zpool_resilver']
 
 [tests/functional/cli_root/zpool_scrub]
 tests = ['zpool_scrub_001_neg', 'zpool_scrub_003_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
     'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
     'zpool_set_ashift', 'zpool_set_features']
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
 tests = ['zpool_split_cliargs', 'zpool_split_devices',
     'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_indirect']
 tags = ['functional', 'cli_root', 'zpool_split']
 
 [tests/functional/cli_root/zpool_status]
 tests = ['zpool_status_001_pos', 'zpool_status_002_pos']
 tags = ['functional', 'cli_root', 'zpool_status']
 
 [tests/functional/cli_root/zpool_sync]
 tests = ['zpool_sync_002_neg']
 tags = ['functional', 'cli_root', 'zpool_sync']
 
 [tests/functional/cli_root/zpool_trim]
 tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_neg',
     'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline',
     'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split',
     'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos']
 tags = ['functional', 'zpool_trim']
 
 [tests/functional/cli_root/zpool_upgrade]
 tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_003_pos',
     'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg',
     'zpool_upgrade_009_neg']
 tags = ['functional', 'cli_root', 'zpool_upgrade']
 
 [tests/functional/cli_root/zpool_wait]
 tests = ['zpool_wait_no_activity', 'zpool_wait_usage']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_root/zpool_wait/scan]
 tests = ['zpool_wait_scrub_flag']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_user/misc]
 tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg',
     'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg',
     'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg',
     'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg',
     'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg',
     'zfs_snapshot_001_neg', 'zfs_unallow_001_neg',
     'zfs_unmount_001_neg', 'zfs_upgrade_001_neg',
     'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg',
     'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg',
     'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg',
     'zpool_history_001_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg',
     'zpool_remove_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg',
     'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos',
     'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege',
     'zilstat_001_pos']
 user =
 tags = ['functional', 'cli_user', 'misc']
 
 [tests/functional/cli_user/zpool_iostat]
 tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
     'zpool_iostat_003_neg', 'zpool_iostat_004_pos',
     'zpool_iostat_-c_disable',
     'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_iostat']
 
 [tests/functional/cli_user/zpool_list]
 tests = ['zpool_list_001_pos', 'zpool_list_002_neg']
 user =
 tags = ['functional', 'cli_user', 'zpool_list']
 
 [tests/functional/compression]
 tests = ['compress_003_pos','compress_zstd_bswap']
 tags = ['functional', 'compression']
 
 [tests/functional/exec]
 tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
 
 [tests/functional/features/large_dnode]
 tests = ['large_dnode_003_pos', 'large_dnode_004_neg',
     'large_dnode_005_pos', 'large_dnode_007_neg']
 tags = ['functional', 'features', 'large_dnode']
 
+[tests/functional/gang_blocks]
+tests = ['gang_blocks_001_pos']
+tags = ['functional', 'gang_blocks']
+
 [tests/functional/grow]
 pre =
 post =
 tests = ['grow_pool_001_pos', 'grow_replicas_001_pos']
 tags = ['functional', 'grow']
 
 [tests/functional/history]
 tests = ['history_004_pos', 'history_005_neg', 'history_007_pos',
     'history_009_pos']
 tags = ['functional', 'history']
 
 [tests/functional/hkdf]
 pre =
 post =
 tests = ['hkdf_test']
 tags = ['functional', 'hkdf']
 
 [tests/functional/inuse]
 tests = ['inuse_004_pos', 'inuse_005_pos']
 post =
 tags = ['functional', 'inuse']
 
 [tests/functional/large_files]
 tests = ['large_files_001_pos', 'large_files_002_pos']
 tags = ['functional', 'large_files']
 
 [tests/functional/libzfs]
 tests = ['many_fds', 'libzfs_input']
 tags = ['functional', 'libzfs']
 
 [tests/functional/limits]
 tests = ['filesystem_count', 'snapshot_count']
 tags = ['functional', 'limits']
 
 [tests/functional/link_count]
 tests = ['link_count_root_inode']
 tags = ['functional', 'link_count']
 
 [tests/functional/log_spacemap]
 tests = ['log_spacemap_import_logs']
 pre =
 post =
 tags = ['functional', 'log_spacemap']
 
 [tests/functional/migration]
 tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
     'migration_004_pos', 'migration_005_pos', 'migration_006_pos',
     'migration_007_pos', 'migration_008_pos', 'migration_009_pos',
     'migration_010_pos', 'migration_011_pos', 'migration_012_pos']
 tags = ['functional', 'migration']
 
 [tests/functional/mmap]
 tests = ['mmap_read_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/nestedfs]
 tests = ['nestedfs_001_pos']
 tags = ['functional', 'nestedfs']
 
 [tests/functional/nopwrite]
 tests = ['nopwrite_sync', 'nopwrite_volume']
 tags = ['functional', 'nopwrite']
 
 [tests/functional/pool_checkpoint]
 tests = ['checkpoint_conf_change', 'checkpoint_discard_many',
     'checkpoint_removal', 'checkpoint_sm_scale', 'checkpoint_twice']
 tags = ['functional', 'pool_checkpoint']
 timeout = 1800
 
 [tests/functional/poolversion]
 tests = ['poolversion_001_pos', 'poolversion_002_pos']
 tags = ['functional', 'poolversion']
 
 [tests/functional/redacted_send]
 tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted',
     'redacted_disabled_feature', 'redacted_incrementals',
     'redacted_largeblocks', 'redacted_mixed_recsize', 'redacted_negative',
     'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size']
 tags = ['functional', 'redacted_send']
 
 [tests/functional/raidz]
 tests = ['raidz_001_neg']
 tags = ['functional', 'raidz']
 
 [tests/functional/refquota]
 tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
     'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg',
     'refquota_007_neg']
 tags = ['functional', 'refquota']
 
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
     'refreserv_005_pos', 'refreserv_multi_raidz']
 tags = ['functional', 'refreserv']
 
 [tests/functional/removal]
 pre =
 tests = ['removal_all_vdev', 'removal_sanity', 'removal_with_dedup',
     'removal_with_ganging', 'removal_with_faulted']
 tags = ['functional', 'removal']
 
 [tests/functional/replacement]
 tests = ['rebuild_raidz']
 tags = ['functional', 'replacement']
 
 [tests/functional/reservation]
 tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
     'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos',
     'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos',
     'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos',
     'reservation_014_pos', 'reservation_015_pos',
     'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos',
     'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg',
     'reservation_022_pos']
 tags = ['functional', 'reservation']
 
 [tests/functional/rsend]
 tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
     'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos',
     'rsend_006_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos',
     'rsend_014_pos', 'rsend_016_neg', 'send-c_verify_contents',
     'send-c_volume', 'send-c_zstreamdump', 'send-c_recv_dedup',
     'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props',
     'send_encrypted_freeobjects',
     'send_encrypted_truncated_files', 'send_freeobjects', 'send_holds',
     'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset',
     'send_invalid']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
 tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos']
 tags = ['functional', 'scrub_mirror']
 
 [tests/functional/slog]
 tests = ['slog_008_neg', 'slog_009_neg', 'slog_010_neg']
 tags = ['functional', 'slog']
 
 [tests/functional/snapshot]
 tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
     'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos',
     'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos',
     'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos',
     'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos',
     'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos',
     'snapshot_018_pos']
 tags = ['functional', 'snapshot']
 
 [tests/functional/snapused]
 tests = ['snapused_002_pos', 'snapused_004_pos', 'snapused_005_pos']
 tags = ['functional', 'snapused']
 
 [tests/functional/sparse]
 tests = ['sparse_001_pos']
 tags = ['functional', 'sparse']
 
 [tests/functional/suid]
 tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid',
     'suid_write_to_none']
 tags = ['functional', 'suid']
 
 [tests/functional/append]
 tests = ['threadsappend_001_pos']
 tags = ['functional', 'threadsappend']
 
 [tests/functional/truncate]
 tests = ['truncate_001_pos', 'truncate_002_pos']
 tags = ['functional', 'truncate']
 
 [tests/functional/upgrade]
 tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool']
 tags = ['functional', 'upgrade']
 
 [tests/functional/vdev_disk:Linux]
 pre =
 post =
 tests = ['page_alignment']
 tags = ['functional', 'vdev_disk']
 
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos',
     'vdev_zaps_005_pos', 'vdev_zaps_006_pos']
 tags = ['functional', 'vdev_zaps']
 
 [tests/functional/xattr]
 tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos',
     'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg',
     'xattr_011_pos', 'xattr_013_pos', 'xattr_compat']
 tags = ['functional', 'xattr']
 
 [tests/functional/zvol/zvol_ENOSPC]
 tests = ['zvol_ENOSPC_001_pos']
 tags = ['functional', 'zvol', 'zvol_ENOSPC']
 
 [tests/functional/zvol/zvol_cli]
 tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg']
 tags = ['functional', 'zvol', 'zvol_cli']
 
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos']
 tags = ['functional', 'zvol', 'zvol_swap']
 
 [tests/functional/zpool_influxdb]
 tests = ['zpool_influxdb']
 tags = ['functional', 'zpool_influxdb']
 
 [tests/functional/pyzfs]
 tests = ['pyzfs_unittest']
 pre =
 post =
 tags = ['functional', 'pyzfs']
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index db1ef0d03aaf..4c102b3aa1b8 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1,2225 +1,2226 @@
 CLEANFILES =
 dist_noinst_DATA =
 include $(top_srcdir)/config/Substfiles.am
 
 
 datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests
 nobase_dist_datadir_zfs_tests_tests_DATA = \
 	perf/nfs-sample.cfg \
 	perf/perf.shlib \
 	\
 	perf/fio/mkfiles.fio \
 	perf/fio/random_reads.fio \
 	perf/fio/random_readwrite.fio \
 	perf/fio/random_readwrite_fixed.fio \
 	perf/fio/random_writes.fio \
 	perf/fio/sequential_reads.fio \
 	perf/fio/sequential_readwrite.fio \
 	perf/fio/sequential_writes.fio
 
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \
 	perf/regression/random_reads.ksh \
 	perf/regression/random_readwrite.ksh \
 	perf/regression/random_readwrite_fixed.ksh \
 	perf/regression/random_writes.ksh \
 	perf/regression/random_writes_zil.ksh \
 	perf/regression/sequential_reads_arc_cached_clone.ksh \
 	perf/regression/sequential_reads_arc_cached.ksh \
 	perf/regression/sequential_reads_dbuf_cached.ksh \
 	perf/regression/sequential_reads.ksh \
 	perf/regression/sequential_writes.ksh \
 	perf/regression/setup.ksh \
 	\
 	perf/scripts/prefetch_io.sh
 
 # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source:
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable   -name '*.in'                                              | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'   -executable   -name '*.in'                                              | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'               ! -name '*.in' ! -name '*.c'  | grep  -Fe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'   -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #
 # simd and tmpfile are Linux-only and not installed elsewhere
 #
 # C programs are specced in ../Makefile.am above as part of the main Makefile
 
 find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'
 regen:
 	@$(MAKE) -C $(top_builddir) clean
 	@$(MAKE) clean
 	$(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am
 	$(find_common) ! -executable   -name '*.in'                                              | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am
 	$(find_common)   -executable   -name '*.in'                                              | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo >> Makefile.am
 	echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am
 	echo >> Makefile.am
 	echo 'if BUILD_LINUX' >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am
 	$(find_common)               ! -name '*.in' ! -name '*.c'  | grep  -Fe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo 'endif' >> Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am
 	$(find_common) ! -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am
 	$(find_common)   -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 
 # -- >8 --
 
 nobase_nodist_datadir_zfs_tests_tests_DATA = \
 	functional/pam/utilities.kshlib
 nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \
 	functional/pyzfs/pyzfs_unittest.ksh
 
 SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)
 
 if BUILD_LINUX
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/simd/simd_supported.ksh \
 	functional/tmpfile/cleanup.ksh \
 	functional/tmpfile/setup.ksh \
 	functional/luks/luks_sanity.ksh
 endif
 
 nobase_dist_datadir_zfs_tests_tests_DATA += \
 	functional/acl/acl.cfg \
 	functional/acl/acl_common.kshlib \
 	functional/alloc_class/alloc_class.cfg \
 	functional/alloc_class/alloc_class.kshlib \
 	functional/atime/atime.cfg \
 	functional/atime/atime_common.kshlib \
 	functional/bclone/bclone.cfg \
 	functional/bclone/bclone_common.kshlib \
 	functional/bclone/bclone_corner_cases.kshlib \
 	functional/block_cloning/block_cloning.kshlib \
 	functional/cache/cache.cfg \
 	functional/cache/cache.kshlib \
 	functional/cachefile/cachefile.cfg \
 	functional/cachefile/cachefile.kshlib \
 	functional/casenorm/casenorm.cfg \
 	functional/casenorm/casenorm.kshlib \
 	functional/channel_program/channel_common.kshlib \
 	functional/channel_program/lua_core/tst.args_to_lua.out \
 	functional/channel_program/lua_core/tst.args_to_lua.zcp \
 	functional/channel_program/lua_core/tst.divide_by_zero.err \
 	functional/channel_program/lua_core/tst.divide_by_zero.zcp \
 	functional/channel_program/lua_core/tst.exists.zcp \
 	functional/channel_program/lua_core/tst.large_prog.out \
 	functional/channel_program/lua_core/tst.large_prog.zcp \
 	functional/channel_program/lua_core/tst.lib_base.lua \
 	functional/channel_program/lua_core/tst.lib_coroutine.lua \
 	functional/channel_program/lua_core/tst.lib_strings.lua \
 	functional/channel_program/lua_core/tst.lib_table.lua \
 	functional/channel_program/lua_core/tst.nested_neg.zcp \
 	functional/channel_program/lua_core/tst.nested_pos.zcp \
 	functional/channel_program/lua_core/tst.recursive.zcp \
 	functional/channel_program/lua_core/tst.return_large.zcp \
 	functional/channel_program/lua_core/tst.return_recursive_table.zcp \
 	functional/channel_program/lua_core/tst.stack_gsub.err \
 	functional/channel_program/lua_core/tst.stack_gsub.zcp \
 	functional/channel_program/lua_core/tst.timeout.zcp \
 	functional/channel_program/synctask_core/tst.bookmark.copy.zcp \
 	functional/channel_program/synctask_core/tst.bookmark.create.zcp \
 	functional/channel_program/synctask_core/tst.get_index_props.out \
 	functional/channel_program/synctask_core/tst.get_index_props.zcp \
 	functional/channel_program/synctask_core/tst.get_number_props.out \
 	functional/channel_program/synctask_core/tst.get_number_props.zcp \
 	functional/channel_program/synctask_core/tst.get_string_props.out \
 	functional/channel_program/synctask_core/tst.get_string_props.zcp \
 	functional/channel_program/synctask_core/tst.promote_conflict.zcp \
 	functional/channel_program/synctask_core/tst.set_props.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_neg.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_rename.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_simple.zcp \
 	functional/checksum/default.cfg \
 	functional/clean_mirror/clean_mirror_common.kshlib \
 	functional/clean_mirror/default.cfg \
 	functional/crypto/aes_ccm_test.json \
 	functional/crypto/aes_ccm_test.txt \
 	functional/crypto/aes_gcm_test.json \
 	functional/crypto/aes_gcm_test.txt \
 	functional/cli_root/cli_common.kshlib \
 	functional/cli_root/zfs_copies/zfs_copies.cfg \
 	functional/cli_root/zfs_copies/zfs_copies.kshlib \
 	functional/cli_root/zfs_create/properties.kshlib \
 	functional/cli_root/zfs_create/zfs_create.cfg \
 	functional/cli_root/zfs_create/zfs_create_common.kshlib \
 	functional/cli_root/zfs_destroy/zfs_destroy.cfg \
 	functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \
 	functional/cli_root/zfs_get/zfs_get_common.kshlib \
 	functional/cli_root/zfs_get/zfs_get_list_d.kshlib \
 	functional/cli_root/zfs_jail/jail.conf \
 	functional/cli_root/zfs_load-key/HEXKEY \
 	functional/cli_root/zfs_load-key/PASSPHRASE \
 	functional/cli_root/zfs_load-key/RAWKEY \
 	functional/cli_root/zfs_load-key/zfs_load-key.cfg \
 	functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \
 	functional/cli_root/zfs_mount/zfs_mount.cfg \
 	functional/cli_root/zfs_mount/zfs_mount.kshlib \
 	functional/cli_root/zfs_promote/zfs_promote.cfg \
 	functional/cli_root/zfs_receive/zstd_test_data.txt \
 	functional/cli_root/zfs_rename/zfs_rename.cfg \
 	functional/cli_root/zfs_rename/zfs_rename.kshlib \
 	functional/cli_root/zfs_rollback/zfs_rollback.cfg \
 	functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \
 	functional/cli_root/zfs_send/zfs_send.cfg \
 	functional/cli_root/zfs_set/zfs_set_common.kshlib \
 	functional/cli_root/zfs_share/zfs_share.cfg \
 	functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \
 	functional/cli_root/zfs_unmount/zfs_unmount.cfg \
 	functional/cli_root/zfs_unmount/zfs_unmount.kshlib \
 	functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \
 	functional/cli_root/zfs_wait/zfs_wait.kshlib \
 	functional/cli_root/zpool_add/zpool_add.cfg \
 	functional/cli_root/zpool_add/zpool_add.kshlib \
 	functional/cli_root/zpool_clear/zpool_clear.cfg \
 	functional/cli_root/zpool_create/draidcfg.gz \
 	functional/cli_root/zpool_create/zpool_create.cfg \
 	functional/cli_root/zpool_create/zpool_create.shlib \
 	functional/cli_root/zpool_destroy/zpool_destroy.cfg \
 	functional/cli_root/zpool_events/zpool_events.cfg \
 	functional/cli_root/zpool_events/zpool_events.kshlib \
 	functional/cli_root/zpool_expand/zpool_expand.cfg \
 	functional/cli_root/zpool_export/zpool_export.cfg \
 	functional/cli_root/zpool_export/zpool_export.kshlib \
 	functional/cli_root/zpool_get/vdev_get.cfg \
 	functional/cli_root/zpool_get/zpool_get.cfg \
 	functional/cli_root/zpool_get/zpool_get_parsable.cfg \
 	functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \
 	functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \
 	functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \
 	functional/cli_root/zpool_import/zpool_import.cfg \
 	functional/cli_root/zpool_import/zpool_import.kshlib \
 	functional/cli_root/zpool_initialize/zpool_initialize.kshlib \
 	functional/cli_root/zpool_labelclear/labelclear.cfg \
 	functional/cli_root/zpool_remove/zpool_remove.cfg \
 	functional/cli_root/zpool_reopen/zpool_reopen.cfg \
 	functional/cli_root/zpool_reopen/zpool_reopen.shlib \
 	functional/cli_root/zpool_resilver/zpool_resilver.cfg \
 	functional/cli_root/zpool_scrub/zpool_scrub.cfg \
 	functional/cli_root/zpool_split/zpool_split.cfg \
 	functional/cli_root/zpool_trim/zpool_trim.kshlib \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \
 	functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \
 	functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \
 	functional/cli_root/zpool_wait/zpool_wait.kshlib \
 	functional/cli_root/zhack/library.kshlib \
 	functional/cli_user/misc/misc.cfg \
 	functional/cli_user/zfs_list/zfs_list.cfg \
 	functional/cli_user/zfs_list/zfs_list.kshlib \
 	functional/compression/compress.cfg \
 	functional/compression/testpool_zstd.tar.gz \
 	functional/deadman/deadman.cfg \
 	functional/delegate/delegate.cfg \
 	functional/delegate/delegate_common.kshlib \
 	functional/devices/devices.cfg \
 	functional/devices/devices_common.kshlib \
 	functional/direct/dio.cfg \
 	functional/direct/dio.kshlib \
 	functional/events/events.cfg \
 	functional/events/events_common.kshlib \
 	functional/fault/fault.cfg \
 	functional/gang_blocks/gang_blocks.kshlib \
 	functional/grow/grow.cfg \
 	functional/history/history.cfg \
 	functional/history/history_common.kshlib \
 	functional/history/i386.migratedpool.DAT.Z \
 	functional/history/i386.orig_history.txt \
 	functional/history/sparc.migratedpool.DAT.Z \
 	functional/history/sparc.orig_history.txt \
 	functional/history/zfs-pool-v4.dat.Z \
 	functional/inheritance/config001.cfg \
 	functional/inheritance/config002.cfg \
 	functional/inheritance/config003.cfg \
 	functional/inheritance/config004.cfg \
 	functional/inheritance/config005.cfg \
 	functional/inheritance/config006.cfg \
 	functional/inheritance/config007.cfg \
 	functional/inheritance/config008.cfg \
 	functional/inheritance/config009.cfg \
 	functional/inheritance/config010.cfg \
 	functional/inheritance/config011.cfg \
 	functional/inheritance/config012.cfg \
 	functional/inheritance/config013.cfg \
 	functional/inheritance/config014.cfg \
 	functional/inheritance/config015.cfg \
 	functional/inheritance/config016.cfg \
 	functional/inheritance/config017.cfg \
 	functional/inheritance/config018.cfg \
 	functional/inheritance/config019.cfg \
 	functional/inheritance/config020.cfg \
 	functional/inheritance/config021.cfg \
 	functional/inheritance/config022.cfg \
 	functional/inheritance/config023.cfg \
 	functional/inheritance/config024.cfg \
 	functional/inheritance/inherit.kshlib \
 	functional/inheritance/README.config \
 	functional/inheritance/README.state \
 	functional/inheritance/state001.cfg \
 	functional/inheritance/state002.cfg \
 	functional/inheritance/state003.cfg \
 	functional/inheritance/state004.cfg \
 	functional/inheritance/state005.cfg \
 	functional/inheritance/state006.cfg \
 	functional/inheritance/state007.cfg \
 	functional/inheritance/state008.cfg \
 	functional/inheritance/state009.cfg \
 	functional/inheritance/state010.cfg \
 	functional/inheritance/state011.cfg \
 	functional/inheritance/state012.cfg \
 	functional/inheritance/state013.cfg \
 	functional/inheritance/state014.cfg \
 	functional/inheritance/state015.cfg \
 	functional/inheritance/state016.cfg \
 	functional/inheritance/state017.cfg \
 	functional/inheritance/state018.cfg \
 	functional/inheritance/state019.cfg \
 	functional/inheritance/state020.cfg \
 	functional/inheritance/state021.cfg \
 	functional/inheritance/state022.cfg \
 	functional/inheritance/state023.cfg \
 	functional/inheritance/state024.cfg \
 	functional/inuse/inuse.cfg \
 	functional/io/io.cfg \
 	functional/l2arc/l2arc.cfg \
 	functional/largest_pool/largest_pool.cfg \
 	functional/migration/migration.cfg \
 	functional/migration/migration.kshlib \
 	functional/mmap/mmap.cfg \
 	functional/mmp/mmp.cfg \
 	functional/mmp/mmp.kshlib \
 	functional/mv_files/mv_files.cfg \
 	functional/mv_files/mv_files_common.kshlib \
 	functional/nopwrite/nopwrite.shlib \
 	functional/no_space/enospc.cfg \
 	functional/online_offline/online_offline.cfg \
 	functional/pool_checkpoint/pool_checkpoint.kshlib \
 	functional/projectquota/projectquota.cfg \
 	functional/projectquota/projectquota_common.kshlib \
 	functional/quota/quota.cfg \
 	functional/quota/quota.kshlib \
 	functional/redacted_send/redacted.cfg \
 	functional/redacted_send/redacted.kshlib \
 	functional/redundancy/redundancy.cfg \
 	functional/redundancy/redundancy.kshlib \
 	functional/refreserv/refreserv.cfg \
 	functional/removal/removal.kshlib \
 	functional/replacement/replacement.cfg \
 	functional/reservation/reservation.cfg \
 	functional/reservation/reservation.shlib \
 	functional/rsend/dedup_encrypted_zvol.bz2 \
 	functional/rsend/dedup_encrypted_zvol.zsend.bz2 \
 	functional/rsend/dedup.zsend.bz2 \
 	functional/rsend/fs.tar.gz \
 	functional/rsend/rsend.cfg \
 	functional/rsend/rsend.kshlib \
 	functional/scrub_mirror/default.cfg \
 	functional/scrub_mirror/scrub_mirror_common.kshlib \
 	functional/slog/slog.cfg \
 	functional/slog/slog.kshlib \
 	functional/snapshot/snapshot.cfg \
 	functional/snapused/snapused.kshlib \
 	functional/sparse/sparse.cfg \
 	functional/trim/trim.cfg \
 	functional/trim/trim.kshlib \
 	functional/truncate/truncate.cfg \
 	functional/upgrade/upgrade_common.kshlib \
 	functional/user_namespace/user_namespace.cfg \
 	functional/user_namespace/user_namespace_common.kshlib \
 	functional/userquota/13709_reproducer.bz2 \
 	functional/userquota/userquota.cfg \
 	functional/userquota/userquota_common.kshlib \
 	functional/vdev_zaps/vdev_zaps.kshlib \
 	functional/xattr/xattr.cfg \
 	functional/xattr/xattr_common.kshlib \
 	functional/zvol/zvol.cfg \
 	functional/zvol/zvol_cli/zvol_cli.cfg \
 	functional/zvol/zvol_common.shlib \
 	functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \
 	functional/zvol/zvol_misc/zvol_misc_common.kshlib \
 	functional/zvol/zvol_swap/zvol_swap.cfg \
 	functional/idmap_mount/idmap_mount.cfg \
 	functional/idmap_mount/idmap_mount_common.kshlib
 
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/acl/off/cleanup.ksh \
 	functional/acl/off/dosmode.ksh \
 	functional/acl/off/posixmode.ksh \
 	functional/acl/off/setup.ksh \
 	functional/acl/posix/cleanup.ksh \
 	functional/acl/posix/posix_001_pos.ksh \
 	functional/acl/posix/posix_002_pos.ksh \
 	functional/acl/posix/posix_003_pos.ksh \
 	functional/acl/posix/posix_004_pos.ksh \
 	functional/acl/posix-sa/cleanup.ksh \
 	functional/acl/posix-sa/posix_001_pos.ksh \
 	functional/acl/posix-sa/posix_002_pos.ksh \
 	functional/acl/posix-sa/posix_003_pos.ksh \
 	functional/acl/posix-sa/posix_004_pos.ksh \
 	functional/acl/posix-sa/setup.ksh \
 	functional/acl/posix/setup.ksh \
 	functional/alloc_class/alloc_class_001_pos.ksh \
 	functional/alloc_class/alloc_class_002_neg.ksh \
 	functional/alloc_class/alloc_class_003_pos.ksh \
 	functional/alloc_class/alloc_class_004_pos.ksh \
 	functional/alloc_class/alloc_class_005_pos.ksh \
 	functional/alloc_class/alloc_class_006_pos.ksh \
 	functional/alloc_class/alloc_class_007_pos.ksh \
 	functional/alloc_class/alloc_class_008_pos.ksh \
 	functional/alloc_class/alloc_class_009_pos.ksh \
 	functional/alloc_class/alloc_class_010_pos.ksh \
 	functional/alloc_class/alloc_class_011_neg.ksh \
 	functional/alloc_class/alloc_class_012_pos.ksh \
 	functional/alloc_class/alloc_class_013_pos.ksh \
 	functional/alloc_class/alloc_class_014_neg.ksh \
 	functional/alloc_class/alloc_class_015_pos.ksh \
 	functional/alloc_class/cleanup.ksh \
 	functional/alloc_class/setup.ksh \
 	functional/append/file_append.ksh \
 	functional/append/threadsappend_001_pos.ksh \
 	functional/append/cleanup.ksh \
 	functional/append/setup.ksh \
 	functional/arc/arcstats_runtime_tuning.ksh \
 	functional/arc/cleanup.ksh \
 	functional/arc/dbufstats_001_pos.ksh \
 	functional/arc/dbufstats_002_pos.ksh \
 	functional/arc/dbufstats_003_pos.ksh \
 	functional/arc/setup.ksh \
 	functional/atime/atime_001_pos.ksh \
 	functional/atime/atime_002_neg.ksh \
 	functional/atime/atime_003_pos.ksh \
 	functional/atime/cleanup.ksh \
 	functional/atime/root_atime_off.ksh \
 	functional/atime/root_atime_on.ksh \
 	functional/atime/root_relatime_on.ksh \
 	functional/atime/setup.ksh \
 	functional/bclone/bclone_crossfs_corner_cases.ksh \
 	functional/bclone/bclone_crossfs_corner_cases_limited.ksh \
 	functional/bclone/bclone_crossfs_data.ksh \
 	functional/bclone/bclone_crossfs_embedded.ksh \
 	functional/bclone/bclone_crossfs_hole.ksh \
 	functional/bclone/bclone_diffprops_all.ksh \
 	functional/bclone/bclone_diffprops_checksum.ksh \
 	functional/bclone/bclone_diffprops_compress.ksh \
 	functional/bclone/bclone_diffprops_copies.ksh \
 	functional/bclone/bclone_diffprops_recordsize.ksh \
 	functional/bclone/bclone_prop_sync.ksh \
 	functional/bclone/bclone_samefs_corner_cases.ksh \
 	functional/bclone/bclone_samefs_corner_cases_limited.ksh \
 	functional/bclone/bclone_samefs_data.ksh \
 	functional/bclone/bclone_samefs_embedded.ksh \
 	functional/bclone/bclone_samefs_hole.ksh \
 	functional/bclone/cleanup.ksh \
 	functional/bclone/setup.ksh \
 	functional/block_cloning/cleanup.ksh \
 	functional/block_cloning/setup.ksh \
 	functional/block_cloning/block_cloning_clone_mmap_cached.ksh \
 	functional/block_cloning/block_cloning_clone_mmap_write.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
 	functional/block_cloning/block_cloning_copyfilerange.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_partial.ksh \
 	functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \
 	functional/block_cloning/block_cloning_disabled_ficlone.ksh \
 	functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \
 	functional/block_cloning/block_cloning_ficlone.ksh \
 	functional/block_cloning/block_cloning_ficlonerange.ksh \
 	functional/block_cloning/block_cloning_ficlonerange_partial.ksh \
 	functional/block_cloning/block_cloning_cross_enc_dataset.ksh \
 	functional/block_cloning/block_cloning_replay.ksh \
 	functional/block_cloning/block_cloning_replay_encrypted.ksh \
 	functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \
 	functional/block_cloning/block_cloning_rlimit_fsize.ksh \
 	functional/block_cloning/block_cloning_large_offset.ksh \
 	functional/bootfs/bootfs_001_pos.ksh \
 	functional/bootfs/bootfs_002_neg.ksh \
 	functional/bootfs/bootfs_003_pos.ksh \
 	functional/bootfs/bootfs_004_neg.ksh \
 	functional/bootfs/bootfs_005_neg.ksh \
 	functional/bootfs/bootfs_006_pos.ksh \
 	functional/bootfs/bootfs_007_pos.ksh \
 	functional/bootfs/bootfs_008_pos.ksh \
 	functional/bootfs/cleanup.ksh \
 	functional/bootfs/setup.ksh \
 	functional/btree/btree_negative.ksh \
 	functional/btree/btree_positive.ksh \
 	functional/cache/cache_001_pos.ksh \
 	functional/cache/cache_002_pos.ksh \
 	functional/cache/cache_003_pos.ksh \
 	functional/cache/cache_004_neg.ksh \
 	functional/cache/cache_005_neg.ksh \
 	functional/cache/cache_006_pos.ksh \
 	functional/cache/cache_007_neg.ksh \
 	functional/cache/cache_008_neg.ksh \
 	functional/cache/cache_009_pos.ksh \
 	functional/cache/cache_010_pos.ksh \
 	functional/cache/cache_011_pos.ksh \
 	functional/cache/cache_012_pos.ksh \
 	functional/cache/cleanup.ksh \
 	functional/cachefile/cachefile_001_pos.ksh \
 	functional/cachefile/cachefile_002_pos.ksh \
 	functional/cachefile/cachefile_003_pos.ksh \
 	functional/cachefile/cachefile_004_pos.ksh \
 	functional/cachefile/cleanup.ksh \
 	functional/cachefile/setup.ksh \
 	functional/cache/setup.ksh \
 	functional/casenorm/case_all_values.ksh \
 	functional/casenorm/cleanup.ksh \
 	functional/casenorm/insensitive_formd_delete.ksh \
 	functional/casenorm/insensitive_formd_lookup.ksh \
 	functional/casenorm/insensitive_none_delete.ksh \
 	functional/casenorm/insensitive_none_lookup.ksh \
 	functional/casenorm/mixed_create_failure.ksh \
 	functional/casenorm/mixed_formd_delete.ksh \
 	functional/casenorm/mixed_formd_lookup_ci.ksh \
 	functional/casenorm/mixed_formd_lookup.ksh \
 	functional/casenorm/mixed_none_delete.ksh \
 	functional/casenorm/mixed_none_lookup_ci.ksh \
 	functional/casenorm/mixed_none_lookup.ksh \
 	functional/casenorm/norm_all_values.ksh \
 	functional/casenorm/sensitive_formd_delete.ksh \
 	functional/casenorm/sensitive_formd_lookup.ksh \
 	functional/casenorm/sensitive_none_delete.ksh \
 	functional/casenorm/sensitive_none_lookup.ksh \
 	functional/casenorm/setup.ksh \
 	functional/channel_program/lua_core/cleanup.ksh \
 	functional/channel_program/lua_core/setup.ksh \
 	functional/channel_program/lua_core/tst.args_to_lua.ksh \
 	functional/channel_program/lua_core/tst.divide_by_zero.ksh \
 	functional/channel_program/lua_core/tst.exists.ksh \
 	functional/channel_program/lua_core/tst.integer_illegal.ksh \
 	functional/channel_program/lua_core/tst.integer_overflow.ksh \
 	functional/channel_program/lua_core/tst.language_functions_neg.ksh \
 	functional/channel_program/lua_core/tst.language_functions_pos.ksh \
 	functional/channel_program/lua_core/tst.large_prog.ksh \
 	functional/channel_program/lua_core/tst.libraries.ksh \
 	functional/channel_program/lua_core/tst.memory_limit.ksh \
 	functional/channel_program/lua_core/tst.nested_neg.ksh \
 	functional/channel_program/lua_core/tst.nested_pos.ksh \
 	functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \
 	functional/channel_program/lua_core/tst.recursive_neg.ksh \
 	functional/channel_program/lua_core/tst.recursive_pos.ksh \
 	functional/channel_program/lua_core/tst.return_large.ksh \
 	functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \
 	functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \
 	functional/channel_program/lua_core/tst.return_recursive_table.ksh \
 	functional/channel_program/lua_core/tst.stack_gsub.ksh \
 	functional/channel_program/lua_core/tst.timeout.ksh \
 	functional/channel_program/synctask_core/cleanup.ksh \
 	functional/channel_program/synctask_core/setup.ksh \
 	functional/channel_program/synctask_core/tst.bookmark.copy.ksh \
 	functional/channel_program/synctask_core/tst.bookmark.create.ksh \
 	functional/channel_program/synctask_core/tst.destroy_fs.ksh \
 	functional/channel_program/synctask_core/tst.destroy_snap.ksh \
 	functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \
 	functional/channel_program/synctask_core/tst.get_index_props.ksh \
 	functional/channel_program/synctask_core/tst.get_mountpoint.ksh \
 	functional/channel_program/synctask_core/tst.get_neg.ksh \
 	functional/channel_program/synctask_core/tst.get_number_props.ksh \
 	functional/channel_program/synctask_core/tst.get_string_props.ksh \
 	functional/channel_program/synctask_core/tst.get_type.ksh \
 	functional/channel_program/synctask_core/tst.get_userquota.ksh \
 	functional/channel_program/synctask_core/tst.get_written.ksh \
 	functional/channel_program/synctask_core/tst.inherit.ksh \
 	functional/channel_program/synctask_core/tst.list_bookmarks.ksh \
 	functional/channel_program/synctask_core/tst.list_children.ksh \
 	functional/channel_program/synctask_core/tst.list_clones.ksh \
 	functional/channel_program/synctask_core/tst.list_holds.ksh \
 	functional/channel_program/synctask_core/tst.list_snapshots.ksh \
 	functional/channel_program/synctask_core/tst.list_system_props.ksh \
 	functional/channel_program/synctask_core/tst.list_user_props.ksh \
 	functional/channel_program/synctask_core/tst.parse_args_neg.ksh \
 	functional/channel_program/synctask_core/tst.promote_conflict.ksh \
 	functional/channel_program/synctask_core/tst.promote_multiple.ksh \
 	functional/channel_program/synctask_core/tst.promote_simple.ksh \
 	functional/channel_program/synctask_core/tst.rollback_mult.ksh \
 	functional/channel_program/synctask_core/tst.rollback_one.ksh \
 	functional/channel_program/synctask_core/tst.set_props.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_neg.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_rename.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_simple.ksh \
 	functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \
 	functional/chattr/chattr_001_pos.ksh \
 	functional/chattr/chattr_002_neg.ksh \
 	functional/chattr/cleanup.ksh \
 	functional/chattr/setup.ksh \
 	functional/checksum/cleanup.ksh \
 	functional/checksum/filetest_001_pos.ksh \
 	functional/checksum/filetest_002_pos.ksh \
 	functional/checksum/run_blake3_test.ksh \
 	functional/checksum/run_edonr_test.ksh \
 	functional/checksum/run_sha2_test.ksh \
 	functional/checksum/run_skein_test.ksh \
 	functional/checksum/setup.ksh \
 	functional/clean_mirror/clean_mirror_001_pos.ksh \
 	functional/clean_mirror/clean_mirror_002_pos.ksh \
 	functional/clean_mirror/clean_mirror_003_pos.ksh \
 	functional/clean_mirror/clean_mirror_004_pos.ksh \
 	functional/clean_mirror/cleanup.ksh \
 	functional/clean_mirror/setup.ksh \
 	functional/cli_root/json/cleanup.ksh \
 	functional/cli_root/json/setup.ksh \
 	functional/cli_root/json/json_sanity.ksh \
 	functional/cli_root/zinject/zinject_args.ksh \
 	functional/cli_root/zinject/zinject_counts.ksh \
 	functional/cli_root/zinject/zinject_probe.ksh \
 	functional/cli_root/zdb/zdb_002_pos.ksh \
 	functional/cli_root/zdb/zdb_003_pos.ksh \
 	functional/cli_root/zdb/zdb_004_pos.ksh \
 	functional/cli_root/zdb/zdb_005_pos.ksh \
 	functional/cli_root/zdb/zdb_006_pos.ksh \
 	functional/cli_root/zdb/zdb_args_neg.ksh \
 	functional/cli_root/zdb/zdb_args_pos.ksh \
 	functional/cli_root/zdb/zdb_backup.ksh \
 	functional/cli_root/zdb/zdb_block_size_histogram.ksh \
 	functional/cli_root/zdb/zdb_checksum.ksh \
 	functional/cli_root/zdb/zdb_decompress.ksh \
 	functional/cli_root/zdb/zdb_decompress_zstd.ksh \
 	functional/cli_root/zdb/zdb_display_block.ksh \
 	functional/cli_root/zdb/zdb_encrypted.ksh \
 	functional/cli_root/zdb/zdb_label_checksum.ksh \
 	functional/cli_root/zdb/zdb_object_range_neg.ksh \
 	functional/cli_root/zdb/zdb_object_range_pos.ksh \
 	functional/cli_root/zdb/zdb_objset_id.ksh \
 	functional/cli_root/zdb/zdb_recover_2.ksh \
 	functional/cli_root/zdb/zdb_recover.ksh \
 	functional/cli_root/zfs_bookmark/cleanup.ksh \
 	functional/cli_root/zfs_bookmark/setup.ksh \
 	functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \
 	functional/cli_root/zfs_change-key/cleanup.ksh \
 	functional/cli_root/zfs_change-key/setup.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \
 	functional/cli_root/zfs/cleanup.ksh \
 	functional/cli_root/zfs_clone/cleanup.ksh \
 	functional/cli_root/zfs_clone/setup.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \
 	functional/cli_root/zfs_copies/cleanup.ksh \
 	functional/cli_root/zfs_copies/setup.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \
 	functional/cli_root/zfs_create/cleanup.ksh \
 	functional/cli_root/zfs_create/setup.ksh \
 	functional/cli_root/zfs_create/zfs_create_001_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_002_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_003_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_004_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_005_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_006_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_007_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_008_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_009_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_010_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_011_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_012_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_013_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_014_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \
 	functional/cli_root/zfs_create/zfs_create_dryrun.ksh \
 	functional/cli_root/zfs_create/zfs_create_encrypted.ksh \
 	functional/cli_root/zfs_create/zfs_create_nomount.ksh \
 	functional/cli_root/zfs_create/zfs_create_verbose.ksh \
 	functional/cli_root/zfs_destroy/cleanup.ksh \
 	functional/cli_root/zfs_destroy/setup.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \
 	functional/cli_root/zfs_diff/cleanup.ksh \
 	functional/cli_root/zfs_diff/setup.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_changes.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_types.ksh \
 	functional/cli_root/zfs_get/cleanup.ksh \
 	functional/cli_root/zfs_get/setup.ksh \
 	functional/cli_root/zfs_get/zfs_get_001_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_002_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_003_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_004_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_005_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_006_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_007_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_008_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_009_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_010_neg.ksh \
 	functional/cli_root/zfs_ids_to_path/cleanup.ksh \
 	functional/cli_root/zfs_ids_to_path/setup.ksh \
 	functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \
 	functional/cli_root/zfs_inherit/cleanup.ksh \
 	functional/cli_root/zfs_inherit/setup.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \
 	functional/cli_root/zfs_jail/cleanup.ksh \
 	functional/cli_root/zfs_jail/setup.ksh \
 	functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \
 	functional/cli_root/zfs_load-key/cleanup.ksh \
 	functional/cli_root/zfs_load-key/setup.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \
 	functional/cli_root/zfs_mount/cleanup.ksh \
 	functional/cli_root/zfs_mount/setup.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_remount.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \
 	functional/cli_root/zfs_mount/zfs_multi_mount.ksh \
 	functional/cli_root/zfs_program/cleanup.ksh \
 	functional/cli_root/zfs_program/setup.ksh \
 	functional/cli_root/zfs_program/zfs_program_json.ksh \
 	functional/cli_root/zfs_promote/cleanup.ksh \
 	functional/cli_root/zfs_promote/setup.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \
 	functional/cli_root/zfs_property/cleanup.ksh \
 	functional/cli_root/zfs_property/setup.ksh \
 	functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \
 	functional/cli_root/zfs_receive/cleanup.ksh \
 	functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \
 	functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \
 	functional/cli_root/zfs_receive/setup.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_-e.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \
 	functional/cli_root/zfs_rename/cleanup.ksh \
 	functional/cli_root/zfs_rename/setup.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \
 	functional/cli_root/zfs_reservation/cleanup.ksh \
 	functional/cli_root/zfs_reservation/setup.ksh \
 	functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \
 	functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \
 	functional/cli_root/zfs_rollback/cleanup.ksh \
 	functional/cli_root/zfs_rollback/setup.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \
 	functional/cli_root/zfs_send/cleanup.ksh \
 	functional/cli_root/zfs_send/setup.ksh \
 	functional/cli_root/zfs_send/zfs_send_001_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_002_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_003_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_004_neg.ksh \
 	functional/cli_root/zfs_send/zfs_send_005_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_006_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_007_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send-b.ksh \
 	functional/cli_root/zfs_send/zfs_send_encrypted.ksh \
 	functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \
 	functional/cli_root/zfs_send/zfs_send_raw.ksh \
 	functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \
 	functional/cli_root/zfs_send/zfs_send_sparse.ksh \
 	functional/cli_root/zfs_set/cache_001_pos.ksh \
 	functional/cli_root/zfs_set/cache_002_neg.ksh \
 	functional/cli_root/zfs_set/canmount_001_pos.ksh \
 	functional/cli_root/zfs_set/canmount_002_pos.ksh \
 	functional/cli_root/zfs_set/canmount_003_pos.ksh \
 	functional/cli_root/zfs_set/canmount_004_pos.ksh \
 	functional/cli_root/zfs_set/checksum_001_pos.ksh \
 	functional/cli_root/zfs_set/cleanup.ksh \
 	functional/cli_root/zfs_set/compression_001_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_001_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_002_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_003_pos.ksh \
 	functional/cli_root/zfs_set/onoffs_001_pos.ksh \
 	functional/cli_root/zfs_set/property_alias_001_pos.ksh \
 	functional/cli_root/zfs_set/readonly_001_pos.ksh \
 	functional/cli_root/zfs_set/reservation_001_neg.ksh \
 	functional/cli_root/zfs_set/ro_props_001_pos.ksh \
 	functional/cli_root/zfs_set/setup.ksh \
 	functional/cli_root/zfs_set/share_mount_001_neg.ksh \
 	functional/cli_root/zfs_set/snapdir_001_pos.ksh \
 	functional/cli_root/zfs/setup.ksh \
 	functional/cli_root/zfs_set/user_property_001_pos.ksh \
 	functional/cli_root/zfs_set/user_property_002_pos.ksh \
 	functional/cli_root/zfs_set/user_property_003_neg.ksh \
 	functional/cli_root/zfs_set/user_property_004_pos.ksh \
 	functional/cli_root/zfs_set/version_001_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_001_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_002_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_003_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \
 	functional/cli_root/zfs_set/zfs_set_keylocation.ksh \
 	functional/cli_root/zfs_set/zfs_set_nomount.ksh \
 	functional/cli_root/zfs_share/cleanup.ksh \
 	functional/cli_root/zfs_share/setup.ksh \
 	functional/cli_root/zfs_share/zfs_share_001_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_002_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_003_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_004_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_005_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_006_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_007_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_008_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_009_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_010_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_011_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_012_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_013_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \
 	functional/cli_root/zfs_share/zfs_share_after_mount.ksh \
 	functional/cli_root/zfs_snapshot/cleanup.ksh \
 	functional/cli_root/zfs_snapshot/setup.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \
 	functional/cli_root/zfs_sysfs/cleanup.ksh \
 	functional/cli_root/zfs_sysfs/setup.ksh \
 	functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \
 	functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \
 	functional/cli_root/zfs_unload-key/cleanup.ksh \
 	functional/cli_root/zfs_unload-key/setup.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \
 	functional/cli_root/zfs_unmount/cleanup.ksh \
 	functional/cli_root/zfs_unmount/setup.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \
 	functional/cli_root/zfs_unshare/cleanup.ksh \
 	functional/cli_root/zfs_unshare/setup.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \
 	functional/cli_root/zfs_upgrade/cleanup.ksh \
 	functional/cli_root/zfs_upgrade/setup.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \
 	functional/cli_root/zfs_wait/cleanup.ksh \
 	functional/cli_root/zfs_wait/setup.ksh \
 	functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \
 	functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \
 	functional/cli_root/zfs/zfs_001_neg.ksh \
 	functional/cli_root/zfs/zfs_002_pos.ksh \
 	functional/cli_root/zfs/zfs_003_neg.ksh \
 	functional/cli_root/zhack/zhack_label_repair_001.ksh \
 	functional/cli_root/zhack/zhack_label_repair_002.ksh \
 	functional/cli_root/zhack/zhack_label_repair_003.ksh \
 	functional/cli_root/zhack/zhack_label_repair_004.ksh \
 	functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \
 	functional/cli_root/zpool_add/add-o_ashift.ksh \
 	functional/cli_root/zpool_add/add_prop_ashift.ksh \
 	functional/cli_root/zpool_add/cleanup.ksh \
 	functional/cli_root/zpool_add/setup.ksh \
 	functional/cli_root/zpool_add/zpool_add_001_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_002_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_003_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_004_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_005_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_006_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_007_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_008_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_009_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_010_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \
 	functional/cli_root/zpool_attach/attach-o_ashift.ksh \
 	functional/cli_root/zpool_attach/cleanup.ksh \
 	functional/cli_root/zpool_attach/setup.ksh \
 	functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \
 	functional/cli_root/zpool/cleanup.ksh \
 	functional/cli_root/zpool_clear/cleanup.ksh \
 	functional/cli_root/zpool_clear/setup.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \
 	functional/cli_root/zpool_create/cleanup.ksh \
 	functional/cli_root/zpool_create/create-o_ashift.ksh \
 	functional/cli_root/zpool_create/setup.ksh \
 	functional/cli_root/zpool_create/zpool_create_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_004_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_005_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_006_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_007_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_008_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_009_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_010_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_011_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_012_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_014_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_015_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_016_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_017_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_018_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_019_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_020_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_021_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_022_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_023_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_024_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \
 	functional/cli_root/zpool_create/zpool_create_encrypted.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_tempname.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \
 	functional/cli_root/zpool_detach/cleanup.ksh \
 	functional/cli_root/zpool_detach/setup.ksh \
 	functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \
 	functional/cli_root/zpool_events/cleanup.ksh \
 	functional/cli_root/zpool_events/setup.ksh \
 	functional/cli_root/zpool_events/zpool_events_clear.ksh \
 	functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \
 	functional/cli_root/zpool_events/zpool_events_cliargs.ksh \
 	functional/cli_root/zpool_events/zpool_events_duplicates.ksh \
 	functional/cli_root/zpool_events/zpool_events_errors.ksh \
 	functional/cli_root/zpool_events/zpool_events_follow.ksh \
 	functional/cli_root/zpool_events/zpool_events_poolname.ksh \
 	functional/cli_root/zpool_expand/cleanup.ksh \
 	functional/cli_root/zpool_expand/setup.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \
 	functional/cli_root/zpool_export/cleanup.ksh \
 	functional/cli_root/zpool_export/setup.ksh \
 	functional/cli_root/zpool_export/zpool_export_001_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_002_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_003_neg.ksh \
 	functional/cli_root/zpool_export/zpool_export_004_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \
 	functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \
 	functional/cli_root/zpool_get/cleanup.ksh \
 	functional/cli_root/zpool_get/setup.ksh \
 	functional/cli_root/zpool_get/vdev_get_001_pos.ksh \
 	functional/cli_root/zpool_get/vdev_get_all.ksh \
 	functional/cli_root/zpool_get/zpool_get_001_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_002_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_003_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_004_neg.ksh \
 	functional/cli_root/zpool_get/zpool_get_005_pos.ksh \
 	functional/cli_root/zpool_history/cleanup.ksh \
 	functional/cli_root/zpool_history/setup.ksh \
 	functional/cli_root/zpool_history/zpool_history_001_neg.ksh \
 	functional/cli_root/zpool_history/zpool_history_002_pos.ksh \
 	functional/cli_root/zpool_import/cleanup.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_added.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \
 	functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \
 	functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \
 	functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \
 	functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \
 	functional/cli_root/zpool_import/import_devices_missing.ksh \
 	functional/cli_root/zpool_import/import_log_missing.ksh \
 	functional/cli_root/zpool_import/import_paths_changed.ksh \
 	functional/cli_root/zpool_import/import_rewind_config_changed.ksh \
 	functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \
 	functional/cli_root/zpool_import/setup.ksh \
 	functional/cli_root/zpool_import/zpool_import_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_002_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_004_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_005_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_006_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_007_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_008_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_009_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_010_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_011_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_012_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_013_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_014_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_015_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_016_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_017_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_encrypted.ksh \
 	functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \
 	functional/cli_root/zpool_import/zpool_import_errata3.ksh \
 	functional/cli_root/zpool_import/zpool_import_errata4.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_status.ksh \
 	functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \
 	functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \
 	functional/cli_root/zpool_initialize/cleanup.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \
 	functional/cli_root/zpool_offline/cleanup.ksh \
 	functional/cli_root/zpool_offline/setup.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \
 	functional/cli_root/zpool_online/cleanup.ksh \
 	functional/cli_root/zpool_online/setup.ksh \
 	functional/cli_root/zpool_online/zpool_online_001_pos.ksh \
 	functional/cli_root/zpool_online/zpool_online_002_neg.ksh \
 	functional/cli_root/zpool_prefetch/cleanup.ksh \
 	functional/cli_root/zpool_prefetch/setup.ksh \
 	functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \
 	functional/cli_root/zpool_reguid/cleanup.ksh \
 	functional/cli_root/zpool_reguid/setup.ksh \
 	functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh \
 	functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh \
 	functional/cli_root/zpool_remove/cleanup.ksh \
 	functional/cli_root/zpool_remove/setup.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \
 	functional/cli_root/zpool_reopen/cleanup.ksh \
 	functional/cli_root/zpool_reopen/setup.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \
 	functional/cli_root/zpool_replace/cleanup.ksh \
 	functional/cli_root/zpool_replace/replace-o_ashift.ksh \
 	functional/cli_root/zpool_replace/replace_prop_ashift.ksh \
 	functional/cli_root/zpool_replace/setup.ksh \
 	functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \
 	functional/cli_root/zpool_resilver/cleanup.ksh \
 	functional/cli_root/zpool_resilver/setup.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
 	functional/cli_root/zpool_scrub/cleanup.ksh \
 	functional/cli_root/zpool_scrub/setup.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \
 	functional/cli_root/zpool_set/cleanup.ksh \
 	functional/cli_root/zpool_set/setup.ksh \
 	functional/cli_root/zpool/setup.ksh \
 	functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_common.kshlib \
 	functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_003_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_ashift.ksh \
 	functional/cli_root/zpool_set/user_property_001_pos.ksh \
 	functional/cli_root/zpool_set/user_property_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_features.ksh \
 	functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh \
 	functional/cli_root/zpool_split/cleanup.ksh \
 	functional/cli_root/zpool_split/setup.ksh \
 	functional/cli_root/zpool_split/zpool_split_cliargs.ksh \
 	functional/cli_root/zpool_split/zpool_split_devices.ksh \
 	functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \
 	functional/cli_root/zpool_split/zpool_split_encryption.ksh \
 	functional/cli_root/zpool_split/zpool_split_indirect.ksh \
 	functional/cli_root/zpool_split/zpool_split_props.ksh \
 	functional/cli_root/zpool_split/zpool_split_resilver.ksh \
 	functional/cli_root/zpool_split/zpool_split_vdevs.ksh \
 	functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \
 	functional/cli_root/zpool_status/cleanup.ksh \
 	functional/cli_root/zpool_status/setup.ksh \
 	functional/cli_root/zpool_status/zpool_status_001_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_002_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_004_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
 	functional/cli_root/zpool_sync/cleanup.ksh \
 	functional/cli_root/zpool_sync/setup.ksh \
 	functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \
 	functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \
 	functional/cli_root/zpool_trim/cleanup.ksh \
 	functional/cli_root/zpool_trim/setup.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_partial.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_rate.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_secure.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_split.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \
 	functional/cli_root/zpool_upgrade/cleanup.ksh \
 	functional/cli_root/zpool_upgrade/setup.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \
 	functional/cli_root/zpool_wait/cleanup.ksh \
 	functional/cli_root/zpool_wait/scan/cleanup.ksh \
 	functional/cli_root/zpool_wait/scan/setup.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \
 	functional/cli_root/zpool_wait/setup.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_discard.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_remove.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_usage.ksh \
 	functional/cli_root/zpool/zpool_001_neg.ksh \
 	functional/cli_root/zpool/zpool_002_pos.ksh \
 	functional/cli_root/zpool/zpool_003_pos.ksh \
 	functional/cli_root/zpool/zpool_colors.ksh \
 	functional/cli_user/misc/arcstat_001_pos.ksh \
 	functional/cli_user/misc/arc_summary_001_pos.ksh \
 	functional/cli_user/misc/arc_summary_002_neg.ksh \
 	functional/cli_user/misc/zilstat_001_pos.ksh \
 	functional/cli_user/misc/cleanup.ksh \
 	functional/cli_user/misc/setup.ksh \
 	functional/cli_user/misc/zdb_001_neg.ksh \
 	functional/cli_user/misc/zfs_001_neg.ksh \
 	functional/cli_user/misc/zfs_allow_001_neg.ksh \
 	functional/cli_user/misc/zfs_clone_001_neg.ksh \
 	functional/cli_user/misc/zfs_create_001_neg.ksh \
 	functional/cli_user/misc/zfs_destroy_001_neg.ksh \
 	functional/cli_user/misc/zfs_get_001_neg.ksh \
 	functional/cli_user/misc/zfs_inherit_001_neg.ksh \
 	functional/cli_user/misc/zfs_mount_001_neg.ksh \
 	functional/cli_user/misc/zfs_promote_001_neg.ksh \
 	functional/cli_user/misc/zfs_receive_001_neg.ksh \
 	functional/cli_user/misc/zfs_rename_001_neg.ksh \
 	functional/cli_user/misc/zfs_rollback_001_neg.ksh \
 	functional/cli_user/misc/zfs_send_001_neg.ksh \
 	functional/cli_user/misc/zfs_set_001_neg.ksh \
 	functional/cli_user/misc/zfs_share_001_neg.ksh \
 	functional/cli_user/misc/zfs_snapshot_001_neg.ksh \
 	functional/cli_user/misc/zfs_unallow_001_neg.ksh \
 	functional/cli_user/misc/zfs_unmount_001_neg.ksh \
 	functional/cli_user/misc/zfs_unshare_001_neg.ksh \
 	functional/cli_user/misc/zfs_upgrade_001_neg.ksh \
 	functional/cli_user/misc/zpool_001_neg.ksh \
 	functional/cli_user/misc/zpool_add_001_neg.ksh \
 	functional/cli_user/misc/zpool_attach_001_neg.ksh \
 	functional/cli_user/misc/zpool_clear_001_neg.ksh \
 	functional/cli_user/misc/zpool_create_001_neg.ksh \
 	functional/cli_user/misc/zpool_destroy_001_neg.ksh \
 	functional/cli_user/misc/zpool_detach_001_neg.ksh \
 	functional/cli_user/misc/zpool_export_001_neg.ksh \
 	functional/cli_user/misc/zpool_get_001_neg.ksh \
 	functional/cli_user/misc/zpool_history_001_neg.ksh \
 	functional/cli_user/misc/zpool_import_001_neg.ksh \
 	functional/cli_user/misc/zpool_import_002_neg.ksh \
 	functional/cli_user/misc/zpool_offline_001_neg.ksh \
 	functional/cli_user/misc/zpool_online_001_neg.ksh \
 	functional/cli_user/misc/zpool_remove_001_neg.ksh \
 	functional/cli_user/misc/zpool_replace_001_neg.ksh \
 	functional/cli_user/misc/zpool_scrub_001_neg.ksh \
 	functional/cli_user/misc/zpool_set_001_neg.ksh \
 	functional/cli_user/misc/zpool_status_001_neg.ksh \
 	functional/cli_user/misc/zpool_upgrade_001_neg.ksh \
 	functional/cli_user/misc/zpool_wait_privilege.ksh \
 	functional/cli_user/zfs_list/cleanup.ksh \
 	functional/cli_user/zfs_list/setup.ksh \
 	functional/cli_user/zfs_list/zfs_list_001_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_002_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_003_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_004_neg.ksh \
 	functional/cli_user/zfs_list/zfs_list_005_neg.ksh \
 	functional/cli_user/zfs_list/zfs_list_007_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_008_neg.ksh \
 	functional/cli_user/zpool_iostat/cleanup.ksh \
 	functional/cli_user/zpool_iostat/setup.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \
 	functional/cli_user/zpool_list/cleanup.ksh \
 	functional/cli_user/zpool_list/setup.ksh \
 	functional/cli_user/zpool_list/zpool_list_001_pos.ksh \
 	functional/cli_user/zpool_list/zpool_list_002_neg.ksh \
 	functional/cli_user/zpool_status/cleanup.ksh \
 	functional/cli_user/zpool_status/setup.ksh \
 	functional/cli_user/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \
 	functional/compression/cleanup.ksh \
 	functional/compression/compress_001_pos.ksh \
 	functional/compression/compress_002_pos.ksh \
 	functional/compression/compress_003_pos.ksh \
 	functional/compression/compress_004_pos.ksh \
 	functional/compression/compress_zstd_bswap.ksh \
 	functional/compression/l2arc_compressed_arc_disabled.ksh \
 	functional/compression/l2arc_compressed_arc.ksh \
 	functional/compression/l2arc_encrypted.ksh \
 	functional/compression/l2arc_encrypted_no_compressed_arc.ksh \
 	functional/compression/setup.ksh \
 	functional/cp_files/cleanup.ksh \
 	functional/cp_files/cp_files_001_pos.ksh \
 	functional/cp_files/cp_files_002_pos.ksh \
 	functional/cp_files/cp_stress.ksh \
 	functional/cp_files/setup.ksh \
 	functional/crtime/cleanup.ksh \
 	functional/crtime/crtime_001_pos.ksh \
 	functional/crtime/setup.ksh \
 	functional/crypto/icp_aes_ccm.ksh \
 	functional/crypto/icp_aes_gcm.ksh \
 	functional/deadman/deadman_ratelimit.ksh \
 	functional/deadman/deadman_sync.ksh \
 	functional/deadman/deadman_zio.ksh \
 	functional/dedup/cleanup.ksh \
 	functional/dedup/setup.ksh \
 	functional/dedup/dedup_fdt_create.ksh \
 	functional/dedup/dedup_fdt_import.ksh \
 	functional/dedup/dedup_fdt_pacing.ksh \
 	functional/dedup/dedup_legacy_create.ksh \
 	functional/dedup/dedup_legacy_import.ksh \
 	functional/dedup/dedup_legacy_fdt_upgrade.ksh \
 	functional/dedup/dedup_legacy_fdt_mixed.ksh \
 	functional/dedup/dedup_prune.ksh \
 	functional/dedup/dedup_quota.ksh \
 	functional/dedup/dedup_zap_shrink.ksh \
 	functional/delegate/cleanup.ksh \
 	functional/delegate/setup.ksh \
 	functional/delegate/zfs_allow_001_pos.ksh \
 	functional/delegate/zfs_allow_002_pos.ksh \
 	functional/delegate/zfs_allow_003_pos.ksh \
 	functional/delegate/zfs_allow_004_pos.ksh \
 	functional/delegate/zfs_allow_005_pos.ksh \
 	functional/delegate/zfs_allow_006_pos.ksh \
 	functional/delegate/zfs_allow_007_pos.ksh \
 	functional/delegate/zfs_allow_008_pos.ksh \
 	functional/delegate/zfs_allow_009_neg.ksh \
 	functional/delegate/zfs_allow_010_pos.ksh \
 	functional/delegate/zfs_allow_011_neg.ksh \
 	functional/delegate/zfs_allow_012_neg.ksh \
 	functional/delegate/zfs_unallow_001_pos.ksh \
 	functional/delegate/zfs_unallow_002_pos.ksh \
 	functional/delegate/zfs_unallow_003_pos.ksh \
 	functional/delegate/zfs_unallow_004_pos.ksh \
 	functional/delegate/zfs_unallow_005_pos.ksh \
 	functional/delegate/zfs_unallow_006_pos.ksh \
 	functional/delegate/zfs_unallow_007_neg.ksh \
 	functional/delegate/zfs_unallow_008_neg.ksh \
 	functional/devices/cleanup.ksh \
 	functional/devices/devices_001_pos.ksh \
 	functional/devices/devices_002_neg.ksh \
 	functional/devices/devices_003_pos.ksh \
 	functional/devices/setup.ksh \
 	functional/direct/dio_aligned_block.ksh \
 	functional/direct/dio_async_always.ksh \
 	functional/direct/dio_async_fio_ioengines.ksh \
 	functional/direct/dio_compression.ksh \
 	functional/direct/dio_dedup.ksh \
 	functional/direct/dio_encryption.ksh \
 	functional/direct/dio_grow_block.ksh \
 	functional/direct/dio_loopback_dev.ksh \
 	functional/direct/dio_max_recordsize.ksh \
 	functional/direct/dio_mixed.ksh \
 	functional/direct/dio_mmap.ksh \
 	functional/direct/dio_overwrites.ksh \
 	functional/direct/dio_property.ksh \
 	functional/direct/dio_random.ksh \
 	functional/direct/dio_read_verify.ksh \
 	functional/direct/dio_recordsize.ksh \
 	functional/direct/dio_unaligned_block.ksh \
 	functional/direct/dio_unaligned_filesize.ksh \
 	functional/direct/dio_write_verify.ksh \
 	functional/direct/dio_write_stable_pages.ksh \
 	functional/direct/setup.ksh \
 	functional/direct/cleanup.ksh \
 	functional/dos_attributes/cleanup.ksh \
 	functional/dos_attributes/read_dos_attrs_001.ksh \
 	functional/dos_attributes/setup.ksh \
 	functional/dos_attributes/write_dos_attrs_001.ksh \
 	functional/events/cleanup.ksh \
 	functional/events/events_001_pos.ksh \
 	functional/events/events_002_pos.ksh \
 	functional/events/setup.ksh \
 	functional/events/zed_cksum_config.ksh \
 	functional/events/zed_cksum_reported.ksh \
 	functional/events/zed_diagnose_multiple.ksh \
 	functional/events/zed_fd_spill.ksh \
 	functional/events/zed_io_config.ksh \
 	functional/events/zed_rc_filter.ksh \
 	functional/events/zed_slow_io.ksh \
 	functional/events/zed_slow_io_many_vdevs.ksh \
 	functional/exec/cleanup.ksh \
 	functional/exec/exec_001_pos.ksh \
 	functional/exec/exec_002_neg.ksh \
 	functional/exec/setup.ksh \
 	functional/fadvise/cleanup.ksh \
 	functional/fadvise/fadvise_sequential.ksh \
 	functional/fadvise/setup.ksh \
 	functional/fallocate/cleanup.ksh \
 	functional/fallocate/fallocate_prealloc.ksh \
 	functional/fallocate/fallocate_punch-hole.ksh \
 	functional/fallocate/fallocate_zero-range.ksh \
 	functional/fallocate/setup.ksh \
 	functional/fault/auto_offline_001_pos.ksh \
 	functional/fault/auto_online_001_pos.ksh \
 	functional/fault/auto_online_002_pos.ksh \
 	functional/fault/auto_replace_001_pos.ksh \
 	functional/fault/auto_replace_002_pos.ksh \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
 	functional/fault/auto_spare_double.ksh \
 	functional/fault/auto_spare_multiple.ksh \
 	functional/fault/auto_spare_shared.ksh \
 	functional/fault/cleanup.ksh \
 	functional/fault/decompress_fault.ksh \
 	functional/fault/decrypt_fault.ksh \
 	functional/fault/fault_limits.ksh \
 	functional/fault/scrub_after_resilver.ksh \
 	functional/fault/suspend_on_probe_errors.ksh \
 	functional/fault/suspend_resume_single.ksh \
 	functional/fault/setup.ksh \
 	functional/fault/zpool_status_-s.ksh \
 	functional/features/async_destroy/async_destroy_001_pos.ksh \
 	functional/features/async_destroy/cleanup.ksh \
 	functional/features/async_destroy/setup.ksh \
 	functional/features/large_dnode/cleanup.ksh \
 	functional/features/large_dnode/large_dnode_001_pos.ksh \
 	functional/features/large_dnode/large_dnode_002_pos.ksh \
 	functional/features/large_dnode/large_dnode_003_pos.ksh \
 	functional/features/large_dnode/large_dnode_004_neg.ksh \
 	functional/features/large_dnode/large_dnode_005_pos.ksh \
 	functional/features/large_dnode/large_dnode_006_pos.ksh \
 	functional/features/large_dnode/large_dnode_007_neg.ksh \
 	functional/features/large_dnode/large_dnode_008_pos.ksh \
 	functional/features/large_dnode/large_dnode_009_pos.ksh \
 	functional/features/large_dnode/setup.ksh \
 	functional/gang_blocks/cleanup.ksh \
+	functional/gang_blocks/gang_blocks_001_pos.ksh \
 	functional/gang_blocks/gang_blocks_ddt_copies.ksh \
 	functional/gang_blocks/gang_blocks_redundant.ksh \
 	functional/gang_blocks/setup.ksh \
 	functional/grow/grow_pool_001_pos.ksh \
 	functional/grow/grow_replicas_001_pos.ksh \
 	functional/history/cleanup.ksh \
 	functional/history/history_001_pos.ksh \
 	functional/history/history_002_pos.ksh \
 	functional/history/history_003_pos.ksh \
 	functional/history/history_004_pos.ksh \
 	functional/history/history_005_neg.ksh \
 	functional/history/history_006_neg.ksh \
 	functional/history/history_007_pos.ksh \
 	functional/history/history_008_pos.ksh \
 	functional/history/history_009_pos.ksh \
 	functional/history/history_010_pos.ksh \
 	functional/history/setup.ksh \
 	functional/inheritance/cleanup.ksh \
 	functional/inheritance/inherit_001_pos.ksh \
 	functional/inuse/inuse_001_pos.ksh \
 	functional/inuse/inuse_003_pos.ksh \
 	functional/inuse/inuse_004_pos.ksh \
 	functional/inuse/inuse_005_pos.ksh \
 	functional/inuse/inuse_006_pos.ksh \
 	functional/inuse/inuse_007_pos.ksh \
 	functional/inuse/inuse_008_pos.ksh \
 	functional/inuse/inuse_009_pos.ksh \
 	functional/inuse/setup.ksh \
 	functional/io/cleanup.ksh \
 	functional/io/io_uring.ksh \
 	functional/io/libaio.ksh \
 	functional/io/mmap.ksh \
 	functional/io/posixaio.ksh \
 	functional/io/psync.ksh \
 	functional/io/setup.ksh \
 	functional/io/sync.ksh \
 	functional/l2arc/cleanup.ksh \
 	functional/l2arc/l2arc_arcstats_pos.ksh \
 	functional/l2arc/l2arc_l2miss_pos.ksh \
 	functional/l2arc/l2arc_mfuonly_pos.ksh \
 	functional/l2arc/persist_l2arc_001_pos.ksh \
 	functional/l2arc/persist_l2arc_002_pos.ksh \
 	functional/l2arc/persist_l2arc_003_neg.ksh \
 	functional/l2arc/persist_l2arc_004_pos.ksh \
 	functional/l2arc/persist_l2arc_005_pos.ksh \
 	functional/l2arc/setup.ksh \
 	functional/large_files/cleanup.ksh \
 	functional/large_files/large_files_001_pos.ksh \
 	functional/large_files/large_files_002_pos.ksh \
 	functional/large_files/setup.ksh \
 	functional/largest_pool/largest_pool_001_pos.ksh \
 	functional/libzfs/cleanup.ksh \
 	functional/libzfs/libzfs_input.ksh \
 	functional/libzfs/setup.ksh \
 	functional/limits/cleanup.ksh \
 	functional/limits/filesystem_count.ksh \
 	functional/limits/filesystem_limit.ksh \
 	functional/limits/setup.ksh \
 	functional/limits/snapshot_count.ksh \
 	functional/limits/snapshot_limit.ksh \
 	functional/link_count/cleanup.ksh \
 	functional/link_count/link_count_001.ksh \
 	functional/link_count/link_count_root_inode.ksh \
 	functional/link_count/setup.ksh \
 	functional/longname/cleanup.ksh \
 	functional/longname/longname_001_pos.ksh \
 	functional/longname/longname_002_pos.ksh \
 	functional/longname/longname_003_pos.ksh \
 	functional/longname/setup.ksh \
 	functional/log_spacemap/log_spacemap_import_logs.ksh \
 	functional/migration/cleanup.ksh \
 	functional/migration/migration_001_pos.ksh \
 	functional/migration/migration_002_pos.ksh \
 	functional/migration/migration_003_pos.ksh \
 	functional/migration/migration_004_pos.ksh \
 	functional/migration/migration_005_pos.ksh \
 	functional/migration/migration_006_pos.ksh \
 	functional/migration/migration_007_pos.ksh \
 	functional/migration/migration_008_pos.ksh \
 	functional/migration/migration_009_pos.ksh \
 	functional/migration/migration_010_pos.ksh \
 	functional/migration/migration_011_pos.ksh \
 	functional/migration/migration_012_pos.ksh \
 	functional/migration/setup.ksh \
 	functional/mmap/cleanup.ksh \
 	functional/mmap/mmap_libaio_001_pos.ksh \
 	functional/mmap/mmap_mixed.ksh \
 	functional/mmap/mmap_read_001_pos.ksh \
 	functional/mmap/mmap_seek_001_pos.ksh \
 	functional/mmap/mmap_sync_001_pos.ksh \
 	functional/mmap/mmap_write_001_pos.ksh \
 	functional/mmap/setup.ksh \
 	functional/mmp/cleanup.ksh \
 	functional/mmp/mmp_active_import.ksh \
 	functional/mmp/mmp_exported_import.ksh \
 	functional/mmp/mmp_hostid.ksh \
 	functional/mmp/mmp_inactive_import.ksh \
 	functional/mmp/mmp_interval.ksh \
 	functional/mmp/mmp_on_off.ksh \
 	functional/mmp/mmp_on_thread.ksh \
 	functional/mmp/mmp_on_uberblocks.ksh \
 	functional/mmp/mmp_on_zdb.ksh \
 	functional/mmp/mmp_reset_interval.ksh \
 	functional/mmp/mmp_write_distribution.ksh \
 	functional/mmp/mmp_write_slow_disk.ksh \
 	functional/mmp/mmp_write_uberblocks.ksh \
 	functional/mmp/multihost_history.ksh \
 	functional/mmp/setup.ksh \
 	functional/mount/cleanup.ksh \
 	functional/mount/setup.ksh \
 	functional/mount/umount_001.ksh \
 	functional/mount/umountall_001.ksh \
 	functional/mount/umount_unlinked_drain.ksh \
 	functional/mv_files/cleanup.ksh \
 	functional/mv_files/mv_files_001_pos.ksh \
 	functional/mv_files/mv_files_002_pos.ksh \
 	functional/mv_files/random_creation.ksh \
 	functional/mv_files/setup.ksh \
 	functional/nestedfs/cleanup.ksh \
 	functional/nestedfs/nestedfs_001_pos.ksh \
 	functional/nestedfs/setup.ksh \
 	functional/nopwrite/cleanup.ksh \
 	functional/nopwrite/nopwrite_copies.ksh \
 	functional/nopwrite/nopwrite_mtime.ksh \
 	functional/nopwrite/nopwrite_negative.ksh \
 	functional/nopwrite/nopwrite_promoted_clone.ksh \
 	functional/nopwrite/nopwrite_recsize.ksh \
 	functional/nopwrite/nopwrite_sync.ksh \
 	functional/nopwrite/nopwrite_varying_compression.ksh \
 	functional/nopwrite/nopwrite_volume.ksh \
 	functional/nopwrite/setup.ksh \
 	functional/no_space/cleanup.ksh \
 	functional/no_space/enospc_001_pos.ksh \
 	functional/no_space/enospc_002_pos.ksh \
 	functional/no_space/enospc_003_pos.ksh \
 	functional/no_space/enospc_df.ksh \
 	functional/no_space/enospc_ganging.ksh \
 	functional/no_space/enospc_rm.ksh \
 	functional/no_space/setup.ksh \
 	functional/online_offline/cleanup.ksh \
 	functional/online_offline/online_offline_001_pos.ksh \
 	functional/online_offline/online_offline_002_neg.ksh \
 	functional/online_offline/online_offline_003_neg.ksh \
 	functional/online_offline/setup.ksh \
 	functional/pam/cleanup.ksh \
 	functional/pam/pam_basic.ksh \
 	functional/pam/pam_change_unmounted.ksh \
 	functional/pam/pam_mount_recursively.ksh \
 	functional/pam/pam_nounmount.ksh \
 	functional/pam/pam_recursive.ksh \
 	functional/pam/pam_short_password.ksh \
 	functional/pam/setup.ksh \
 	functional/pool_checkpoint/checkpoint_after_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_big_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_capacity.ksh \
 	functional/pool_checkpoint/checkpoint_conf_change.ksh \
 	functional/pool_checkpoint/checkpoint_discard_busy.ksh \
 	functional/pool_checkpoint/checkpoint_discard.ksh \
 	functional/pool_checkpoint/checkpoint_discard_many.ksh \
 	functional/pool_checkpoint/checkpoint_indirect.ksh \
 	functional/pool_checkpoint/checkpoint_invalid.ksh \
 	functional/pool_checkpoint/checkpoint_lun_expsz.ksh \
 	functional/pool_checkpoint/checkpoint_open.ksh \
 	functional/pool_checkpoint/checkpoint_removal.ksh \
 	functional/pool_checkpoint/checkpoint_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_ro_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_sm_scale.ksh \
 	functional/pool_checkpoint/checkpoint_twice.ksh \
 	functional/pool_checkpoint/checkpoint_vdev_add.ksh \
 	functional/pool_checkpoint/checkpoint_zdb.ksh \
 	functional/pool_checkpoint/checkpoint_zhack_feat.ksh \
 	functional/pool_checkpoint/cleanup.ksh \
 	functional/pool_checkpoint/setup.ksh \
 	functional/pool_names/pool_names_001_pos.ksh \
 	functional/pool_names/pool_names_002_neg.ksh \
 	functional/poolversion/cleanup.ksh \
 	functional/poolversion/poolversion_001_pos.ksh \
 	functional/poolversion/poolversion_002_pos.ksh \
 	functional/poolversion/setup.ksh \
 	functional/privilege/cleanup.ksh \
 	functional/privilege/privilege_001_pos.ksh \
 	functional/privilege/privilege_002_pos.ksh \
 	functional/privilege/setup.ksh \
 	functional/procfs/cleanup.ksh \
 	functional/procfs/pool_state.ksh \
 	functional/procfs/procfs_list_basic.ksh \
 	functional/procfs/procfs_list_concurrent_readers.ksh \
 	functional/procfs/procfs_list_stale_read.ksh \
 	functional/procfs/setup.ksh \
 	functional/projectquota/cleanup.ksh \
 	functional/projectquota/projectid_001_pos.ksh \
 	functional/projectquota/projectid_002_pos.ksh \
 	functional/projectquota/projectid_003_pos.ksh \
 	functional/projectquota/projectquota_001_pos.ksh \
 	functional/projectquota/projectquota_002_pos.ksh \
 	functional/projectquota/projectquota_003_pos.ksh \
 	functional/projectquota/projectquota_004_neg.ksh \
 	functional/projectquota/projectquota_005_pos.ksh \
 	functional/projectquota/projectquota_006_pos.ksh \
 	functional/projectquota/projectquota_007_pos.ksh \
 	functional/projectquota/projectquota_008_pos.ksh \
 	functional/projectquota/projectquota_009_pos.ksh \
 	functional/projectquota/defaultprojectquota_001_pos.ksh \
 	functional/projectquota/defaultprojectquota_002_pos.ksh \
 	functional/projectquota/defaultprojectquota_003_neg.ksh \
 	functional/projectquota/defaultprojectquota_004_pos.ksh \
 	functional/projectquota/defaultprojectquota_005_pos.ksh \
 	functional/projectquota/defaultprojectquota_006_pos.ksh \
 	functional/projectquota/defaultprojectquota_007_pos.ksh \
 	functional/projectquota/projectspace_001_pos.ksh \
 	functional/projectquota/projectspace_002_pos.ksh \
 	functional/projectquota/projectspace_003_pos.ksh \
 	functional/projectquota/projectspace_004_pos.ksh \
 	functional/projectquota/projectspace_005_pos.ksh \
 	functional/projectquota/projecttree_001_pos.ksh \
 	functional/projectquota/projecttree_002_pos.ksh \
 	functional/projectquota/projecttree_003_neg.ksh \
 	functional/projectquota/setup.ksh \
 	functional/quota/cleanup.ksh \
 	functional/quota/quota_001_pos.ksh \
 	functional/quota/quota_002_pos.ksh \
 	functional/quota/quota_003_pos.ksh \
 	functional/quota/quota_004_pos.ksh \
 	functional/quota/quota_005_pos.ksh \
 	functional/quota/quota_006_neg.ksh \
 	functional/quota/setup.ksh \
 	functional/raidz/cleanup.ksh \
 	functional/raidz/raidz_001_neg.ksh \
 	functional/raidz/raidz_002_pos.ksh \
 	functional/raidz/raidz_expand_001_pos.ksh \
 	functional/raidz/raidz_expand_002_pos.ksh \
 	functional/raidz/raidz_expand_003_neg.ksh \
 	functional/raidz/raidz_expand_003_pos.ksh \
 	functional/raidz/raidz_expand_004_pos.ksh \
 	functional/raidz/raidz_expand_005_pos.ksh \
 	functional/raidz/raidz_expand_006_neg.ksh \
 	functional/raidz/raidz_expand_007_neg.ksh \
 	functional/raidz/setup.ksh \
 	functional/redacted_send/cleanup.ksh \
 	functional/redacted_send/redacted_compressed.ksh \
 	functional/redacted_send/redacted_contents.ksh \
 	functional/redacted_send/redacted_deleted.ksh \
 	functional/redacted_send/redacted_disabled_feature.ksh \
 	functional/redacted_send/redacted_embedded.ksh \
 	functional/redacted_send/redacted_holes.ksh \
 	functional/redacted_send/redacted_incrementals.ksh \
 	functional/redacted_send/redacted_largeblocks.ksh \
 	functional/redacted_send/redacted_many_clones.ksh \
 	functional/redacted_send/redacted_mixed_recsize.ksh \
 	functional/redacted_send/redacted_mounts.ksh \
 	functional/redacted_send/redacted_negative.ksh \
 	functional/redacted_send/redacted_origin.ksh \
 	functional/redacted_send/redacted_panic.ksh \
 	functional/redacted_send/redacted_props.ksh \
 	functional/redacted_send/redacted_resume.ksh \
 	functional/redacted_send/redacted_size.ksh \
 	functional/redacted_send/redacted_volume.ksh \
 	functional/redacted_send/setup.ksh \
 	functional/redundancy/cleanup.ksh \
 	functional/redundancy/redundancy_draid1.ksh \
 	functional/redundancy/redundancy_draid2.ksh \
 	functional/redundancy/redundancy_draid3.ksh \
 	functional/redundancy/redundancy_draid_damaged1.ksh \
 	functional/redundancy/redundancy_draid_damaged2.ksh \
 	functional/redundancy/redundancy_draid.ksh \
 	functional/redundancy/redundancy_draid_spare1.ksh \
 	functional/redundancy/redundancy_draid_spare2.ksh \
 	functional/redundancy/redundancy_draid_spare3.ksh \
 	functional/redundancy/redundancy_mirror.ksh \
 	functional/redundancy/redundancy_raidz1.ksh \
 	functional/redundancy/redundancy_raidz2.ksh \
 	functional/redundancy/redundancy_raidz3.ksh \
 	functional/redundancy/redundancy_raidz.ksh \
 	functional/redundancy/redundancy_stripe.ksh \
 	functional/redundancy/setup.ksh \
 	functional/refquota/cleanup.ksh \
 	functional/refquota/refquota_001_pos.ksh \
 	functional/refquota/refquota_002_pos.ksh \
 	functional/refquota/refquota_003_pos.ksh \
 	functional/refquota/refquota_004_pos.ksh \
 	functional/refquota/refquota_005_pos.ksh \
 	functional/refquota/refquota_006_neg.ksh \
 	functional/refquota/refquota_007_neg.ksh \
 	functional/refquota/refquota_008_neg.ksh \
 	functional/refquota/setup.ksh \
 	functional/refreserv/cleanup.ksh \
 	functional/refreserv/refreserv_001_pos.ksh \
 	functional/refreserv/refreserv_002_pos.ksh \
 	functional/refreserv/refreserv_003_pos.ksh \
 	functional/refreserv/refreserv_004_pos.ksh \
 	functional/refreserv/refreserv_005_pos.ksh \
 	functional/refreserv/refreserv_multi_raidz.ksh \
 	functional/refreserv/refreserv_raidz.ksh \
 	functional/refreserv/setup.ksh \
 	functional/removal/cleanup.ksh \
 	functional/removal/removal_all_vdev.ksh \
 	functional/removal/removal_cancel.ksh \
 	functional/removal/removal_check_space.ksh \
 	functional/removal/removal_condense_export.ksh \
 	functional/removal/removal_multiple_indirection.ksh \
 	functional/removal/removal_nopwrite.ksh \
 	functional/removal/removal_remap_deadlists.ksh \
 	functional/removal/removal_reservation.ksh \
 	functional/removal/removal_resume_export.ksh \
 	functional/removal/removal_sanity.ksh \
 	functional/removal/removal_with_add.ksh \
 	functional/removal/removal_with_create_fs.ksh \
 	functional/removal/removal_with_dedup.ksh \
 	functional/removal/removal_with_errors.ksh \
 	functional/removal/removal_with_export.ksh \
 	functional/removal/removal_with_faulted.ksh \
 	functional/removal/removal_with_ganging.ksh \
 	functional/removal/removal_with_hole.ksh \
 	functional/removal/removal_with_indirect.ksh \
 	functional/removal/removal_with_remove.ksh \
 	functional/removal/removal_with_scrub.ksh \
 	functional/removal/removal_with_send.ksh \
 	functional/removal/removal_with_send_recv.ksh \
 	functional/removal/removal_with_snapshot.ksh \
 	functional/removal/removal_with_write.ksh \
 	functional/removal/removal_with_zdb.ksh \
 	functional/removal/remove_attach_mirror.ksh \
 	functional/removal/remove_expanded.ksh \
 	functional/removal/remove_indirect.ksh \
 	functional/removal/remove_mirror.ksh \
 	functional/removal/remove_mirror_sanity.ksh \
 	functional/removal/remove_raidz.ksh \
 	functional/rename_dirs/cleanup.ksh \
 	functional/rename_dirs/rename_dirs_001_pos.ksh \
 	functional/rename_dirs/setup.ksh \
 	functional/renameat2/cleanup.ksh \
 	functional/renameat2/setup.ksh \
 	functional/renameat2/renameat2_exchange.ksh \
 	functional/renameat2/renameat2_noreplace.ksh \
 	functional/renameat2/renameat2_whiteout.ksh \
 	functional/replacement/attach_import.ksh \
 	functional/replacement/attach_multiple.ksh \
 	functional/replacement/attach_rebuild.ksh \
 	functional/replacement/attach_resilver.ksh \
 	functional/replacement/cleanup.ksh \
 	functional/replacement/detach.ksh \
 	functional/replacement/rebuild_disabled_feature.ksh \
 	functional/replacement/rebuild_multiple.ksh \
 	functional/replacement/rebuild_raidz.ksh \
 	functional/replacement/replace_import.ksh \
 	functional/replacement/replace_rebuild.ksh \
 	functional/replacement/replace_resilver.ksh \
 	functional/replacement/resilver_restart_001.ksh \
 	functional/replacement/resilver_restart_002.ksh \
 	functional/replacement/scrub_cancel.ksh \
 	functional/replacement/setup.ksh \
 	functional/reservation/cleanup.ksh \
 	functional/reservation/reservation_001_pos.ksh \
 	functional/reservation/reservation_002_pos.ksh \
 	functional/reservation/reservation_003_pos.ksh \
 	functional/reservation/reservation_004_pos.ksh \
 	functional/reservation/reservation_005_pos.ksh \
 	functional/reservation/reservation_006_pos.ksh \
 	functional/reservation/reservation_007_pos.ksh \
 	functional/reservation/reservation_008_pos.ksh \
 	functional/reservation/reservation_009_pos.ksh \
 	functional/reservation/reservation_010_pos.ksh \
 	functional/reservation/reservation_011_pos.ksh \
 	functional/reservation/reservation_012_pos.ksh \
 	functional/reservation/reservation_013_pos.ksh \
 	functional/reservation/reservation_014_pos.ksh \
 	functional/reservation/reservation_015_pos.ksh \
 	functional/reservation/reservation_016_pos.ksh \
 	functional/reservation/reservation_017_pos.ksh \
 	functional/reservation/reservation_018_pos.ksh \
 	functional/reservation/reservation_019_pos.ksh \
 	functional/reservation/reservation_020_pos.ksh \
 	functional/reservation/reservation_021_neg.ksh \
 	functional/reservation/reservation_022_pos.ksh \
 	functional/reservation/setup.ksh \
 	functional/rootpool/cleanup.ksh \
 	functional/rootpool/rootpool_002_neg.ksh \
 	functional/rootpool/rootpool_003_neg.ksh \
 	functional/rootpool/rootpool_007_pos.ksh \
 	functional/rootpool/setup.ksh \
 	functional/rsend/cleanup.ksh \
 	functional/rsend/recv_dedup_encrypted_zvol.ksh \
 	functional/rsend/recv_dedup.ksh \
 	functional/rsend/rsend_001_pos.ksh \
 	functional/rsend/rsend_002_pos.ksh \
 	functional/rsend/rsend_003_pos.ksh \
 	functional/rsend/rsend_004_pos.ksh \
 	functional/rsend/rsend_005_pos.ksh \
 	functional/rsend/rsend_006_pos.ksh \
 	functional/rsend/rsend_007_pos.ksh \
 	functional/rsend/rsend_008_pos.ksh \
 	functional/rsend/rsend_009_pos.ksh \
 	functional/rsend/rsend_010_pos.ksh \
 	functional/rsend/rsend_011_pos.ksh \
 	functional/rsend/rsend_012_pos.ksh \
 	functional/rsend/rsend_013_pos.ksh \
 	functional/rsend/rsend_014_pos.ksh \
 	functional/rsend/rsend_016_neg.ksh \
 	functional/rsend/rsend_019_pos.ksh \
 	functional/rsend/rsend_020_pos.ksh \
 	functional/rsend/rsend_021_pos.ksh \
 	functional/rsend/rsend_022_pos.ksh \
 	functional/rsend/rsend_024_pos.ksh \
 	functional/rsend/rsend_025_pos.ksh \
 	functional/rsend/rsend_026_neg.ksh \
 	functional/rsend/rsend_027_pos.ksh \
 	functional/rsend/rsend_028_neg.ksh \
 	functional/rsend/rsend_029_neg.ksh \
 	functional/rsend/rsend_030_pos.ksh \
 	functional/rsend/rsend_031_pos.ksh \
 	functional/rsend/send-c_embedded_blocks.ksh \
 	functional/rsend/send-c_incremental.ksh \
 	functional/rsend/send-c_longname.ksh \
 	functional/rsend/send-c_lz4_disabled.ksh \
 	functional/rsend/send-c_mixed_compression.ksh \
 	functional/rsend/send-c_props.ksh \
 	functional/rsend/send-c_recv_dedup.ksh \
 	functional/rsend/send-c_recv_lz4_disabled.ksh \
 	functional/rsend/send-c_resume.ksh \
 	functional/rsend/send-c_stream_size_estimate.ksh \
 	functional/rsend/send-c_verify_contents.ksh \
 	functional/rsend/send-c_verify_ratio.ksh \
 	functional/rsend/send-c_volume.ksh \
 	functional/rsend/send-c_zstream_recompress.ksh \
 	functional/rsend/send-c_zstreamdump.ksh \
 	functional/rsend/send-cpL_varied_recsize.ksh \
 	functional/rsend/send_doall.ksh \
 	functional/rsend/send_encrypted_incremental.ksh \
 	functional/rsend/send_encrypted_files.ksh \
 	functional/rsend/send_encrypted_freeobjects.ksh \
 	functional/rsend/send_encrypted_hierarchy.ksh \
 	functional/rsend/send_encrypted_props.ksh \
 	functional/rsend/send_encrypted_truncated_files.ksh \
 	functional/rsend/send_freeobjects.ksh \
 	functional/rsend/send_holds.ksh \
 	functional/rsend/send_hole_birth.ksh \
 	functional/rsend/send_invalid.ksh \
 	functional/rsend/send-L_toggle.ksh \
 	functional/rsend/send_mixed_raw.ksh \
 	functional/rsend/send_partial_dataset.ksh \
 	functional/rsend/send_raw_ashift.ksh \
 	functional/rsend/send_raw_spill_block.ksh \
 	functional/rsend/send_raw_large_blocks.ksh \
 	functional/rsend/send_realloc_dnode_size.ksh \
 	functional/rsend/send_realloc_encrypted_files.ksh \
 	functional/rsend/send_realloc_files.ksh \
 	functional/rsend/send_spill_block.ksh \
 	functional/rsend/send-wR_encrypted_zvol.ksh \
 	functional/rsend/setup.ksh \
 	functional/scrub_mirror/cleanup.ksh \
 	functional/scrub_mirror/scrub_mirror_001_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_002_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_003_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_004_pos.ksh \
 	functional/scrub_mirror/setup.ksh \
 	functional/slog/cleanup.ksh \
 	functional/slog/setup.ksh \
 	functional/slog/slog_001_pos.ksh \
 	functional/slog/slog_002_pos.ksh \
 	functional/slog/slog_003_pos.ksh \
 	functional/slog/slog_004_pos.ksh \
 	functional/slog/slog_005_pos.ksh \
 	functional/slog/slog_006_pos.ksh \
 	functional/slog/slog_007_pos.ksh \
 	functional/slog/slog_008_neg.ksh \
 	functional/slog/slog_009_neg.ksh \
 	functional/slog/slog_010_neg.ksh \
 	functional/slog/slog_011_neg.ksh \
 	functional/slog/slog_012_neg.ksh \
 	functional/slog/slog_013_pos.ksh \
 	functional/slog/slog_014_pos.ksh \
 	functional/slog/slog_015_neg.ksh \
 	functional/slog/slog_016_pos.ksh \
 	functional/slog/slog_replay_fs_001.ksh \
 	functional/slog/slog_replay_fs_002.ksh \
 	functional/slog/slog_replay_volume.ksh \
 	functional/snapshot/cleanup.ksh \
 	functional/snapshot/clone_001_pos.ksh \
 	functional/snapshot/rollback_001_pos.ksh \
 	functional/snapshot/rollback_002_pos.ksh \
 	functional/snapshot/rollback_003_pos.ksh \
 	functional/snapshot/setup.ksh \
 	functional/snapshot/snapshot_001_pos.ksh \
 	functional/snapshot/snapshot_002_pos.ksh \
 	functional/snapshot/snapshot_003_pos.ksh \
 	functional/snapshot/snapshot_004_pos.ksh \
 	functional/snapshot/snapshot_005_pos.ksh \
 	functional/snapshot/snapshot_006_pos.ksh \
 	functional/snapshot/snapshot_007_pos.ksh \
 	functional/snapshot/snapshot_008_pos.ksh \
 	functional/snapshot/snapshot_009_pos.ksh \
 	functional/snapshot/snapshot_010_pos.ksh \
 	functional/snapshot/snapshot_011_pos.ksh \
 	functional/snapshot/snapshot_012_pos.ksh \
 	functional/snapshot/snapshot_013_pos.ksh \
 	functional/snapshot/snapshot_014_pos.ksh \
 	functional/snapshot/snapshot_015_pos.ksh \
 	functional/snapshot/snapshot_016_pos.ksh \
 	functional/snapshot/snapshot_017_pos.ksh \
 	functional/snapshot/snapshot_018_pos.ksh \
 	functional/snapused/cleanup.ksh \
 	functional/snapused/setup.ksh \
 	functional/snapused/snapused_001_pos.ksh \
 	functional/snapused/snapused_002_pos.ksh \
 	functional/snapused/snapused_003_pos.ksh \
 	functional/snapused/snapused_004_pos.ksh \
 	functional/snapused/snapused_005_pos.ksh \
 	functional/sparse/cleanup.ksh \
 	functional/sparse/setup.ksh \
 	functional/sparse/sparse_001_pos.ksh \
 	functional/stat/cleanup.ksh \
 	functional/stat/setup.ksh \
 	functional/stat/stat_001_pos.ksh \
 	functional/stat/statx_dioalign.ksh \
 	functional/suid/cleanup.ksh \
 	functional/suid/setup.ksh \
 	functional/suid/suid_write_to_none.ksh \
 	functional/suid/suid_write_to_sgid.ksh \
 	functional/suid/suid_write_to_suid.ksh \
 	functional/suid/suid_write_to_suid_sgid.ksh \
 	functional/suid/suid_write_zil_replay.ksh \
 	functional/trim/autotrim_config.ksh \
 	functional/trim/autotrim_integrity.ksh \
 	functional/trim/autotrim_trim_integrity.ksh \
 	functional/trim/cleanup.ksh \
 	functional/trim/setup.ksh \
 	functional/trim/trim_config.ksh \
 	functional/trim/trim_integrity.ksh \
 	functional/trim/trim_l2arc.ksh \
 	functional/truncate/cleanup.ksh \
 	functional/truncate/setup.ksh \
 	functional/truncate/truncate_001_pos.ksh \
 	functional/truncate/truncate_002_pos.ksh \
 	functional/truncate/truncate_timestamps.ksh \
 	functional/upgrade/cleanup.ksh \
 	functional/upgrade/setup.ksh \
 	functional/upgrade/upgrade_projectquota_001_pos.ksh \
 	functional/upgrade/upgrade_projectquota_002_pos.ksh \
 	functional/upgrade/upgrade_readonly_pool.ksh \
 	functional/upgrade/upgrade_userobj_001_pos.ksh \
 	functional/user_namespace/cleanup.ksh \
 	functional/user_namespace/setup.ksh \
 	functional/user_namespace/user_namespace_001.ksh \
 	functional/user_namespace/user_namespace_002.ksh \
 	functional/user_namespace/user_namespace_003.ksh \
 	functional/user_namespace/user_namespace_004.ksh \
 	functional/userquota/cleanup.ksh \
 	functional/userquota/groupspace_001_pos.ksh \
 	functional/userquota/groupspace_002_pos.ksh \
 	functional/userquota/groupspace_003_pos.ksh \
 	functional/userquota/groupspace_004_pos.ksh \
 	functional/userquota/setup.ksh \
 	functional/userquota/defaultuserquota_001_pos.ksh \
 	functional/userquota/defaultuserquota_002_pos.ksh \
 	functional/userquota/defaultuserquota_003_pos.ksh \
 	functional/userquota/defaultuserquota_004_neg.ksh \
 	functional/userquota/defaultuserquota_005_pos.ksh \
 	functional/userquota/defaultuserquota_006_pos.ksh \
 	functional/userquota/defaultuserquota_007_pos.ksh \
 	functional/userquota/defaultuserquota_008_pos.ksh \
 	functional/userquota/defaultuserquota_009_pos.ksh \
 	functional/userquota/defaultuserquota_010_neg.ksh \
 	functional/userquota/defaultuserquota_011_neg.ksh \
 	functional/userquota/defaultuserquota_012_neg.ksh \
 	functional/userquota/defaultuserquota_013_neg.ksh \
 	functional/userquota/userquota_001_pos.ksh \
 	functional/userquota/userquota_002_pos.ksh \
 	functional/userquota/userquota_003_pos.ksh \
 	functional/userquota/userquota_004_pos.ksh \
 	functional/userquota/userquota_005_neg.ksh \
 	functional/userquota/userquota_006_pos.ksh \
 	functional/userquota/userquota_007_pos.ksh \
 	functional/userquota/userquota_008_pos.ksh \
 	functional/userquota/userquota_009_pos.ksh \
 	functional/userquota/userquota_010_pos.ksh \
 	functional/userquota/userquota_011_pos.ksh \
 	functional/userquota/userquota_012_neg.ksh \
 	functional/userquota/userquota_013_pos.ksh \
 	functional/userquota/userspace_001_pos.ksh \
 	functional/userquota/userspace_002_pos.ksh \
 	functional/userquota/userspace_003_pos.ksh \
 	functional/userquota/userspace_004_pos.ksh \
 	functional/userquota/userspace_encrypted.ksh \
 	functional/userquota/userspace_send_encrypted.ksh \
 	functional/userquota/userspace_encrypted_13709.ksh \
 	functional/vdev_zaps/cleanup.ksh \
 	functional/vdev_zaps/setup.ksh \
 	functional/vdev_zaps/vdev_zaps_001_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_002_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_003_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_004_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_005_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_006_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_007_pos.ksh \
 	functional/write_dirs/cleanup.ksh \
 	functional/write_dirs/setup.ksh \
 	functional/write_dirs/write_dirs_001_pos.ksh \
 	functional/write_dirs/write_dirs_002_pos.ksh \
 	functional/xattr/cleanup.ksh \
 	functional/xattr/setup.ksh \
 	functional/xattr/xattr_001_pos.ksh \
 	functional/xattr/xattr_002_neg.ksh \
 	functional/xattr/xattr_003_neg.ksh \
 	functional/xattr/xattr_004_pos.ksh \
 	functional/xattr/xattr_005_pos.ksh \
 	functional/xattr/xattr_006_pos.ksh \
 	functional/xattr/xattr_007_neg.ksh \
 	functional/xattr/xattr_008_pos.ksh \
 	functional/xattr/xattr_009_neg.ksh \
 	functional/xattr/xattr_010_neg.ksh \
 	functional/xattr/xattr_011_pos.ksh \
 	functional/xattr/xattr_012_pos.ksh \
 	functional/xattr/xattr_013_pos.ksh \
 	functional/xattr/xattr_compat.ksh \
 	functional/zap_shrink/cleanup.ksh \
 	functional/zap_shrink/zap_shrink_001_pos.ksh \
 	functional/zap_shrink/setup.ksh \
 	functional/zpool_influxdb/cleanup.ksh \
 	functional/zpool_influxdb/setup.ksh \
 	functional/zpool_influxdb/zpool_influxdb.ksh \
 	functional/zvol/zvol_cli/cleanup.ksh \
 	functional/zvol/zvol_cli/setup.ksh \
 	functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \
 	functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \
 	functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \
 	functional/zvol/zvol_ENOSPC/cleanup.ksh \
 	functional/zvol/zvol_ENOSPC/setup.ksh \
 	functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \
 	functional/zvol/zvol_misc/cleanup.ksh \
 	functional/zvol/zvol_misc/setup.ksh \
 	functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_fua.ksh \
 	functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \
 	functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \
 	functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \
 	functional/zvol/zvol_misc/zvol_misc_trim.ksh \
 	functional/zvol/zvol_misc/zvol_misc_volmode.ksh \
 	functional/zvol/zvol_misc/zvol_misc_zil.ksh \
 	functional/zvol/zvol_stress/cleanup.ksh \
 	functional/zvol/zvol_stress/setup.ksh \
 	functional/zvol/zvol_stress/zvol_stress.ksh \
 	functional/zvol/zvol_swap/cleanup.ksh \
 	functional/zvol/zvol_swap/setup.ksh \
 	functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \
 	functional/idmap_mount/cleanup.ksh \
 	functional/idmap_mount/setup.ksh \
 	functional/idmap_mount/idmap_mount_001.ksh \
 	functional/idmap_mount/idmap_mount_002.ksh \
 	functional/idmap_mount/idmap_mount_003.ksh \
 	functional/idmap_mount/idmap_mount_004.ksh \
 	functional/idmap_mount/idmap_mount_005.ksh
diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh
new file mode 100755
index 000000000000..3601f5422250
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_001_pos.ksh
@@ -0,0 +1,59 @@
+#!/bin/ksh
+# SPDX-License-Identifier: CDDL-1.0
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2025 by Klara Inc.
+#
+
+#
+# Description:
+# Verify that gang blocks can be written, read back intact, and freed.
+# Strategy:
+# 1. Create a pool (intent: no dynamic gang headers) and force ganging.
+# 2. Write one record; its gang header must list more than one leaf.
+# 3. Re-read the data after verify_pool and "zinject -a", then free it.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib
+
+log_assert "Gang blocks behave correctly."
+preamble
+log_onexit cleanup
+
+log_must zpool create -f $TESTPOOL $DISKS
+log_must zfs create -o recordsize=128k $TESTPOOL/$TESTFS
+mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+path="${mountpoint}/file"
+set_tunable64 METASLAB_FORCE_GANGING 100000
+set_tunable32 METASLAB_FORCE_GANGING_PCT 100
+
+log_must dd if=/dev/urandom of=$path bs=128k count=1
+log_must zpool sync $TESTPOOL
+first_block=$(get_first_block_dva $TESTPOOL/$TESTFS file)
+leaves=$(read_gang_header $TESTPOOL $first_block 200 | grep -cv hole)
+if ! [[ "$leaves" -gt 1 ]]; then
+	log_fail "Only one leaf in gang block, should not be possible"
+fi
+
+orig_checksum="$(xxh128digest < $path)"
+log_must verify_pool $TESTPOOL
+log_must zinject -a
+new_checksum="$(xxh128digest < $path)"
+if [[ "$orig_checksum" != "$new_checksum" ]]; then
+	log_fail "Checksum mismatch"
+fi
+
+log_must rm $path
+log_must verify_pool $TESTPOOL
+log_pass "Gang blocks behave correctly."